From 400059dd651580c48294e6f69b0a73f37952e05d Mon Sep 17 00:00:00 2001 From: Joonas Miettinen Date: Wed, 16 May 2018 15:34:48 +0000 Subject: [PATCH] version 0.4.5 --- DESCRIPTION | 20 +- MD5 | 264 +- NAMESPACE | 176 +- NEWS.md | 274 +- R/S3_definitions.R | 2946 ++++++++--------- R/aggregating.R | 1232 +++---- R/data_document.R | 348 +- R/direct_adjusting.R | 314 +- R/evaluation.R | 1914 +++++------ R/flexyargs.R | 332 +- R/fractional_years.R | 300 +- R/incidence_rates.R | 689 ++-- R/incidence_rates_utils.R | 266 +- R/lexpand.R | 1880 +++++------ R/lexpand2.R | 278 +- R/lifetime_function.R | 564 ++-- R/ltable.R | 502 +-- R/mean_survival.R | 1326 ++++---- R/popEpi_package.r | 60 +- R/pophaz.R | 136 +- R/prevalence.R | 282 +- R/relative_poisson.R | 1220 +++---- R/relative_poisson_net_survival.R | 352 +- R/sir.R | 2900 ++++++++-------- R/sir_utils.R | 226 +- R/splitLexisDT.R | 602 ++-- R/splitMulti.R | 538 +-- R/splitting_utility_functions.R | 2852 ++++++++-------- R/startup_message.R | 85 +- R/survival_aggregated.R | 1780 +++++----- R/survival_lexis.R | 1188 +++---- R/survival_utility_functions.R | 1276 +++---- R/utility_functions.R | 2872 ++++++++-------- R/weighted_table.R | 1326 ++++---- README.md | 378 +-- build/vignette.rds | Bin 254 -> 253 bytes inst/doc/sir.R | 128 +- inst/doc/sir.Rmd | 318 +- inst/doc/sir.html | 600 ++-- inst/doc/survtab_examples.R | 340 +- inst/doc/survtab_examples.Rmd | 592 ++-- inst/doc/survtab_examples.html | 822 ++--- man/ICSS.Rd | 72 +- man/Lexis_fpa.Rd | 164 +- man/RPL.Rd | 42 +- man/adjust.Rd | 48 +- man/aggre.Rd | 370 +-- man/all_names_present.Rd | 62 +- man/as.Date.yrs.Rd | 84 +- man/as.aggre.Rd | 152 +- man/as.data.frame.ratetable.Rd | 66 +- man/as.data.table.ratetable.Rd | 68 +- man/cast_simple.Rd | 100 +- man/cut_bound.Rd | 56 +- man/direct_standardization.Rd | 308 +- man/fac2num.Rd | 74 +- man/flexible_argument.Rd | 334 +- man/get.yrs.Rd | 156 +- man/is.Date.Rd | 82 +- man/is_leap_year.Rd | 56 +- man/lexpand.Rd | 736 ++-- man/lines.sirspline.Rd | 84 +- man/lines.survmean.Rd | 70 +- man/lines.survtab.Rd | 120 +- man/longDF2ratetable.Rd | 60 +- man/lower_bound.Rd | 36 +- man/ltable.Rd | 290 +- man/makeWeightsDT.Rd | 306 +- man/meanpop_fi.Rd | 58 +- man/na2zero.Rd | 54 +- man/plot.rate.Rd | 66 +- man/plot.sir.Rd | 140 +- man/plot.sirspline.Rd | 92 +- man/plot.survmean.Rd | 76 +- man/plot.survtab.Rd | 126 +- man/poisson.ci.Rd | 52 +- man/popEpi.Rd | 68 +- man/pophaz.Rd | 114 +- man/popmort.Rd | 56 +- man/prepExpo.Rd | 210 +- man/print.aggre.Rd | 54 +- man/print.rate.Rd | 48 +- man/print.survtab.Rd | 66 +- man/rate.Rd | 212 +- man/rate_ratio.Rd | 140 +- man/relpois.Rd | 234 +- man/relpois_ag.Rd | 188 +- man/robust_values.Rd | 90 +- man/rpcurve.Rd | 142 +- man/setaggre.Rd | 104 +- man/setclass.Rd | 54 +- man/setcolsnull.Rd | 64 +- man/sibr.Rd | 76 +- man/sir.Rd | 388 +-- man/sir_exp.Rd | 204 +- man/sir_ratio.Rd | 162 +- man/sire.Rd | 74 +- man/sirspline.Rd | 282 +- man/splitLexisDT.Rd | 176 +- man/splitMulti.Rd | 264 +- man/stdpop101.Rd | 54 +- man/stdpop18.Rd | 58 +- man/summary.aggre.Rd | 64 +- man/summary.survtab.Rd | 188 +- man/survmean.Rd | 560 ++-- man/survtab.Rd | 616 ++-- man/survtab_ag.Rd | 782 ++--- man/try2int.Rd | 54 +- tests/testthat.R | 17 +- tests/testthat/test_aggre.R | 454 +-- tests/testthat/test_epi.R | 68 +- tests/testthat/test_expo.R | 94 +- tests/testthat/test_lexpand.R | 744 ++--- tests/testthat/test_prevtab.R | 40 +- tests/testthat/test_rate.R | 624 ++-- tests/testthat/test_relpois_mean_curve.R | 136 +- tests/testthat/test_sir.R | 882 ++--- tests/testthat/test_splitLexisDT.R | 256 +- tests/testthat/test_splitMulti.R | 286 +- tests/testthat/test_splitting_attributes.R | 188 +- tests/testthat/test_splitting_breaks.R | 160 +- ...> test_splitting_randomly_on_fixed_data.R} | 280 +- .../test_splitting_randomly_on_random_data.R | 118 +- tests/testthat/test_survmean.R | 604 ++-- tests/testthat/test_survtab_adjusted.R | 152 +- tests/testthat/test_survtab_bad_surv_ints.R | 190 +- tests/testthat/test_survtab_observed.R | 275 +- tests/testthat/test_survtab_relative.R | 396 +-- tests/testthat/test_survtab_usage.R | 960 +++--- tests/testthat/test_utils.R | 834 ++--- tests/testthat/test_weighter.R | 278 +- vignettes/sir.Rmd | 318 +- vignettes/survtab_examples.Rmd | 592 ++-- 133 files changed, 27021 insertions(+), 26929 deletions(-) rename tests/testthat/{test_splitting_randomly.R => test_splitting_randomly_on_fixed_data.R} (95%) diff --git a/DESCRIPTION b/DESCRIPTION index 347a2f5..f7ebe50 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,15 +1,15 @@ Package: popEpi Title: Functions for Epidemiological Analysis using Population Data Authors@R: c(person("Joonas", "Miettinen", , "joonas.miettinen@cancer.fi", c("aut", "cre")), person("Matti", "Rantanen", , "matti.rantanen@cancer.fi", "aut"), person("Karri", "Seppa", , "karri.seppa@cancer.fi", "ctb") ) -Version: 0.4.4 -Date: 2017-11-13 +Version: 0.4.5 +Date: 2018-05-16 Maintainer: Joonas Miettinen -Description: Enables computation of epidemiological statistics where e.g. - counts or mortality rates of the reference population are used. Currently - supported: excess hazard models, rates, mean survival times, relative - survival, as well as standardized incidence and mortality ratios (SIRs/SMRs), - all of which can be easily adjusted for e.g. age. - Fast splitting and aggregation of 'Lexis' objects (from package 'Epi') +Description: Enables computation of epidemiological statistics, including those + where counts or mortality rates of the reference population are used. + Currently supported: excess hazard models, rates, mean survival times, + relative survival, and standardized incidence and mortality ratios + (SIRs/SMRs), all of which can be easily adjusted for by covariates such as + age. Fast splitting and aggregation of 'Lexis' objects (from package 'Epi') and other computations achieved using 'data.table'. Depends: R (>= 3.2.0) Imports: Epi (>= 2.0), survival, data.table (>= 1.10.4) @@ -24,9 +24,9 @@ URL: https://github.com/WetRobot/popEpi BugReports: https://github.com/WetRobot/popEpi/issues RoxygenNote: 6.0.1 NeedsCompilation: no -Packaged: 2017-11-14 13:55:10 UTC; joonas.miettinen +Packaged: 2018-05-16 15:46:54 UTC; jonttu Author: Joonas Miettinen [aut, cre], Matti Rantanen [aut], Karri Seppa [ctb] Repository: CRAN -Date/Publication: 2017-11-14 14:11:03 UTC +Date/Publication: 2018-05-16 16:34:48 UTC diff --git a/MD5 b/MD5 index a205357..13463b3 100644 --- a/MD5 +++ b/MD5 @@ -1,38 +1,38 @@ -88996f89f16ed90f83c7bf464a7f7afc *DESCRIPTION -7377f37dc347ef5f2554779bc05caa3b *NAMESPACE -af8f6314def8661ffde13cceab7db510 *NEWS.md -c7e1bad912142d192529a39dad313e51 *R/S3_definitions.R -ec1e1810f100d1ad91e5950e34757bf3 *R/aggregating.R -fba753ed3e19e744850cb0e262c7c731 *R/data_document.R -3480dd3a7e2dfa5af2868773c405bbe3 *R/direct_adjusting.R -263742508f2fe4165daf0ed94af9f582 *R/evaluation.R -36d60d53afba43bc7b11fb23c17c40eb *R/flexyargs.R -32cf6baba19b7117d59615abd81fbe9f *R/fractional_years.R -4971aa7f5ed4bb0eb5dd30a03e1cc5a8 *R/incidence_rates.R -b0d57079d0ad4539ae5835093a1601c1 *R/incidence_rates_utils.R -9fa6844b67250943d3132964aad53232 *R/lexpand.R -f1d283541c1fa9440877e06ef319e7e1 *R/lexpand2.R -893632dc07c7bcdba9a91aa6caf20fd8 *R/lifetime_function.R -3c75e839569f3f4ec4796722f3e03fd7 *R/ltable.R -ae56b0e115c04a2a3e2d264fdc361e35 *R/mean_survival.R -c2cdff13c7478ee86ba9353a4bb40b33 *R/popEpi_package.r -6874c2fce49b449509a0bf2d941b3839 *R/pophaz.R -8d8745f9909979dd661aa7002119832e *R/prevalence.R -d2ed5b8359044a4025ce4527f71a120c *R/relative_poisson.R -81237a9ebe8c3cca90242b5cdadc1ca6 *R/relative_poisson_net_survival.R -640b4999312621ba0dcca20119541e2f *R/sir.R -a9de54ea10686a97b485640dcd5cc181 *R/sir_utils.R -3672931fdd079b2b7bc64d9c009e116a *R/splitLexisDT.R -14d72d62d606c4a16358ed7b565a7a32 *R/splitMulti.R -5b63ef31edef144e2c48421e883f470e *R/splitting_utility_functions.R -8b8be62fe06781ca6881fe6fb80305d5 *R/startup_message.R -77853d10c386207b26e4160f8cc7658b *R/survival_aggregated.R -928de195bfec8adaee13e0ffdbad9f42 *R/survival_lexis.R -db004cb5ff1392f094fe2bd09a740dd0 *R/survival_utility_functions.R -3e335053d5803ec13afad83099278b29 *R/utility_functions.R -17bbead814dfe92cd39181c2b6f68512 *R/weighted_table.R -8150b56c6fb514b6cb6a126ffb77adb1 *README.md -46f1e81d32a46459e31767296e3f1591 *build/vignette.rds +34fbad3ff69d9469f3a9576d9e185407 *DESCRIPTION +27263242892a3f9807c5a7a888f9309d *NAMESPACE +0ec36f152ec0721911d8ed9089467ce9 *NEWS.md +c34d39ee94d4e7447214312affb2776f *R/S3_definitions.R +f7d9868f5c11eab306686f44f09a9c7a *R/aggregating.R +e8e715a351e33bc57954b8f09848bd0d *R/data_document.R +eba9772a257e2d415084f87b6ac4957a *R/direct_adjusting.R +baec4fa9cc5739782b98bf72dfc48dc9 *R/evaluation.R +b4e79acca5607b0f247210727fbdbca0 *R/flexyargs.R +d26f39e6251cb08c3f57be95a4667e00 *R/fractional_years.R +4e328257e6dbc96a1c138e8e5736ce5f *R/incidence_rates.R +0a476abc9bbee4c78c54f00c56a18b06 *R/incidence_rates_utils.R +bff433f7934cd9440bda97c1d6c50548 *R/lexpand.R +c71593734089c81b50a128e6479ee080 *R/lexpand2.R +d3f6be350a4cc7999410eb4f1781b0c8 *R/lifetime_function.R +6ca3684cfe9e8facd8baec0a6bb36b6f *R/ltable.R +68c5f3aa44d3e3d68fed8e5397cb841d *R/mean_survival.R +98bc3776d8a628647a37fb8df7114671 *R/popEpi_package.r +9e53e964f1187d4e73e977554227a5e8 *R/pophaz.R +c8d103b77099583dbb00cc1950c41add *R/prevalence.R +37071d33e823c9a216dee22d1abfd40e *R/relative_poisson.R +078f34d5f7d8cb3bdaf46cf6301580d0 *R/relative_poisson_net_survival.R +6b211444e5b21cf6fdfb193858d95f36 *R/sir.R +7e48e22b0a15778e25064d8038763953 *R/sir_utils.R +fe49f4da6e65a5aac875f224fa235f46 *R/splitLexisDT.R +19e01462e2afc1c8ff52d5f3c38ab78c *R/splitMulti.R +047694cc20a0a58016a58d11a307f106 *R/splitting_utility_functions.R +07147c047f9850041f71496a3c38a2bb *R/startup_message.R +bf2016d51a9c968f8d317ad4a2915979 *R/survival_aggregated.R +0db689f206d232fee6eb302b90a5a1a4 *R/survival_lexis.R +f64240d4c2ac187fa251077a790dc751 *R/survival_utility_functions.R +e6edad78126f6f148a71271f7f33221c *R/utility_functions.R +2e0fc60c25f2c6358abc4662fd34c3c9 *R/weighted_table.R +7976fcc448a8505f4c1825b0310bd265 *README.md +31c6036bf02fea0609173e12f1c03504 *build/vignette.rds 42a2e450ebc81d8cf32d7eb9f3efc406 *data/ICSS.rdata d15a881aeee84aa9307933f84acbe2ce *data/meanpop_fi.rda ba6bcd1044f5a25c868742359b4338ab *data/popmort.rdata @@ -40,101 +40,101 @@ ba6bcd1044f5a25c868742359b4338ab *data/popmort.rdata acb6b4afdc41b51495ca354054899b43 *data/sire.rdata 1f3ba42ee14403d7e17eff9ee19d9adc *data/stdpop101.rdata 905a241496ce00690f2d227753b62364 *data/stdpop18.rdata -081b75d6bd45bc8d26be991e92d45a35 *inst/doc/sir.R -5ad9b401e933a5d91042ee2d91587e0e *inst/doc/sir.Rmd -d25fdf10033c7fe5d5c099796e4d0e4c *inst/doc/sir.html -cb16b22c5cd3f3097df09cd996066d4a *inst/doc/survtab_examples.R -61b12a8efbbc09f71d8b540158c39c2f *inst/doc/survtab_examples.Rmd -3e2464ea0fd2cb01c237f3d3a48b1ad6 *inst/doc/survtab_examples.html -0f19c638d2a9d05463047c95ff52898f *man/ICSS.Rd -aec0efb3cbfc234dc7b720c4c193d60f *man/Lexis_fpa.Rd -fb13276c4a6b3e9efb569e93ef7f46e4 *man/RPL.Rd -b0ec6686fe92c944616bebdc5a5274ff *man/adjust.Rd -b098097f49917ff36c26ae559a4b7f50 *man/aggre.Rd -3a2237e672a4b02d3f952218c052fff5 *man/all_names_present.Rd -667b933495f6889c8f1f9656195ec17b *man/as.Date.yrs.Rd -53da2db1467b037147fdc05c502ee6aa *man/as.aggre.Rd -57a90a39af1780778899c138f108fd1d *man/as.data.frame.ratetable.Rd -ee70bbc45a191458824500a64644f56d *man/as.data.table.ratetable.Rd -aab67f49ac120f8d1f1ddacdbd666105 *man/cast_simple.Rd -57cde2bcb0678a36dc50b9f3f01eba9b *man/cut_bound.Rd -2cfb3ad91b6ec5336704646a5c17fddb *man/direct_standardization.Rd -13bd12997beffd0731de32490fba3f1a *man/fac2num.Rd -90bccc1ef0ade37f66692e4473b1507a *man/flexible_argument.Rd -d39cd3e3ff3f031f0e940edce5331922 *man/get.yrs.Rd -eb4e9eb2584d78d3c442b9bf77d8fc3b *man/is.Date.Rd -98e08ddab0fdd0eb2bc909c0f2f5ea9e *man/is_leap_year.Rd -5485ffaa26c767ab94e3640dc89abfeb *man/lexpand.Rd -771fb2b89af650732dccbdde5f96abb5 *man/lines.sirspline.Rd -6f6ed53f1071466066c3fa3c07a9e06d *man/lines.survmean.Rd -e2f346f1827c5bbd220419305c9e3413 *man/lines.survtab.Rd -b74799647f40934492fc87b4938b0b94 *man/longDF2ratetable.Rd -adcc4ee9a8cd057b3ba409a393ab19f8 *man/lower_bound.Rd -05029a01637822313af38d8b7d41702b *man/ltable.Rd -5db42889b01b735faa28175620120f9a *man/makeWeightsDT.Rd -49b4c61e5369af6275e14fbee863d9b0 *man/meanpop_fi.Rd -20fb7bffb747c66bf96f55fcfd8a0f1a *man/na2zero.Rd -11bd3bfde02197549d90de4157f5e476 *man/plot.rate.Rd -2ef378a327bb77ada1dc425af5fe9864 *man/plot.sir.Rd -3195d385ccabf523103f7f8995fb62e7 *man/plot.sirspline.Rd -ae6db884a154aa9667ce931fdf7ae587 *man/plot.survmean.Rd -8fec28c3bee73b314090f3249ef52cb4 *man/plot.survtab.Rd -a109514e9a208c8e4194eb5f2f0bb497 *man/poisson.ci.Rd -47b9ed9d3c926d47f5ffd878ccb48619 *man/popEpi.Rd -38f62e4f3d1258c8b58b12185e7da5b2 *man/pophaz.Rd -8e5cad26e16d9a731294e6533fc0b900 *man/popmort.Rd -7774f903ef4c13898243846fc9fc9183 *man/prepExpo.Rd -7abbadbaf3866e89549d7007b7193662 *man/print.aggre.Rd -81662ddab28ed211ebcb30dd63b5cf8b *man/print.rate.Rd -fbf128bbb137155913a27f8832b7bf96 *man/print.survtab.Rd -cbc355a087325a38a37b6a969f0f47c5 *man/rate.Rd -5a337287eb168e1affa139208c64378d *man/rate_ratio.Rd -987f0b3724fca7209ce788b9e06b51c9 *man/relpois.Rd -2c65ce17362e70767fec678bb9df5cd4 *man/relpois_ag.Rd -e3eaa3491304021cc2eb8ba0f7a440e3 *man/robust_values.Rd -8189e1472f999e79d8024e047b9acd2d *man/rpcurve.Rd -49dcdeac39d0868d225725fd6fd1c183 *man/setaggre.Rd -5a97ddc2e9b68a65552e70c3eb88ef67 *man/setclass.Rd -d53a6105a338b273dff944a19318d802 *man/setcolsnull.Rd -c3b5bb4a7bed1430d1044cb79a0e5ca8 *man/sibr.Rd -a22175df4a47c870a589d1c32d85f23e *man/sir.Rd -10c6073d49188d029f872125a5ba50b7 *man/sir_exp.Rd -d2be6abc60482bce3ec858d08fcc4e73 *man/sir_ratio.Rd -c9bc5ccf40e31fcbc7e13dca17d205be *man/sire.Rd -adca761fd18b80f8da3c11b575485f4f *man/sirspline.Rd -01ff4836a79c563144f132f558affbe8 *man/splitLexisDT.Rd -926e963f7f0945d210ff6216e95eeb3a *man/splitMulti.Rd -1d73a30273d1ee08abdc9804b7dc944a *man/stdpop101.Rd -64991b75809cbcdc4e6f7ebcd51c27f7 *man/stdpop18.Rd -75b96f6986d47547cdc467032cbf37f7 *man/summary.aggre.Rd -99a98377211fe2d6e66e6b9b1045b827 *man/summary.survtab.Rd -3b6c214e0438b8071c1baf03c799685d *man/survmean.Rd -ee7b01f71161eaefd6dedb064882437b *man/survtab.Rd -993f83db874f5deb8858608cb5a06389 *man/survtab_ag.Rd -d85caead76788a23b6cafc64569e8023 *man/try2int.Rd -ac4d260aa86e78e003b2935c1332d024 *tests/testthat.R +0b1d02ff7085919ed70aecf2f2b6e921 *inst/doc/sir.R +fdf79aec5bef33d90c856f410e63cb49 *inst/doc/sir.Rmd +0c74686536daab7bd43ba27dadee4015 *inst/doc/sir.html +9af4946d1a5c3a7ac308dcea22695ec1 *inst/doc/survtab_examples.R +d2a16b2d8d107070ffbb239b10b342e9 *inst/doc/survtab_examples.Rmd +d707e56368dfffdfecb1f2126c2abc81 *inst/doc/survtab_examples.html +20adf8b36ed788d6b5289460139d8a89 *man/ICSS.Rd +2bba8aabe1b952efc74b4e8efec4a95b *man/Lexis_fpa.Rd +1a4bf32e6fd885e104635693f6a1a6d5 *man/RPL.Rd +00402d7247bbdb2af0e5ca304c8c8d9f *man/adjust.Rd +6786e47d773d271fe047d8111d6d6aaa *man/aggre.Rd +f979e54de33463c40e10415121f88088 *man/all_names_present.Rd +b70a819c0f63b0c6f48ad4e4e11b2020 *man/as.Date.yrs.Rd +a0983732b1ddc855c2014bf15f3830bc *man/as.aggre.Rd +7bc129bafd2c9bbe06c1c11b46b833bc *man/as.data.frame.ratetable.Rd +2a74904eaec9696f98edf2c4ecff0eb2 *man/as.data.table.ratetable.Rd +39129711a2ba52b321c0da0e840eb0b4 *man/cast_simple.Rd +a2dfb8d92a41fbbd6790cd24f17f20c8 *man/cut_bound.Rd +7ba8dd4bdcff077d9f1398bef9a66279 *man/direct_standardization.Rd +0af861129aa176a8e51f608a1dcf70eb *man/fac2num.Rd +f0c20ceeb373213f333b4b2c2c334e64 *man/flexible_argument.Rd +0067d72019be8ff848305641743ba4ad *man/get.yrs.Rd +ecd5fe5a35e1eb11780014ed4ef111de *man/is.Date.Rd +14f56f0dbba47255c039786124149156 *man/is_leap_year.Rd +e07a2af7d2672a22f7aceb72cf86cff8 *man/lexpand.Rd +e445105a352cd364dcb73792374f49b5 *man/lines.sirspline.Rd +8895756b1d98c3fadebc9a6cd5ca9de6 *man/lines.survmean.Rd +fd6f23a8a2760dfe51d4e88de3605a0b *man/lines.survtab.Rd +7fc614ffab8e71303a63998c19eaf1f6 *man/longDF2ratetable.Rd +ed2d5b82a071adebcbf95ed4f2e5ea8c *man/lower_bound.Rd +ebb67dba9fe4612d5c28d3e05ffac0d9 *man/ltable.Rd +fd5b6255485d6127c005701bc9481233 *man/makeWeightsDT.Rd +35debeebcb0a4957ce4f3a623bb63849 *man/meanpop_fi.Rd +557353a304e4d54021c1274a42671261 *man/na2zero.Rd +13aa9aa1b6871879820fc01ce3895195 *man/plot.rate.Rd +bb265b16144b849ff3d8c3f0dcb0adea *man/plot.sir.Rd +9ed4933ea45ec4954fe68e480728ad4f *man/plot.sirspline.Rd +343551bf108acd6f475663ee94960c39 *man/plot.survmean.Rd +a75543f828a2381aafe92bccd2385ecb *man/plot.survtab.Rd +2099e25fea2912d8f61139c37704e704 *man/poisson.ci.Rd +ad955572f8fe53d897a0f142289994dd *man/popEpi.Rd +a995357501a57046ffc2482a98bc39c0 *man/pophaz.Rd +bec013abd59aaf2f67bdf9fa281a7c6a *man/popmort.Rd +f738731fd6e77d4334c6cee849222e67 *man/prepExpo.Rd +c9e8b514fc52cdd8f0c354582b191b3a *man/print.aggre.Rd +ded2fcb5f9ec96d6944f92bb97f9a462 *man/print.rate.Rd +de1d8a8001b48b023a7b1b7ac8b77daf *man/print.survtab.Rd +c882859b89d4b001b522bf702fef122c *man/rate.Rd +09a89ab991ca47c4a9b7d31cfce1a6f6 *man/rate_ratio.Rd +38f210cc6c0f53f4a58c30b80ba78c71 *man/relpois.Rd +f1bd1fa7f5d8cda2f3e114b86a222286 *man/relpois_ag.Rd +03e4b0d6f77dbfa0fba31f2cacc8e6bc *man/robust_values.Rd +ca4edab42f1b04e921817abee82bb01b *man/rpcurve.Rd +74cc3dd0e81322e5799ed05855f0b53e *man/setaggre.Rd +f3c0b28a3eadaf693f69a738ac6c0e73 *man/setclass.Rd +eb4abe83adbc5d7e2f8880d84599aa19 *man/setcolsnull.Rd +3b18ffa0babf0db5d800e28962d69ea8 *man/sibr.Rd +3d6cb15ff5d66ce23e665071bde152bd *man/sir.Rd +202feec327426f3289c3a4a7acd56a0e *man/sir_exp.Rd +e5e62cf263f405aadf3794952b355d95 *man/sir_ratio.Rd +c68fdc2aa4d1689eece369f99b51aa79 *man/sire.Rd +ef15fc5591653f9ee85c682ccf562e21 *man/sirspline.Rd +ecbbee61289be25a9286c5650dd4f990 *man/splitLexisDT.Rd +a75668bf4c4168e1b14d2d7f77a23eb0 *man/splitMulti.Rd +3e345f655c8eef0a3084ad322c6173fb *man/stdpop101.Rd +cb8e9842a1a978769d61501cf0bc1f3f *man/stdpop18.Rd +2634f859e0f2144a6658e0ff7f824ce6 *man/summary.aggre.Rd +28b2a6add6f89fc999fcaf0e69ca4955 *man/summary.survtab.Rd +dd78a1be0223a6be64e4a3291fcfe389 *man/survmean.Rd +84cfdfb895dde301fe43e8fc3117ed8d *man/survtab.Rd +e59b41ae99b8b8a6f70773f6b353e6eb *man/survtab_ag.Rd +02120d9702b0ff70ed4350f862c1fad8 *man/try2int.Rd +ab21623129423236103aa51cd8dff058 *tests/testthat.R 7649bcb74c39a065e146c7af95db3bdf *tests/testthat/survmean_test_data_01.rds -855c5c777ee0905a9e612ccb91ae3e89 *tests/testthat/test_aggre.R -0622cf4f4e8f35e88140eac8f790a0c2 *tests/testthat/test_epi.R -3096d2559f21255607a0f6aa3b5ed031 *tests/testthat/test_expo.R -70352511a55a0215d796971f69aa5b92 *tests/testthat/test_lexpand.R -36928149b0def9f1711b98c40d9edb3c *tests/testthat/test_prevtab.R -ea050176ed549a6a2c61324fcb03b466 *tests/testthat/test_rate.R -b11fe649900ae75671e4d1e65f58408b *tests/testthat/test_relpois_mean_curve.R -ef2e6e6b33a2021f65e02da97f7451da *tests/testthat/test_sir.R -494ab5cefe34b8e6652934cc24ab2677 *tests/testthat/test_splitLexisDT.R -7586a6b06d23b8cc48cfefe1ccc61662 *tests/testthat/test_splitMulti.R -0107dbf7ba57b6124deeef964a071ac8 *tests/testthat/test_splitting_attributes.R -375cdb7f3a5160185f05338e5c6d9138 *tests/testthat/test_splitting_breaks.R -32c741a6f16bc0e0403f0282f040f25c *tests/testthat/test_splitting_randomly.R -6a2f1d9417f4919c9f281b1d0e0f2f44 *tests/testthat/test_splitting_randomly_on_random_data.R -1c286e755f6c1e2cf8db5e23a98dcf9c *tests/testthat/test_survmean.R -9cdee8949d62e6a6f76826498462bd3f *tests/testthat/test_survtab_adjusted.R -631c3f750169dda6c0e88f8fdc08bfdc *tests/testthat/test_survtab_bad_surv_ints.R -c81e9537eea06aec8ec2c0363a869dca *tests/testthat/test_survtab_observed.R -2b9df24b60193b67dc8ee793c6b37831 *tests/testthat/test_survtab_relative.R -4887f136d13240e9db8bbfd020045b0a *tests/testthat/test_survtab_usage.R -11fe010a97a4743358fc67c11fba7b72 *tests/testthat/test_utils.R -a887c32ed4af0f346f879e5b2c6a0e26 *tests/testthat/test_weighter.R -5ad9b401e933a5d91042ee2d91587e0e *vignettes/sir.Rmd -61b12a8efbbc09f71d8b540158c39c2f *vignettes/survtab_examples.Rmd +1626a827ec1591fc618fa7b3873ac59c *tests/testthat/test_aggre.R +184c67299b924b377924422eb2b67b8f *tests/testthat/test_epi.R +1d3ac19cc4acc3797a4b5dd123c92415 *tests/testthat/test_expo.R +d56122affc8c978903937f979e5f0352 *tests/testthat/test_lexpand.R +ff1b9cb4721f71ac8082b99f29eb0e90 *tests/testthat/test_prevtab.R +8e1a7f5d0d79ac2eb1a07d6de0d311d2 *tests/testthat/test_rate.R +487e78041a03ada434cdd18bc2c6bd3f *tests/testthat/test_relpois_mean_curve.R +bce08fc9b798d983d4a4c62edc9ad313 *tests/testthat/test_sir.R +425d8220fc901331e0ca47e377932e72 *tests/testthat/test_splitLexisDT.R +1ac43ca0f70d46641efe564e4a88307f *tests/testthat/test_splitMulti.R +13ef85ca793408221861ed0c431230e1 *tests/testthat/test_splitting_attributes.R +f5252e245cb34ef3e7fa7a2bd2e0dd4e *tests/testthat/test_splitting_breaks.R +11f9abe161fad1610dbbee668b403aec *tests/testthat/test_splitting_randomly_on_fixed_data.R +ec8db969b4d75f62deb58b62a2dfa92f *tests/testthat/test_splitting_randomly_on_random_data.R +90e108a92926a4841accf61a9dbbacb2 *tests/testthat/test_survmean.R +75804df8000acf42a335b1980caee336 *tests/testthat/test_survtab_adjusted.R +1ad23142b1953104ec794aa29786370e *tests/testthat/test_survtab_bad_surv_ints.R +25de0d1fcab582878f4b9c8e35095adc *tests/testthat/test_survtab_observed.R +5795bc45afee96c7097e03d90cf0b8fe *tests/testthat/test_survtab_relative.R +b512a6be87fa24b76e939308bb377063 *tests/testthat/test_survtab_usage.R +1d9d2c6b52cb3db26c8145faf524bd00 *tests/testthat/test_utils.R +b86120fb9d29ad4c738a31997d2e2782 *tests/testthat/test_weighter.R +fdf79aec5bef33d90c856f410e63cb49 *vignettes/sir.Rmd +d2a16b2d8d107070ffbb239b10b342e9 *vignettes/survtab_examples.Rmd diff --git a/NAMESPACE b/NAMESPACE index cd9a465..fe2a6a6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,88 +1,88 @@ -# Generated by roxygen2: do not edit by hand - -S3method("[",aggre) -S3method("[",rate) -S3method("[",sir) -S3method("[",survmean) -S3method("[",survtab) -S3method("[",yrs) -S3method(as.Date,yrs) -S3method(as.aggre,data.frame) -S3method(as.aggre,data.table) -S3method(as.aggre,default) -S3method(as.data.frame,ratetable) -S3method(as.data.table,ratetable) -S3method(coef,sir) -S3method(confint,sir) -S3method(formula,survmean) -S3method(formula,survtab) -S3method(getCall,rate) -S3method(getCall,sir) -S3method(getCall,survmean) -S3method(getCall,survtab) -S3method(lines,sirspline) -S3method(lines,survmean) -S3method(lines,survtab) -S3method(plot,rate) -S3method(plot,sir) -S3method(plot,sirspline) -S3method(plot,survmean) -S3method(plot,survtab) -S3method(print,aggre) -S3method(print,rate) -S3method(print,sir) -S3method(print,sirspline) -S3method(print,survtab) -S3method(print,yrs) -S3method(subset,aggre) -S3method(subset,rate) -S3method(subset,survmean) -S3method(subset,survtab) -S3method(summary,aggre) -S3method(summary,survtab) -export(Lexis_fpa) -export(RPL) -export(adjust) -export(aggre) -export(all_names_present) -export(as.aggre) -export(cast_simple) -export(cut_bound) -export(expr.by.cj) -export(fac2num) -export(get.yrs) -export(is.Date) -export(is_leap_year) -export(lexpand) -export(lower_bound) -export(ltable) -export(na2zero) -export(poisson.ci) -export(prepExpo) -export(rate) -export(rate_ratio) -export(relpois) -export(relpois_ag) -export(robust_values) -export(rpcurve) -export(setaggre) -export(setcolsnull) -export(sir) -export(sir_ag) -export(sir_exp) -export(sir_lex) -export(sir_ratio) -export(sirspline) -export(splitLexisDT) -export(splitMulti) -export(survmean) -export(survtab) -export(survtab_ag) -export(try2int) -import(Epi) -import(data.table) -import(grDevices) -import(graphics) -import(splines) -import(stats) -importFrom(survival,Surv) +# Generated by roxygen2: do not edit by hand + +S3method("[",aggre) +S3method("[",rate) +S3method("[",sir) +S3method("[",survmean) +S3method("[",survtab) +S3method("[",yrs) +S3method(as.Date,yrs) +S3method(as.aggre,data.frame) +S3method(as.aggre,data.table) +S3method(as.aggre,default) +S3method(as.data.frame,ratetable) +S3method(as.data.table,ratetable) +S3method(coef,sir) +S3method(confint,sir) +S3method(formula,survmean) +S3method(formula,survtab) +S3method(getCall,rate) +S3method(getCall,sir) +S3method(getCall,survmean) +S3method(getCall,survtab) +S3method(lines,sirspline) +S3method(lines,survmean) +S3method(lines,survtab) +S3method(plot,rate) +S3method(plot,sir) +S3method(plot,sirspline) +S3method(plot,survmean) +S3method(plot,survtab) +S3method(print,aggre) +S3method(print,rate) +S3method(print,sir) +S3method(print,sirspline) +S3method(print,survtab) +S3method(print,yrs) +S3method(subset,aggre) +S3method(subset,rate) +S3method(subset,survmean) +S3method(subset,survtab) +S3method(summary,aggre) +S3method(summary,survtab) +export(Lexis_fpa) +export(RPL) +export(adjust) +export(aggre) +export(all_names_present) +export(as.aggre) +export(cast_simple) +export(cut_bound) +export(expr.by.cj) +export(fac2num) +export(get.yrs) +export(is.Date) +export(is_leap_year) +export(lexpand) +export(lower_bound) +export(ltable) +export(na2zero) +export(poisson.ci) +export(prepExpo) +export(rate) +export(rate_ratio) +export(relpois) +export(relpois_ag) +export(robust_values) +export(rpcurve) +export(setaggre) +export(setcolsnull) +export(sir) +export(sir_ag) +export(sir_exp) +export(sir_lex) +export(sir_ratio) +export(sirspline) +export(splitLexisDT) +export(splitMulti) +export(survmean) +export(survtab) +export(survtab_ag) +export(try2int) +import(Epi) +import(data.table) +import(grDevices) +import(graphics) +import(splines) +import(stats) +importFrom(survival,Surv) diff --git a/NEWS.md b/NEWS.md index 0dd5f80..e55f2b0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,137 +1,137 @@ -Changes in 0.4.4 -================ - -- splitLexisDT/splitMulti bug fix: splitting along multiple time scales *sometimes* produced duplicate transitions (e.g. alive -> dead in the last two rows). see for details. -- splitLexisDT/splitMulti now retain time.since attribute; this attribute plays a role in cutLexis -- known issue: splitLexisDT/splitMulti not guaranteed to work identically to splitLexis from Epi when there are NA values in the time scale one is splitting along. - -Changes in 0.4.3 -================ - -- survtab adjusting was broken with older versions of data.table (tested 1.9.6); therefore popEpi now requires the newest version of data.table! - -Changes in 0.4.2 -================ - -- **`survtab()` bug fix: standard errors were misspecificied for adjusted curves, e.g. age-adjusted Ederer II estimates. This resulted in too wide confidence intervals! SEE HERE FOR EAXMPLE: [\#135](https://github.com/WetRobot/popEpi/issues/135)**. The standard errors and confidence intervals of non-adjusted curves have always been correct. -- `survtab()` bug fix: confidence level was always 95 % regardless of `conf.level` [\#134](https://github.com/WetRobot/popEpi/issues/134) - -Changes in 0.4.1 -================ - -- `lexpand()` bug fixed (\#120): observations were dropped if their entry by age was smaller than the smallest age value, though entry at exit is correct and used now. -- `sir()` rewrite (\#118, \#122). New more consistent output, updates on plotting and minor changes in arguments. Introduce very simple `coef()` and `confint()` methods for sir class. -- new functions in sir family: `sir_ag()`, `sir_lex()` and `sir_exp()` for extracting SMRs from `aggre` and `Lexis` objects. -- fixed issue in internal test brought by pkg survival version 2.39.5; no changes in functions were needed (\#125) -- robustified `aggre()`; there were issues with Epi pkg dev version which are definitely avoided (\#119) - -Changes in 0.4.0 -================ - -- removed previously deprecated shift.var (\#35) -- popEpi no longer depends on package data.table but imports it - this means the user will have to do library(data.table) separately to make data.table's functions become usable. Formerly popEpi effectively did library(data.table) when you did library(popEpi). -- summary.survtab: args t and q behaviour changed -- survtab: internal weights now based on counts of subjects in follow-up at the start of follow-up (used to be sum of counts/pyrs over all of follow-up) -- new functions: `rate_ratio()`, `sir_ratio()` -- small internal changes in preparation for data.table 1.9.8 - -Changes in 0.3.1 -================ - -This is a hotfix. survtab() was causing warnings in certain situations, which this update fixes. Also fixed plotting survtab objects so that multiple strata are plotted correctly when one or more curves end before the longest one as well other small fixes: See Github issues \#89, \#90, \#91, and \#92. - -Changes in 0.3.0 -================ - -Adjusting ---------- - -Direct adjusting (computing weighted averages of estimates) has been generalized. Functions such as `survtab` and `survmean` allow for using `adjust()` mini function within formulas, or a separate `adjust` argument. Weights are passed separately. See the examples in the next chapter. See also `?direct_adjusting`. - -Estimating functions of survival time -------------------------------------- - -The `survtab` function computes observed, net/relative and cause-specific survivals as well as cumulative incidence functions for `Lexis` data. Any of the supported survival time functions can be easily adjusted by any number of categorical variables if needed. - -One can also use `survtab_ag` for aggregated data. This means the data does not have to be on the subject-level to compute survival time function estimates. - -``` r -## prep data -data(sibr) -sire$cancer <- "rectal" -sibr$cancer <- "breast" -sr <- rbind(sire, sibr) - -sr$cancer <- factor(sr$cancer) -sr <- sr[sr$dg_date < sr$ex_date, ] - -sr$status <- factor(sr$status, levels = 0:2, - labels = c("alive", "canD", "othD")) - -## create Lexis object -library(Epi) -#> -#> Attaching package: 'Epi' -#> The following object is masked from 'package:base': -#> -#> merge.data.frame -x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), - exit = list(CAL = get.yrs(ex_date)), - data = sr, - exit.status = status) -#> NOTE: entry.status has been set to "alive" for all. - -## population hazards file - see ?pophaz for general instructions -data(popmort) -pm <- data.frame(popmort) -names(pm) <- c("sex", "CAL", "AGE", "haz") - -## simple usage - uses lex.Xst as status variable -st <- survtab(FUT ~ cancer, data = x, - breaks = list(FUT = seq(0, 5, 1/12)), - surv.type = "surv.rel", pophaz = pm) - -## more explicit usage -library(survival) -st <- survtab(Surv(FUT, event = lex.Xst) ~ cancer, data = x, - breaks = list(FUT = seq(0, 5, 1/12)), - surv.type = "surv.rel", pophaz = pm) - - -## adjusting -x$agegr <- cut(x$dg_age, c(0,55,65,75,Inf)) -w <- as.numeric(table(x$agegr)) -st <- survtab(Surv(FUT, event = lex.Xst) ~ cancer + adjust(agegr), - data = x, - breaks = list(FUT = seq(0, 5, 1/12)), - surv.type = "surv.rel", - pophaz = pm, weights = w) -``` - -Rates ------ - -The new `rate` function enables easy calculation of e.g. standardized incidence rates: - -``` r -## dummy data - -a <- merge(0:1, 1:18) -names(a) <- c("sex", "agegroup") -set.seed(1) -a$obs <- rbinom(nrow(a), 100, 0.5) -set.seed(1) -a$pyrs <- rbinom(nrow(a), 1e4, 0.75) - -## so called "world" standard rates (weighted to hypothetical world pop in 2000) -r <- rate(data = a, obs = obs, pyrs = pyrs, print = sex, - adjust = agegroup, weights = 'world_2000_18of5') -#> Warning in pyrDSMNspOBEl * pyrDSMNspOBEl: NAs produced by integer overflow - -#> Warning in pyrDSMNspOBEl * pyrDSMNspOBEl: NAs produced by integer overflow -``` - -| sex| obs| pyrs| rate.adj| SE.rate.adj| rate.adj.lo| rate.adj.hi| rate| SE.rate| rate.lo| rate.hi| -|----:|----:|-------:|----------:|------------:|------------:|------------:|----------:|--------:|----------:|----------:| -| 0| 933| 134986| 0.0069947| 0.0002541| 0.0065140| 0.0075108| 0.0069118| NA| 0.0064822| 0.0073699| -| 1| 875| 134849| 0.0064453| 0.0002429| 0.0059865| 0.0069394| 0.0064887| NA| 0.0060727| 0.0069332| +Changes in 0.4.5 +================ + +- fixed errors arising from new data.table version + +Changes in 0.4.4 +================ + +- splitLexisDT/splitMulti bug fix: splitting along multiple time scales *sometimes* produced duplicate transitions (e.g. alive -> dead in the last two rows). see for details. +- splitLexisDT/splitMulti now retain time.since attribute; this attribute plays a role in cutLexis +- known issue: splitLexisDT/splitMulti not guaranteed to work identically to splitLexis from Epi when there are NA values in the time scale one is splitting along. + +Changes in 0.4.3 +================ + +- survtab adjusting was broken with older versions of data.table (tested 1.9.6); therefore popEpi now requires the newest version of data.table! + +Changes in 0.4.2 +================ + +- **`survtab()` bug fix: standard errors were misspecificied for adjusted curves, e.g. age-adjusted Ederer II estimates. This resulted in too wide confidence intervals! SEE HERE FOR EAXMPLE: [\#135](https://github.com/WetRobot/popEpi/issues/135)**. The standard errors and confidence intervals of non-adjusted curves have always been correct. +- `survtab()` bug fix: confidence level was always 95 % regardless of `conf.level` [\#134](https://github.com/WetRobot/popEpi/issues/134) + +Changes in 0.4.1 +================ + +- `lexpand()` bug fixed (\#120): observations were dropped if their entry by age was smaller than the smallest age value, though entry at exit is correct and used now. +- `sir()` rewrite (\#118, \#122). New more consistent output, updates on plotting and minor changes in arguments. Introduce very simple `coef()` and `confint()` methods for sir class. +- new functions in sir family: `sir_ag()`, `sir_lex()` and `sir_exp()` for extracting SMRs from `aggre` and `Lexis` objects. +- fixed issue in internal test brought by pkg survival version 2.39.5; no changes in functions were needed (\#125) +- robustified `aggre()`; there were issues with Epi pkg dev version which are definitely avoided (\#119) + +Changes in 0.4.0 +================ + +- removed previously deprecated shift.var (\#35) +- popEpi no longer depends on package data.table but imports it - this means the user will have to do library(data.table) separately to make data.table's functions become usable. Formerly popEpi effectively did library(data.table) when you did library(popEpi). +- summary.survtab: args t and q behaviour changed +- survtab: internal weights now based on counts of subjects in follow-up at the start of follow-up (used to be sum of counts/pyrs over all of follow-up) +- new functions: `rate_ratio()`, `sir_ratio()` +- small internal changes in preparation for data.table 1.9.8 + +Changes in 0.3.1 +================ + +This is a hotfix. survtab() was causing warnings in certain situations, which this update fixes. Also fixed plotting survtab objects so that multiple strata are plotted correctly when one or more curves end before the longest one as well other small fixes: See Github issues \#89, \#90, \#91, and \#92. + +Changes in 0.3.0 +================ + +Adjusting +--------- + +Direct adjusting (computing weighted averages of estimates) has been generalized. Functions such as `survtab` and `survmean` allow for using `adjust()` mini function within formulas, or a separate `adjust` argument. Weights are passed separately. See the examples in the next chapter. See also `?direct_adjusting`. + +Estimating functions of survival time +------------------------------------- + +The `survtab` function computes observed, net/relative and cause-specific survivals as well as cumulative incidence functions for `Lexis` data. Any of the supported survival time functions can be easily adjusted by any number of categorical variables if needed. + +One can also use `survtab_ag` for aggregated data. This means the data does not have to be on the subject-level to compute survival time function estimates. + +``` r +## prep data +data(sibr) +sire$cancer <- "rectal" +sibr$cancer <- "breast" +sr <- rbind(sire, sibr) + +sr$cancer <- factor(sr$cancer) +sr <- sr[sr$dg_date < sr$ex_date, ] + +sr$status <- factor(sr$status, levels = 0:2, + labels = c("alive", "canD", "othD")) + +## create Lexis object +library(Epi) +x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), + exit = list(CAL = get.yrs(ex_date)), + data = sr, + exit.status = status) +#> NOTE: entry.status has been set to "alive" for all. + +## population hazards file - see ?pophaz for general instructions +data(popmort) +pm <- data.frame(popmort) +names(pm) <- c("sex", "CAL", "AGE", "haz") + +## simple usage - uses lex.Xst as status variable +st <- survtab(FUT ~ cancer, data = x, + breaks = list(FUT = seq(0, 5, 1/12)), + surv.type = "surv.rel", pophaz = pm) + +## more explicit usage +library(survival) +st <- survtab(Surv(FUT, event = lex.Xst) ~ cancer, data = x, + breaks = list(FUT = seq(0, 5, 1/12)), + surv.type = "surv.rel", pophaz = pm) + + +## adjusting +x$agegr <- cut(x$dg_age, c(0,55,65,75,Inf)) +w <- as.numeric(table(x$agegr)) +st <- survtab(Surv(FUT, event = lex.Xst) ~ cancer + adjust(agegr), + data = x, + breaks = list(FUT = seq(0, 5, 1/12)), + surv.type = "surv.rel", + pophaz = pm, weights = w) +``` + +Rates +----- + +The new `rate` function enables easy calculation of e.g. standardized incidence rates: + +``` r +## dummy data + +a <- merge(0:1, 1:18) +names(a) <- c("sex", "agegroup") +set.seed(1) +a$obs <- rbinom(nrow(a), 100, 0.5) +set.seed(1) +a$pyrs <- rbinom(nrow(a), 1e4, 0.75) + +## so called "world" standard rates (weighted to hypothetical world pop in 2000) +r <- rate(data = a, obs = obs, pyrs = pyrs, print = sex, + adjust = agegroup, weights = 'world_2000_18of5') +#> Warning in pyrDSMNspOBEl * pyrDSMNspOBEl: NAs produced by integer overflow + +#> Warning in pyrDSMNspOBEl * pyrDSMNspOBEl: NAs produced by integer overflow +``` + +| sex| obs| pyrs| rate.adj| SE.rate.adj| rate.adj.lo| rate.adj.hi| rate| SE.rate| rate.lo| rate.hi| +|----:|----:|-------:|----------:|------------:|------------:|------------:|----------:|--------:|----------:|----------:| +| 0| 933| 134986| 0.0069947| 0.0002541| 0.0065140| 0.0075108| 0.0069118| NA| 0.0064822| 0.0073699| +| 1| 875| 134849| 0.0064453| 0.0002429| 0.0059865| 0.0069394| 0.0064887| NA| 0.0060727| 0.0069332| diff --git a/R/S3_definitions.R b/R/S3_definitions.R index 514f9db..60d8121 100644 --- a/R/S3_definitions.R +++ b/R/S3_definitions.R @@ -1,1473 +1,1473 @@ - - -#' @export -print.sir <- function(x, subset = NULL, ...) { - - at <- attributes(x)$sir.meta - PF <- parent.frame(1L) - subset <- evalLogicalSubset(x, substitute(subset), enclos = PF) - x <- x[subset, ] - setDT(x) - - t1 <- paste0("SIR (adjusted by ", paste(at$adjust, collapse = ', '),')', - ' with ', at$conf.level*100, '% ', 'confidence intervals (', at$conf.type,')') - - - # cat - t3 <- paste0(' Total sir: ', round(at$pooled.sir$sir,2),' (', - round(at$pooled.sir$sir.lo,2),'-', round(at$pooled.sir$sir.hi, 2),')\n', - ' Total observed: ', at$pooled.sir$observed, '\n', - ' Total expected: ', round(at$pooled.sir$expected,2), '\n', - ' Total person-years: ', round(at$pooled.sir$pyrs)) - - rv <- intersect(names(x), c('sir','sir.lo','sir.hi','observed','expected','pyrs')) - if (length(rv)) { - x[, (rv) := lapply(.SD, round, digits = 2L), .SDcols = rv] - } - - rv <- intersect(names(x), c('p_value')) - if (length(rv)) { - x[, (rv) := lapply(.SD, round, digits = 4), .SDcols = rv] - } - - - if(is.null(at$lrt.test)) { - d <- paste("Could not test", at$lrt.test.type) - } else { - if(at$lrt.test.type == 'homogeneity') { - d <- paste("Test for homogeneity: p", p.round( c(at$lrt.test))) - } - if(at$lrt.test.type == 'trend') { - d <- paste("Test for trend: p", p.round( c(at$lrt.test))) - } - } - #b <- round(c(ta$total$sir, ta$total$sir.lo, ta$total$sir.hi), 2) - # cat('\n',"Total observed", ta$total$observed, '\n', - # "Total expected:", ta$total$expected, '\n', - # "SIR:", paste0(b[1], ' (',b[2], '-',b[3],')'), '\n', - # "Person-years:", ta$total$pyrs, '\n', - # fill=TRUE) - - cat(t1, '\n') - if(x[,.N] > 1) { - cat(d, '\n') - } - cat(fill=TRUE) - cat(t3, '\n', fill=TRUE) - - print(data.table(x), ...) - return(invisible()) -} - -#' @export -`[.sir` <- function(x, ...) { - y <- NextMethod() - if (is.data.frame(y)) { - setattr(y, "class", class(x)) - setattr(y, "sir.meta", attr(x, "sir.meta")) - } - y -} - - -#' @import grDevices -#' @export -print.sirspline <- function(x, ...) { - if ( x$spline.dependent ) { - if( any( !is.na(x$p.values))) { - cat( 'global p-value:', p.round(x$p.values[1]),'\n' ) - cat( 'level p-value:', p.round(x$p.values[2]) , fill= TRUE) - } else { - cat( 'No models compared.', fill= TRUE) - } - cat('---', '\n') - cat('Colour codes:', '\n', fill=TRUE) - } else { - - for(i in 1:length(x$p.values)) { - cat( x$spline[i] ,': p ', p.round( x$p.values[[i]] ), '\n', sep = '') - } - cat(fill=TRUE) - - } - # Print colour codes: - cols <- unique(x$spline.est.A[,1]) - col.length <- length(cols) - print( data.frame(levels = cols, colour = palette()[1:col.length]), include.rownames = FALSE) - - # Print p-values - return(invisible()) -} - - -#' Plot method for sir-object -#' -#' Plot SIR estimates with error bars -#' -#' @seealso \code{\link{sir}}, \code{\link{sirspline}} -#' -#' @import graphics -#' -#' @author Matti Rantanen -#' -#' @param x an object returned by function \code{sir} -#' @param conf.int default TRUE draws confidence intervals -#' @param xlab overwrites default x-axis label -#' @param ylab overwrites default y-axis label -#' @param xlim x-axis minimum and maximum values -#' @param main optional plot title -#' @param abline logical; draws a grey line in SIR = 1 -#' @param log logical; SIR is not in log scale by default -#' @param eps error bar vertical bar height (works only in 'model' or 'univariate') -#' @param left.margin adjust left marginal of the plot to fit long variable names -#' @param ... arguments passed on to plot(), segment and lines() -#' -#' -#' @details Plot SIR estimates and confidence intervals -#' \itemize{ -#' \item univariate - plots SIR with univariate confidence intervals -#' \item model - plots SIR with Poisson modelled confidence intervals -#' } -#' -#' \strong{Customize} -#' Normal plot parameters can be passed to \code{plot}. These can be a vector when plotting error bars: -#' \itemize{ -#' \item \code{pch} - point type -#' \item \code{lty} - line type -#' \item \code{col} - line/point colour -#' \item \code{lwd} - point/line size -#' } -#' -#' \strong{Tips for plotting splines} -#' It's possible to use \code{plot} to first draw the -#' confidence intervals using specific line type or colour and then plotting -#' again the estimate using \code{lines(... , conf.int = FALSE)} with different -#' settings. This works only when \code{plot.type} is 'splines'. -#' -#' -#' @examples -#' \dontrun{ -#' # Plot SIR estimates -#'# plot(sir.by.gender, col = c(4,2), log=FALSE, eps=0.2, lty=1, lwd=2, pch=19, -#'# main = 'SIR by gender', abline=TRUE) -#' } -#' @export - -plot.sir <- function(x, conf.int = TRUE, ylab, xlab, xlim, main, - eps=0.2, abline = TRUE, log = FALSE, left.margin, ...) { - - a <- data.table(x) - at <- attributes(x)$sir.meta - level_names <- at$print - - if(is.null(level_names)) { - levels <- 'Crude' - level_names <- levels - } - else { - q <- paste0('paste(', paste(level_names, collapse=', '),', sep = ":")' ) - q <- parse(text = q) - levels <- a[, eval(q)] - } - - # predefined parameters - if( missing(main) ){ - main <- NA - } - if( missing(xlab) ){ - xlab <- 'SIR' - } - if( missing(ylab) ){ - ylab <- NA - } - if( missing(xlim) ) { - xlimit <- c(min(a$sir.lo[a$sir.lo 1) { - save_par <- par(no.readonly = TRUE) - par(mfrow=c(1,sum(plotdim))) - type <- 'l' - } - - ## set labels - if ( missing(xlab) ) { - xlab <- x$spline - } - - if ( missing(ylab) ) { - ylab <- rep('SIR',sum(plotdim)) - if(log){ - ylab <- rep('log(SIR)', sum(plotdim)) - } - if(x$spline.dependent & sum(plotdim) > 1) { - ylab <- c(ylab[1], paste(ylab[2:sum(plotdim)], 'ratio')) - } - } - else{ - if( length(ylab) < sum(plotdim)) - ylab <- rep(ylab, sum(plotdim)) - if(length(ylab) > sum(plotdim)) { - warning('set ylabs in a vector length of num of plots (',sum(plotdim),')') - } - } - - ## set scale - if(!is.logical(log)) stop('log should be a logical value.') - log.bin <- ifelse(log, 'y', '') - - ## remove infinite values - #rm_inf <- function(est){ - # x[[est]][ is.finite(x[[est]][[2]]) & is.finite(x[[est]][[3]]) & is.finite(x[[est]][[4]]), ] - #} - - spl <- c('spline.seq.A', 'spline.seq.B', 'spline.seq.C')[1:sum(plotdim)] - est <- gsub("seq", "est", spl) - - for (i in 1:sum(plotdim)) { # age, per, fot, - # empty plot - max_x <- range(x[[spl[i]]]) - max_y <- range( x[[est[i]]][, 2:4] ) - plot(max_x, max_y, type = 'n', ylab = ylab[i], xlab = xlab[i], log = log.bin, ...) - if(abline) abline(h = 1) - - # plot lines - if (missing(type) || type != 'n') { - lines.sirspline(x, conf.int = conf.int, select.spline = i, ...) - } - } - if(sum(plotdim) > 1) { - par(save_par) - } - return(invisible()) -} - - - -#' @title lines method for sirspline-object -#' @description Plot SIR spline lines with R base graphics -#' -#' -#' @author Matti Rantanen -#' -#' @param x an object returned by function sirspline -#' @param conf.int logical; default TRUE draws also the 95 confidence intervals -#' @param print.levels name(s) to be plotted. Default plots all levels. -#' @param select.spline select which spline variable (a number or a name) is plotted. -#' @param ... arguments passed on to lines() -#' -#' @details In \code{lines.sirspline} most of graphical parameters is user -#' adjustable. -#' Desired spline variable can be selected with \code{select.spline} and only one -#' can be plotted at a time. The spline variable can include -#' several levels, e.g. gender (these are the levels of \code{print} -#' from \code{sirspline}). All levels are printed by default, but a -#' specific level can be selected using argument -#' \code{print.levels}. Printing the levels separately enables e.g. to -#' give different colours for each level. -#' -#' @family sir functions -#' -#' @import graphics -#' @export -lines.sirspline <- function(x, conf.int = TRUE, print.levels = NA, select.spline, ... ){ - ## input: sirspline object, with only one spline var (spline.est.A) - ## input: print levels can be > 1. - - ## subset splines - if( length(x$spline) > 1 ) { - if ( missing(select.spline) ) { - stop(paste('select what spline to plot in select.spline:', paste(x$spline, collapse = ', '))) - } - else { - if(is.numeric(select.spline)) { - k <- select.spline - } - else { - k <- which(x$spline == select.spline) - } - if(length(k) == 0 | length(x$spline) < k) stop('select.spline name/number is incorrect') - } - } - else { - k <- 1 - } - - spl <- c('spline.seq.A', 'spline.seq.B', 'spline.seq.C')[k] - est <- gsub("seq", "est", spl) - - ## remove infinite values - # x[[h]] <- rm_inf(est=h) - - # get print levels - if(missing(print.levels)) { - print.levels <- NA - } - pl <- unique(x$spline.est.A[,1]) - if(any( is.null(print.levels), is.na(print.levels))) { - print.levels <- pl - } - pl <- pl[ pl %in% print.levels] - - ## get conf.int - if( !is.logical(conf.int) ) stop('conf.int is not logical') - n <- c(2,4)[c(!conf.int, conf.int)] - - - ## draw lines - for( l in pl ){ - # loop through print.levels - index <- which(x$spline.est.A$i == l) - - for(m in 2:n) { - # loop through estiamte and confidence intervals - lines(x = x[[spl]], y = x[[est]][index, m], ...) - } - } -} - -#' @title Print an rate object -#' @author Matti Rantanen -#' @description Print method function for \code{rate} objects; see -#' \code{\link{rate}}. -#' @param x an \code{rate} object -#' @param subset a logical condition to subset results table by -#' before printing; use this to limit to a certain stratum. E.g. -#' \code{subset = sex == "female"} -#' @param ... arguments for data.tables print method, e.g. row.names = FALSE suppresses row numbers. -#' @export -print.rate <- function(x, subset = NULL, ...) { - - ra <- attributes(x)$rate.meta - PF <- parent.frame(1L) - TF <- environment() - subset <- evalLogicalSubset(x, substitute(subset), enclos = PF) - x <- x[subset, ] - - # pre texts: - cat('\n') - if(!is.null(ra$adjust)){ - if(is.character(ra$weights)) { - a <- paste(ra$weights, collapse = ',') - } - if(all(is.numeric(ra$weights))) { - a <- length(ra$weights) - } - if(is.list(ra$weights)) { - a <- sapply(ra$weights, length) - } - - b <- paste(ra$adjust,a, collapse = ', ', sep = '; ') - cat('Adjusted rates (', b,') ', sep = '') - } - else{ - cat('Crude rates ') - } - cat('and', '95%', 'confidence intervals:', fill=TRUE) - cat('\n') - # table itself - - - setDT(x) - print(x, ...) -} - - -#' @title plot method for rate object -#' @description Plot rate estimates with confidence intervals lines using R base graphics -#' @author Matti Rantanen -#' -#' @param x a rate object (see \code{\link{rate}}) -#' @param conf.int logical; default TRUE draws the confidence intervals -#' @param eps is the height of the ending of the error bars -#' @param left.margin set a custom left margin for long variable names. Function -#' tries to do it by default. -#' @param xlim change the x-axis location -#' @param ... arguments passed on to graphical functions points and segment -#' (e.g. \code{col}, \code{lwd}, \code{pch} and \code{cex}) -#' -#' @details This is limited explanatory tool but most graphical -#' parameters are user adjustable. -#' -#' @import graphics -#' @export -plot.rate <- function(x, conf.int = TRUE, eps = 0.2, left.margin, xlim, ...) { - - ra <- attributes(x)$rate.meta - varcol <- ra$print - - if(is.null(varcol)) { - lvl.name <- 'Crude' - } - else { - pp <- paste0('paste(', paste(varcol, collapse=','),',sep = ":")') - q <- parse(text=pp) - lvl.name <- x[,eval(q)] - } - lvls <- 1:length(lvl.name) - - # WHICH RATE: - if('rate.adj' %in% names(x)) { - r <- x$rate.adj - hi <- x$rate.adj.hi - lo <- x$rate.adj.lo - } - else { - r <- x$rate - hi <- x$rate.hi - lo <- x$rate.lo - } - # X-AXIS LIMITs - if(missing(xlim)) { - t <- range(na.omit(c(lo , r, hi))) - t0 <- (t[2]-t[1])/4 - xlimit <- c(pmax(t[1]-t0, 0), t[2] + t0) - } - else { - xlimit <- xlim - } - - # MARGINS - if(missing(left.margin)) { - old.margin <- new.margin <- par("mar") - new.margin[2] <- 4.1 + sqrt( max(nchar(as.character(lvl.name))) )*2 - } - else { - new.margin[2] <- left.margin - } - par(mar = new.margin) - - plot(c(xlimit), c(min(lvls)-0.5, max(lvls)+0.5), type='n', yaxt = 'n', ylab = '', xlab='') - axis(side = 2, at = lvls, labels = lvl.name, las = 1) - points(r, lvls, ...) - - if(conf.int) { - segments(lo, lvls, hi, lvls, ...) - segments(lo, lvls - eps, lo, lvls + eps, ...) - segments(hi, lvls - eps, hi, lvls + eps, ...) - } - par(mar = old.margin) # revert margins -} - - -#' @export -print.yrs <- function(x, ...) { - print(as.numeric(x)) -} - - -#' @export -`[.yrs` <- function(x, ...) { - yl <- attr(x, "year.length") - structure(NextMethod(), year.length = yl, class = c("yrs", "numeric")) -} - - -#' @export -`[.aggre` <- function(x, ...) { - xa <- attributes(x) - y <- NextMethod() - if (is.data.frame(y)) { - setattr(y, "class", xa$class) - setattr(y, "aggre.meta", xa$aggre.meta) - setattr(y, "breaks", xa$breaks) - } - y -} - -#' @export -subset.aggre <- function(x, ...) { - y <- NextMethod() - if (is.data.frame(y)) { - setattr(y, "class", class(x)) - setattr(y, "aggre.meta", attr(x, "aggre.meta")) - setattr(y, "breaks", attr(x, "breaks")) - } - y -} - -preface_survtab.print <- function(x) { - surv.int <- NULL ## APPEASE R CMD CHECK - at <- attributes(x)$survtab.meta - arg <- at$arguments - - cat("\n") - cat("Call: \n", oneWhitespace(deparse(at$call)), "\n") - cat("\n") - cat("Type arguments: \n surv.type:", as.character(arg$surv.type), - "--- surv.method:", as.character(arg$surv.method)) - if (as.character(arg$surv.type) == "surv.rel") - cat(" --- relsurv.method:", as.character(arg$relsurv.method)) - cat("\n \n") - cat("Confidence interval arguments: \n level:", - as.character(arg$conf.level*100), "%") - cat(" --- transformation:", - as.character(arg$conf.type)) - cat("\n \n") - cat("Totals:") - totCat <- paste0("\n person-time:", round(sum(x$pyrs))) - if (arg$surv.method == "lifetable") { - totCat <- paste0("\n at-risk at T=0: ", round(sum(x[surv.int == 1L]$n))) - } - - cat(totCat) - cat(" --- events:", sum(x$d)) - cat("\n \n") - if (length(at$print.vars) > 0L) { - cat("Stratified by:", paste0("'", at$print.vars, "'", collapse = ", ")) - if (length(at$adjust.vars) > 0L) cat(" --- ") - } - if (length(at$adjust.vars) > 0L) { - cat("Adjusted by:", paste0("'", at$adjust.vars, "'", collapse = ", ")) - } - cat("\n") - invisible() -} - - -#' @title Print an \code{aggre} Object -#' @author Joonas Miettinen -#' @description Print method function for \code{aggre} objects; see -#' \code{\link{as.aggre}} and \code{\link{aggre}}. -#' @param x an \code{aggre} object -#' @param subset a logical condition to subset results table by -#' before printing; use this to limit to a certain stratum. E.g. -#' \code{subset = sex == "male"} -#' @param ... arguments passed to \code{print.data.table}; try e.g. -#' \code{top = 2} for numbers of rows in head and tail printed -#' if the table is large, -#' \code{nrow = 100} for number of rows to print, etc. -#' @export -print.aggre <- function(x, subset = NULL, ...) { - - PF <- parent.frame(1L) - TF <- environment() - sa <- attributes(x)$aggre.meta - - subset <- evalLogicalSubset(x, substitute(subset), enclos = PF) - x <- x[subset, ] - setDT(x) - - print(x, ...) - -} - -#' @title Summarize an \code{aggre} Object -#' @author Joonas Miettinen -#' @description \code{summary} method function for \code{aggre} objects; see -#' \code{\link{as.aggre}} and \code{\link{aggre}}. -#' @param object an \code{aggre} object -#' @param by list of columns to summarize by - e.g. \code{list(V1, V2)} -#' where \code{V1} and \code{V2} are columns in the data. -#' @param subset a logical condition to subset results table by -#' before summarizing; use this to limit to a certain stratum. E.g. -#' \code{subset = sex == "male"} -#' @param ... unused -#' @export -#' @family aggregation functions -summary.aggre <- function(object, by = NULL, subset = NULL, ...) { - - PF <- parent.frame(1L) - TF <- environment() - x <- object - sa <- attributes(x)$aggre.meta - - subset <- evalLogicalSubset(x, substitute(subset), enclos = PF) - x <- x[subset, ] - setDT(x) - - bys <- substitute(by) - bye <- evalPopArg(x, bys, enclos = environment(), types = c("list", "NULL")) - - vals <- sa$values - vals <- intersect(names(x), vals) - if (!length(vals)) { - cat("No originally created value columns appear to be left in data.") - } - r <- x[, lapply(.SD, sum), by = eval(bye), .SDcols = vals] - r -} - -#' @title Print a survtab Object -#' @author Joonas Miettinen -#' @description Print method function for \code{survtab} objects; see -#' \code{\link{survtab_ag}}. -#' @param x a \code{survtab} object -#' @param subset a logical condition to subset results table by -#' before printing; use this to limit to a certain stratum. E.g. -#' \code{subset = sex == "male"} -#' @param ... arguments passed to \code{print.data.table}; try e.g. -#' \code{top = 2} for numbers of rows in head and tail printed -#' if the table is large, -#' \code{nrow = 100} for number of rows to print, etc. -#' @export -#' @family survtab functions -print.survtab <- function(x, subset = NULL, ...) { - - Tstart <- Tstop <- NULL ## APPEASE R CMD CHECK - - PF <- parent.frame(1L) - TF <- environment() - sa <- attributes(x)$survtab.meta - - subset <- evalLogicalSubset(x, substitute(subset), enclos = PF) - x <- x[subset, ] - - preface_survtab.print(x) - - setDT(x) - - if (nrow(x) == 0L) { - print(x) - return(invisible()) - } - - pv <- as.character(sa$print.vars) - if (length(pv) == 0L) pv <- NULL - - magicMedian <- function(x) { - if (length(x) %% 2L == 0L) median(x[-1L], na.rm = TRUE) else - median(x, na.rm = TRUE) - } - - ## to avoid e.g. 'factor(V1, 1:2)' going bonkers - pv_orig <- pv - if (length(pv) > 0L) { - pv <- makeTempVarName(x, pre = paste0("print_", 1:length(pv))) - setnames(x, pv_orig, pv) - } - - medmax <- x[, list(Tstop = c(magicMedian(c(min(Tstart),Tstop)), max(Tstop))), keyby = eval(pv)] - - setkeyv(medmax, c(pv, "Tstop")) - setkeyv(x, c(pv, "Tstop")) - x <- x[medmax] - - rv <- intersect(names(x), c(sa$est.vars, sa$CI.vars, sa$misc.vars)) - if (length(rv)) { - x[, (rv) := lapply(.SD, round, digits = 4L), .SDcols = rv] - } - - sv <- intersect(names(x), sa$SE.vars) - if (length(sv > 0L)) { - x[, c(sv) := lapply(.SD, signif, digits = 4L), .SDcols = sv] - } - - - setcolsnull(x, keep = c(pv, "Tstop", sa$surv.vars), colorder = TRUE) - if (length(pv)) setnames(x, pv, pv_orig) - print(data.table(x), ...) - invisible() -} - -#' @title Summarize a survtab Object -#' @author Joonas Miettinen -#' @description Summary method function for \code{survtab} objects; see -#' \code{\link{survtab_ag}}. Returns estimates at given time points -#' or all time points if \code{t} and \code{q} are both \code{NULL}. -#' @param object a \code{survtab} object -#' @param t a vector of times at which time points (actually intervals that -#' contain t) to print summary table of survival function estimates by strata; -#' values not existing in any interval cause rows containing only \code{NAs} to -#' be returned. -#' @param q a named \code{list} of quantiles to include in returned data set, -#' where names must match to estimates in \code{object}; -#' returns intervals where the quantiles are reached first; -#' e.g. \code{list(surv.obs = 0.5)} finds the interval where \code{surv.obs} -#' is 0.45 and 0.55 at the beginning and end of the interval, respectively; -#' returns rows with \code{NA} values for quantiles not reached in estimates -#' (e.g. if \code{q = list(surv.obs = 0.5)} but lowest estimate is 0.6); -#' see Examples. -#' @param subset a logical condition to subset results table by -#' before printing; use this to limit to a certain stratum. E.g. -#' \code{subset = sex == "male"} -#' @param ... unused; required for congruence with other \code{summary} methods -#' -#' @details -#' Note that this function returns the intervals and NOT the time points -#' corresponding to quantiles / estimates corresponding to time points. -#' If you want precise estimates at time points that are not interval breaks, -#' add the time points as breaks and re-estimate the survival time function. -#' In interval-based estimation, the estimates denote e.g. probability of -#' dying \emph{during} the interval, so time points within the intervals -#' are not usually considered at all. See e.g. Seppa, Dyba, and Hakulinen -#' (2015). -#' -#' @references -#' Seppa K., Dyba T. and Hakulinen T.: Cancer Survival, -#' Reference Module in Biomedical Sciences. Elsevier. 08-Jan-2015 -#' doi: 10.1016/B978-0-12-801238-3.02745-8. -#' -#' @examples -#' -#' library(Epi) -#' library(survival) -#' -#' ## NOTE: recommended to use factor status variable -#' x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), -#' exit = list(CAL = get.yrs(ex_date)), -#' data = sire[sire$dg_date < sire$ex_date, ], -#' exit.status = factor(status, levels = 0:2, -#' labels = c("alive", "canD", "othD")), -#' merge = TRUE) -#' ## pretend some are male -#' set.seed(1L) -#' x$sex <- rbinom(nrow(x), 1, 0.5) -#' ## observed survival -#' st <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, -#' surv.type = "cif.obs", -#' breaks = list(FUT = seq(0, 5, 1/12))) -#' -#' ## estimates at full years of follow-up -#' summary(st, t = 1:5) -#' -#' ## interval estimate closest to 75th percentile, i.e. -#' ## first interval where surv.obs < 0.75 at end -#' ## (just switch 0.75 to 0.5 for median survival, etc.) -#' summary(st, q = list(surv.obs = 0.75)) -#' ## multiple quantiles -#' summary(st, q = list(surv.obs = c(0.75, 0.90), CIF_canD = 0.20)) -#' -#' ## if you want all estimates in a new data.frame, you can also simply do -#' -#' x <- as.data.frame(st) -#' -#' @export -#' @family survtab functions -summary.survtab <- function(object, t = NULL, subset = NULL, q = NULL, ...) { - - PF <- parent.frame(1L) - at <- copy(attr(object, "survtab.meta")) - subr <- copy(at$surv.breaks) - - if (!is.null(t) && !is.null(q)) { - stop("Only supply either t or q.") - } - - sb <- substitute(subset) - subset <- evalLogicalSubset(object, sb, enclos = PF) - x <- object[subset, ] - - ## to avoid e.g. 'factor(V1, 1:2)' going bonkers - pv_orig <- pv <- at$print.vars - if (length(pv) > 0L) { - pv <- makeTempVarName(x, pre = paste0("print_", 1:length(pv))) - setnames(x, pv_orig, pv) - } - - setDT(x) - - ## quantile detection -------------------------------------------------------- - if (!is.null(q)) { - bn <- setdiff(names(q), at$est.vars) - if (length(bn) > 0L) { - stop("No survival time function estimates named ", - paste0("'", bn, "'", collapse = ", "), - " found in supplied survtab object. Available ", - "survival time function estimates: ", - paste0("'", at$est.vars, "'", collapse = ", ")) - } - - lapply(q, function(x) { - if (min(x <= 0L) || max(x >= 1L)) { - stop("Quantiles must be expressed as numbers between 0 and 1, ", - "e.g. surv.obs = 0.5.") - } - }) - - - m <- x[, .SD[1, ], keyby = eval(pv)][, c(pv, "Tstop"), with = FALSE] - setDF(m) - - rollVars <- makeTempVarName(x, pre = names(q)) - x[, c(rollVars) := lapply(.SD, copy), .SDcols = names(q)] - - m <- lapply(seq_along(q), function(i) { - m <- merge(m, q[[i]]) - setnames(m, "y", rollVars[i]) - if (length(pv)) setorderv(m, pv) - m[, c(pv, rollVars[i]), drop = FALSE] - }) - names(m) <- names(q) - - l <- vector("list", length(q)) - names(l) <- names(q) - for (k in names(q)) { - - l[[k]] <- setDT(x[m[[k]], on = names(m[[k]]), roll = 1L]) - - } - l <- rbindlist(l) - set(l, j = rollVars, value = NULL) - if (length(pv)) setkeyv(l, pv) - x <- l - } - - ## time point detection ------------------------------------------------------ - - if (!is.null(t)) { - - tcutv <- makeTempVarName(x, pre = "cut_time_") - - set(x, j = tcutv, value = cut(x$Tstop, breaks = subr, right = T, - include.lowest = F)) - cutt <- cut(t, breaks = subr, right = T, include.lowest = F) - - l <- list(cutt) - names(l) <- tcutv - if (length(pv)) { - pvdt <- setDF(unique(x, by = pv))[, pv, drop = FALSE] - l <- setDT(merge(pvdt, as.data.frame(l))) - setkeyv(l, pv) - } - - x <- x[l, on = c(pv, tcutv)] - - set(x, j = tcutv, value = NULL) - if (length(pv)) setkeyv(x, pv) - } - - - ## final touches ------------------------------------------------------------- - if (length(pv) > 0L) setnames(x, pv, pv_orig) - - if (!return_DT()) setDFpe(x) - - x -} - -#' @export -`[.survtab` <- function(x, ...) { - y <- NextMethod() - if (is.data.frame(y)) { - setattr(y, "class", class(x)) - setattr(y, "survtab.meta", attr(x, "survtab.meta")) - } - y -} - -#' @export -subset.survtab <- function(x, ...) { - y <- NextMethod() - if (is.data.frame(y)) { - setattr(y, "class", class(x)) - setattr(y, "survtab.meta", attr(x, "survtab.meta")) - } - y -} - - -#' @export -`[.survmean` <- function(x, ...) { - y <- NextMethod() - if (is.data.frame(y)) { - setattr(y, "class", class(x)) - setattr(y, "survmean.mean", attr(x, "survmean.mean")) - } - y -} - - -#' @export -subset.survmean <- function(x, ...) { - y <- NextMethod() - if (is.data.frame(y)) { - setattr(y, "class", class(x)) - setattr(y, "survmean.mean", attr(x, "survmean.mean")) - } - y -} - -#' @export -`[.rate` <- function(x, ...) { - y <- NextMethod() - if (is.data.frame(y)) { - setattr(y, "class", class(x)) - setattr(y, "rate.meta", attr(x, "rate.meta")) - } - y -} - -#' @export -subset.rate <- function(x, ...) { - y <- NextMethod() - if (is.data.frame(y)) { - setattr(y, "class", class(x)) - setattr(y, "rate.meta", attr(x, "rate.meta")) - } - y -} - - - - - -prep_plot_survtab <- function(x, - y = NULL, - subset = NULL, - conf.int = TRUE, - enclos = parent.frame(1L), - ...) { - - ## subsetting ---------------------------------------------------------------- - subset <- evalLogicalSubset(data = x, substiset = substitute(subset), - enclos = environment()) - - attrs <- attributes(x) - - if (!inherits(x, "survtab")) stop("x is not a survtab object") - if (is.null(attrs$survtab.meta)) { - stop("Missing meta information (attributes) in survtab object; ", - "have you tampered with it after estimation?") - } - strata.vars <- attrs$survtab.meta$print.vars - x <- copy(x) - setDT(x) - x <- x[subset, ] - - ## detect survival variables in data ----------------------------------------- - surv_vars <- c("surv.obs","CIF.rel","CIF_","r.e2","r.pp") - wh <- NULL - for (k in surv_vars) { - wh <- c(wh, which(substr(names(x), 1, nchar(k)) == k)) - } - surv_vars <- names(x)[wh] - surv_vars <- surv_vars[!substr(surv_vars, nchar(surv_vars)-1, nchar(surv_vars)) %in% c("hi","lo")] - if (length(surv_vars) == 0) { - stop("x does not appear to have any survival variables; ", - "did you tamper with it after estimation?") - } - - - ## getting y ----------------------------------------------------------------- - if (!is.null(y)) { - if (!is.character(y)) { - stop("please supply y as a character string indicating ", - "the name of a variable in x") - } - if (length(y) > 1) stop("y must be of length 1 or NULL") - if (!all_names_present(x, y, stops = FALSE)) { - stop("Given survival variable in argument 'y' ", - "not present in survtab object ('", y, "')") - } - } else { - y <- surv_vars[length(surv_vars)] - if (length(surv_vars) > 1L) message("y was NULL; chose ", y, " automatically") - } - rm(surv_vars) - - if (substr(y, 1, 3) == "CIF" && conf.int) { - stop("No confidence intervals currently supported for CIFs. ", - "Hopefully they will be added in a future version; ", - "meanwhile use conf.int = FALSE when plotting CIFs.") - } - - - ## confidence intervals ------------------------------------------------------ - y.lo <- y.hi <- y.ci <- NULL - if (conf.int) { - - y.lo <- paste0(y, ".lo") - y.hi <- paste0(y, ".hi") - y.ci <- c(y.lo, y.hi) - - badCIvars <- setdiff(y.ci, names(x)) - if (sum(length(badCIvars))) { - stop("conf.int = TRUE, but missing confidence interval ", - "variables in data for y = '", y, "' (could not detect ", - "variables named", paste0("'", badCIvars, "'", collapse = ", ") ,")") - } - - } - - list(x = x, y = y, y.ci = y.ci, y.lo = y.lo, y.hi = y.hi, - strata = strata.vars, attrs = attrs) - -} - - - - - -#' \code{plot} method for survtab objects -#' -#' Plotting for \code{survtab} objects -#' -#' @import graphics -#' -#' @author Joonas Miettinen -#' -#' @param x a \code{survtab} output object -#' @param y survival a character vector of a variable names to plot; -#' e.g. \code{y = "r.e2"} -#' @param subset a logical condition; \code{obj} is subset accordingly -#' before plotting; use this for limiting to specific strata, -#' e.g. \code{subset = sex == "male"} -#' @param conf.int logical; if \code{TRUE}, also plots any confidence intervals -#' present in \code{obj} for variables in \code{y} -#' @param col line colour; one value for each stratum; will be recycled -#' @param lty line type; one value for each stratum; will be recycled -#' @param ylab label for Y-axis -#' @param xlab label for X-axis -#' @param ... additional arguments passed on to \code{plot} and -#' \code{lines.survtab}; e.g. \code{ylim} can be defined this way -#' @examples -#' data(sire) -#' data(sibr) -#' si <- rbind(sire, sibr) -#' si$period <- cut(si$dg_date, as.Date(c("1993-01-01", "2004-01-01", "2013-01-01")), right = FALSE) -#' si$cancer <- c(rep("rectal", nrow(sire)), rep("breast", nrow(sibr))) -#' x <- lexpand(si, birth = bi_date, entry = dg_date, exit = ex_date, -#' status = status %in% 1:2, -#' fot = 0:5, aggre = list(cancer, period, fot)) -#' st <- survtab_ag(fot ~ cancer + period, data = x, -#' surv.method = "lifetable", surv.type = "surv.obs") -#' -#' plot(st, "surv.obs", subset = cancer == "breast", ylim = c(0.5, 1), col = "blue") -#' lines(st, "surv.obs", subset = cancer == "rectal", col = "red") -#' -#' ## or -#' plot(st, "surv.obs", col = c(2,2,4,4), lty = c(1, 2, 1, 2)) -#' @export -#' @family survtab functions -plot.survtab <- function(x, y = NULL, subset=NULL, conf.int=TRUE, col=NULL,lty=NULL, ylab = NULL, xlab = NULL, ...) { - - Tstop <- delta <- NULL ## APPEASE R CMD CHECK - ## prep ---------------------------------------------------------------------- - PF <- parent.frame(1L) - subset <- substitute(subset) - subset <- evalLogicalSubset(data = x, subset, enclos = PF) - - l <- prep_plot_survtab(x = x, y = y, subset = subset, - conf.int = conf.int, enclos = PF) - x <- l$x - y <- l$y - y.ci <- l$y.ci - y.lo <- l$y.lo - y.hi <- l$y.hi - - ## figure out limits, etc. to pass to plot() --------------------------------- - - min_y <- do.call("min", c(mget(c(y, y.lo), as.environment(x)), na.rm = TRUE)) - min_y <- max(min_y, 0) - max_y <- max(x[[y]], na.rm=TRUE) - - if (substr(y, 1, 3) == "CIF") { - min_y <- 0.0 - } else { - max_y <- max(1.0, max_y) - } - - max_x <- max(x[, Tstop]) - min_x <- min(x[, Tstop-delta]) - - if (is.null(ylab)) { - ylab <- "Observed survival" - if (substr(y[1], 1,4) %in% c("r.e2", "r.pp")) ylab <- "Net survival" - if (substr(y[1], 1,4) == "CIF_") ylab <- "Absolute risk" - if (substr(y[1], 1,6) == "CIF.rel") ylab <- "Absolute risk" - } - if (is.null(xlab)) xlab <- "Time from entry" - - ## attributes insurance to pass to lines.survtab - setattr(x, "survtab.meta", l$attrs$survtab.meta) - setattr(x, "class", c("survtab", "data.table", "data.frame")) - - ## plotting ------------------------------------------------------------------ - plot(I(c(min_y,max_y))~I(c(min_x,max_x)), data=x, type="n", - xlab = xlab, ylab = ylab, ...) - - - lines.survtab(x, subset = NULL, y = y, conf.int=conf.int, - col=col, lty=lty, ...) - - -} - - -#' \code{lines} method for survtab objects -#' -#' Plot \code{lines} from a \code{survtab} object -#' -#' @import graphics -#' -#' @author Joonas Miettinen -#' -#' @param x a \code{survtab} output object -#' @param y a variable to plot; a quoted name of a variable -#' in \code{x}; e.g. \code{y = "surv.obs"}; -#' if \code{NULL}, picks last survival variable column in order in \code{x} -#' @param subset a logical condition; \code{obj} is subset accordingly -#' before plotting; use this for limiting to specific strata, -#' e.g. \code{subset = sex == "male"} -#' @param conf.int logical; if \code{TRUE}, also plots any confidence intervals -#' present in \code{obj} for variables in \code{y} -#' @param col line colour passed to \code{matlines} -#' @param lty line type passed to \code{matlines} -#' @param ... additional arguments passed on to to a \code{matlines} call; -#' e.g. \code{lwd} can be defined this way -#' @examples -#' data(sire) -#' data(sibr) -#' si <- rbind(sire, sibr) -#' si$period <- cut(si$dg_date, as.Date(c("1993-01-01", "2004-01-01", "2013-01-01")), right = FALSE) -#' si$cancer <- c(rep("rectal", nrow(sire)), rep("breast", nrow(sibr))) -#' x <- lexpand(si, birth = bi_date, entry = dg_date, exit = ex_date, -#' status = status %in% 1:2, -#' fot = 0:5, aggre = list(cancer, period, fot)) -#' st <- survtab_ag(fot ~ cancer + period, data = x, -#' surv.method = "lifetable", surv.type = "surv.obs") -#' -#' plot(st, "surv.obs", subset = cancer == "breast", ylim = c(0.5, 1), col = "blue") -#' lines(st, "surv.obs", subset = cancer == "rectal", col = "red") -#' -#' ## or -#' plot(st, "surv.obs", col = c(2,2,4,4), lty = c(1, 2, 1, 2)) -#' @export -#' @family survtab functions -lines.survtab <- function(x, y = NULL, subset = NULL, - conf.int = TRUE, col=NULL, lty=NULL, ...) { - Tstop <- NULL ## APPEASE R CMD CHECK - ## prep ---------------------------------------------------------------------- - PF <- parent.frame(1L) - global_breaks <- attr(x, "survtab.meta")$surv.breaks - - subset <- substitute(subset) - subset <- evalLogicalSubset(data = x, subset, enclos = PF) - - - l <- prep_plot_survtab(x = x, y = y, subset = subset, - conf.int = conf.int, enclos = environment()) - x <- l$x - y <- l$y - y.ci <- l$y.ci - y.lo <- l$y.lo - y.hi <- l$y.hi - strata <- l$strata ## character vector of var names - - - ## impute first values (time = 0, surv = 1 / cif = 0) ------------------------ - - is_CIF <- if (substr(y, 1, 3) == "CIF") TRUE else FALSE - setkeyv(x, c(strata, "Tstop")) - first <- x[1, ] - if (length(strata)) first <- unique(x, by = strata) - first[, c(y) := ifelse(is_CIF, 0, 1)] - first$Tstop <- min(global_breaks) - - if (length(y.ci) > 0) first[, (y.ci) := get(y) ] - x <- rbindlist(list(first, x[, ]), use.names = TRUE) - setkeyv(x, c(strata, "Tstop")) - - ## plotting ------------------------------------------------------------------ - - if (is.null(lty)) { - lty <- list(c(1,2,2)) - if (!length(y.ci)) lty <- list(1) - } - - lines_by(x = "Tstop", y = c(y, y.ci), - strata.vars = strata, - data = x, col = col, lty = lty, ...) - - -} - - - -lines_by <- function(x, y, strata.vars = NULL, data, col, lty, ...) { - ## INTENTION: plots lines separately by strata, - ## which may have different colours / linetypes. - ## @param x a variable to plot y by; a character string - ## @param y a character vector of variables to plot by x; - ## e.g. the estimate and confidence interval variables - ## @param strata.vars a character string vector; variables - ## to add lines by, which may have different colours etc for identification - ## @param data a data.frame where x, y, and strata.vars are found - ## @param col a vector of colors passed to lines(); if vector length 1, - ## used for each level of strata. If vector length > 1, - ## has to match to total number of strata. If list, must match - ## to number of strata by length and contain elements of length - ## length(y). - ## @param see col; line type passed to lines(). - ## @param ... other arguments passed on to lines(). - - TF <- environment() - PF <- parent.frame(1L) - - stopifnot(is.data.frame(data)) - stopifnot(is.character(x) && length(x) == 1L) - stopifnot(is.character(y) && length(y) > 0L) - stopifnot(is.character(strata.vars) || is.null(strata.vars)) - all_names_present(data, c(x,y,strata.vars)) - - d <- mget(c(strata.vars, y, x), envir = as.environment(data)) - setDT(d) - setkeyv(d, c(strata.vars, x)) - - ## create list of datas - l <- list(d) - inter <- 1L - if (length(strata.vars)) { - inter <- do.call(interaction, d[, strata.vars, with = FALSE]) - l <- vector("list", uniqueN(inter)) - l <- split(d, f = inter, drop = TRUE) - } - - l <- lapply(l, function(tab) { - setDT(tab) - setcolsnull(tab, keep = c(x, y)) - tab - }) - - - ## figure out colours and ltys - for (objname in c("col", "lty")) { - obj <- TF[[objname]] - - if (missing(obj) || !length(obj)) obj <- 1 - if (!length(obj) %in% c(1, length(l))) { - stop("Argument ", objname, " is not of length 1 or ", - "of length equal to total number of strata (", - length(l), ").") - } - - ol <- unlist(lapply(obj, length)) - if (length(y) > 1 && is.list(obj) && !all(ol %in% c(1, length(y)))) { - stop("Argument y is of length > 1, and you passed ", - objname, " as a list of values, but at least one element is not ", - "of length 1 or length(y).") - } - - ## NOTE: rep works for vector and list just the same - if (length(obj) == 1) obj <- rep(obj, length(l)) - obj <- as.list(obj) - - assign(x = objname, value = obj) - } - - lapply(seq_along(l), function(i) { - - tab <- l[[i]] - cols <- col[[i]] - ltys <- lty[[i]] - - matlines(x = tab[[x]], y = tab[, y, with = FALSE], - col = cols, lty = ltys, ...) - - }) - - invisible(NULL) -} - - - - -#' @title Graphically Inspect Curves Used in Mean Survival Computation -#' @description Plots the observed (with extrapolation) and expected survival -#' curves for all strata in an object created by \code{\link{survmean}} -#' @author Joonas Miettinen -#' @param x a \code{survmean} object -#' @param ... arguments passed (ultimately) to \code{matlines}; you -#' may, therefore, supply e.g. \code{xlab} through this, though arguments -#' such as \code{lty} and \code{col} will not work -#' @details -#' -#' For examples see \code{\link{survmean}}. This function is intended only -#' for graphically inspecting that the observed survival curves with extrapolation -#' and the expected survival curves have been sensibly computed in \code{survmean}. -#' -#' If you want finer control over the plotted curves, extract the curves from -#' the \code{survmean} output using -#' -#' \code{attr(x, "curves")} -#' -#' where \code{x} is a \code{survmean} object. -#' @export -#' @family survmean functions -plot.survmean <- function(x, ...) { - at <- attr(x, "survmean.meta") - curves <- at$curves - if (is.null(curves)) { - stop("no curves information in x; sometimes lost if x ", - "altered after using survmean") - } - - by.vars <- at$tprint - by.vars <- c(by.vars, at$tadjust) - by.vars <- intersect(by.vars, names(curves)) - if (!length(by.vars)) by.vars <- NULL - - plot(curves$surv ~ curves$Tstop, type="n", - xlab = "Time from entry", ylab = "Survival") - lines.survmean(x, ...) - - subr <- at$breaks[[at$survScale]] - abline(v = max(subr), lty=2, col="grey") - - if (length(by.vars)) { - ## add legend denoting colors - Stratum <- curves[, unique(interaction(.SD)), .SDcols = eval(by.vars)] - legend(x = "topright", legend = Stratum, col = seq_along(Stratum), lty = 1) - } - -} - -#' @title Graphically Inspect Curves Used in Mean Survival Computation -#' @description Plots the observed (with extrapolation) and expected survival -#' curves for all strata in an object created by \code{\link{survmean}} -#' @author Joonas Miettinen -#' @param x a \code{survmean} object -#' @param ... arguments passed (ultimately) to \code{matlines}; you -#' may, therefore, supply e.g. \code{lwd} through this, though arguments -#' such as \code{lty} and \code{col} will not work -#' @details -#' -#' This function is intended to be a workhorse for \code{\link{plot.survmean}}. -#' If you want finer control over the plotted curves, extract the curves from -#' the \code{survmean} output using -#' -#' \code{attr(x, "curves")} -#' -#' where \code{x} is a \code{survmean} object. -#' @export -#' @family survmean functions -lines.survmean <- function(x, ...) { - at <- copy(attr(x, "survmean.meta")) - curves <- at$curves - if (is.null(curves)) stop("no curves information in x; usually lost if x altered after using survmean") - - by.vars <- at$tprint - by.vars <- c(by.vars, at$tadjust) - by.vars <- c("survmean_type", by.vars) - by.vars <- intersect(by.vars, names(curves)) - if (!length(by.vars)) by.vars <- NULL - - curves <- data.table(curves) - setkeyv(curves, c(by.vars, "Tstop")) - - type_levs <- length(levels(interaction(curves[, c(by.vars), with=FALSE])))/2L - other_levs <- 1L - if (length(by.vars) > 1) { - other_levs <- length(levels(interaction(curves[, setdiff(by.vars, "survmean_type"), with=FALSE]))) - } - - curves <- cast_simple(curves, columns = by.vars, rows = "Tstop", values = "surv") - matlines(x=curves$Tstop, y=curves[, setdiff(names(curves), "Tstop"), with=FALSE], - lty = rep(1:2, each=type_levs), col = 1:other_levs, ...) -} - - - - - - - -#' @export -getCall.survtab <- function(x, ...) { - attributes(x)$survtab.meta$call -} - - -#' @export -formula.survtab <- function(x, ...) { - attr(x, "survtab.meta")$arguments$formula -} - - - - - -#' @export -getCall.survmean <- function(x, ...) { - attributes(x)$survmean.meta$call -} - - -#' @export -formula.survmean <- function(x, ...) { - attr(x, "survmean.meta")$formula -} - - - + + +#' @export +print.sir <- function(x, subset = NULL, ...) { + + at <- attributes(x)$sir.meta + PF <- parent.frame(1L) + subset <- evalLogicalSubset(x, substitute(subset), enclos = PF) + x <- x[subset, ] + setDT(x) + + t1 <- paste0("SIR (adjusted by ", paste(at$adjust, collapse = ', '),')', + ' with ', at$conf.level*100, '% ', 'confidence intervals (', at$conf.type,')') + + + # cat + t3 <- paste0(' Total sir: ', round(at$pooled.sir$sir,2),' (', + round(at$pooled.sir$sir.lo,2),'-', round(at$pooled.sir$sir.hi, 2),')\n', + ' Total observed: ', at$pooled.sir$observed, '\n', + ' Total expected: ', round(at$pooled.sir$expected,2), '\n', + ' Total person-years: ', round(at$pooled.sir$pyrs)) + + rv <- intersect(names(x), c('sir','sir.lo','sir.hi','observed','expected','pyrs')) + if (length(rv)) { + x[, (rv) := lapply(.SD, round, digits = 2L), .SDcols = rv] + } + + rv <- intersect(names(x), c('p_value')) + if (length(rv)) { + x[, (rv) := lapply(.SD, round, digits = 4), .SDcols = rv] + } + + + if(is.null(at$lrt.test)) { + d <- paste("Could not test", at$lrt.test.type) + } else { + if(at$lrt.test.type == 'homogeneity') { + d <- paste("Test for homogeneity: p", p.round( c(at$lrt.test))) + } + if(at$lrt.test.type == 'trend') { + d <- paste("Test for trend: p", p.round( c(at$lrt.test))) + } + } + #b <- round(c(ta$total$sir, ta$total$sir.lo, ta$total$sir.hi), 2) + # cat('\n',"Total observed", ta$total$observed, '\n', + # "Total expected:", ta$total$expected, '\n', + # "SIR:", paste0(b[1], ' (',b[2], '-',b[3],')'), '\n', + # "Person-years:", ta$total$pyrs, '\n', + # fill=TRUE) + + cat(t1, '\n') + if(x[,.N] > 1) { + cat(d, '\n') + } + cat(fill=TRUE) + cat(t3, '\n', fill=TRUE) + + print(data.table(x), ...) + return(invisible()) +} + +#' @export +`[.sir` <- function(x, ...) { + y <- NextMethod() + if (is.data.frame(y)) { + setattr(y, "class", class(x)) + setattr(y, "sir.meta", attr(x, "sir.meta")) + } + y +} + + +#' @import grDevices +#' @export +print.sirspline <- function(x, ...) { + if ( x$spline.dependent ) { + if( any( !is.na(x$p.values))) { + cat( 'global p-value:', p.round(x$p.values[1]),'\n' ) + cat( 'level p-value:', p.round(x$p.values[2]) , fill= TRUE) + } else { + cat( 'No models compared.', fill= TRUE) + } + cat('---', '\n') + cat('Colour codes:', '\n', fill=TRUE) + } else { + + for(i in 1:length(x$p.values)) { + cat( x$spline[i] ,': p ', p.round( x$p.values[[i]] ), '\n', sep = '') + } + cat(fill=TRUE) + + } + # Print colour codes: + cols <- unique(x$spline.est.A[,1]) + col.length <- length(cols) + print( data.frame(levels = cols, colour = palette()[1:col.length]), include.rownames = FALSE) + + # Print p-values + return(invisible()) +} + + +#' Plot method for sir-object +#' +#' Plot SIR estimates with error bars +#' +#' @seealso \code{\link{sir}}, \code{\link{sirspline}} +#' +#' @import graphics +#' +#' @author Matti Rantanen +#' +#' @param x an object returned by function \code{sir} +#' @param conf.int default TRUE draws confidence intervals +#' @param xlab overwrites default x-axis label +#' @param ylab overwrites default y-axis label +#' @param xlim x-axis minimum and maximum values +#' @param main optional plot title +#' @param abline logical; draws a grey line in SIR = 1 +#' @param log logical; SIR is not in log scale by default +#' @param eps error bar vertical bar height (works only in 'model' or 'univariate') +#' @param left.margin adjust left marginal of the plot to fit long variable names +#' @param ... arguments passed on to plot(), segment and lines() +#' +#' +#' @details Plot SIR estimates and confidence intervals +#' \itemize{ +#' \item univariate - plots SIR with univariate confidence intervals +#' \item model - plots SIR with Poisson modelled confidence intervals +#' } +#' +#' \strong{Customize} +#' Normal plot parameters can be passed to \code{plot}. These can be a vector when plotting error bars: +#' \itemize{ +#' \item \code{pch} - point type +#' \item \code{lty} - line type +#' \item \code{col} - line/point colour +#' \item \code{lwd} - point/line size +#' } +#' +#' \strong{Tips for plotting splines} +#' It's possible to use \code{plot} to first draw the +#' confidence intervals using specific line type or colour and then plotting +#' again the estimate using \code{lines(... , conf.int = FALSE)} with different +#' settings. This works only when \code{plot.type} is 'splines'. +#' +#' +#' @examples +#' \dontrun{ +#' # Plot SIR estimates +#'# plot(sir.by.gender, col = c(4,2), log=FALSE, eps=0.2, lty=1, lwd=2, pch=19, +#'# main = 'SIR by gender', abline=TRUE) +#' } +#' @export + +plot.sir <- function(x, conf.int = TRUE, ylab, xlab, xlim, main, + eps=0.2, abline = TRUE, log = FALSE, left.margin, ...) { + + a <- data.table(x) + at <- attributes(x)$sir.meta + level_names <- at$print + + if(is.null(level_names)) { + levels <- 'Crude' + level_names <- levels + } + else { + q <- paste0('paste(', paste(level_names, collapse=', '),', sep = ":")' ) + q <- parse(text = q) + levels <- a[, eval(q)] + } + + # predefined parameters + if( missing(main) ){ + main <- NA + } + if( missing(xlab) ){ + xlab <- 'SIR' + } + if( missing(ylab) ){ + ylab <- NA + } + if( missing(xlim) ) { + xlimit <- c(min(a$sir.lo[a$sir.lo 1) { + save_par <- par(no.readonly = TRUE) + par(mfrow=c(1,sum(plotdim))) + type <- 'l' + } + + ## set labels + if ( missing(xlab) ) { + xlab <- x$spline + } + + if ( missing(ylab) ) { + ylab <- rep('SIR',sum(plotdim)) + if(log){ + ylab <- rep('log(SIR)', sum(plotdim)) + } + if(x$spline.dependent & sum(plotdim) > 1) { + ylab <- c(ylab[1], paste(ylab[2:sum(plotdim)], 'ratio')) + } + } + else{ + if( length(ylab) < sum(plotdim)) + ylab <- rep(ylab, sum(plotdim)) + if(length(ylab) > sum(plotdim)) { + warning('set ylabs in a vector length of num of plots (',sum(plotdim),')') + } + } + + ## set scale + if(!is.logical(log)) stop('log should be a logical value.') + log.bin <- ifelse(log, 'y', '') + + ## remove infinite values + #rm_inf <- function(est){ + # x[[est]][ is.finite(x[[est]][[2]]) & is.finite(x[[est]][[3]]) & is.finite(x[[est]][[4]]), ] + #} + + spl <- c('spline.seq.A', 'spline.seq.B', 'spline.seq.C')[1:sum(plotdim)] + est <- gsub("seq", "est", spl) + + for (i in 1:sum(plotdim)) { # age, per, fot, + # empty plot + max_x <- range(x[[spl[i]]]) + max_y <- range( x[[est[i]]][, 2:4] ) + plot(max_x, max_y, type = 'n', ylab = ylab[i], xlab = xlab[i], log = log.bin, ...) + if(abline) abline(h = 1) + + # plot lines + if (missing(type) || type != 'n') { + lines.sirspline(x, conf.int = conf.int, select.spline = i, ...) + } + } + if(sum(plotdim) > 1) { + par(save_par) + } + return(invisible()) +} + + + +#' @title lines method for sirspline-object +#' @description Plot SIR spline lines with R base graphics +#' +#' +#' @author Matti Rantanen +#' +#' @param x an object returned by function sirspline +#' @param conf.int logical; default TRUE draws also the 95 confidence intervals +#' @param print.levels name(s) to be plotted. Default plots all levels. +#' @param select.spline select which spline variable (a number or a name) is plotted. +#' @param ... arguments passed on to lines() +#' +#' @details In \code{lines.sirspline} most of graphical parameters is user +#' adjustable. +#' Desired spline variable can be selected with \code{select.spline} and only one +#' can be plotted at a time. The spline variable can include +#' several levels, e.g. gender (these are the levels of \code{print} +#' from \code{sirspline}). All levels are printed by default, but a +#' specific level can be selected using argument +#' \code{print.levels}. Printing the levels separately enables e.g. to +#' give different colours for each level. +#' +#' @family sir functions +#' +#' @import graphics +#' @export +lines.sirspline <- function(x, conf.int = TRUE, print.levels = NA, select.spline, ... ){ + ## input: sirspline object, with only one spline var (spline.est.A) + ## input: print levels can be > 1. + + ## subset splines + if( length(x$spline) > 1 ) { + if ( missing(select.spline) ) { + stop(paste('select what spline to plot in select.spline:', paste(x$spline, collapse = ', '))) + } + else { + if(is.numeric(select.spline)) { + k <- select.spline + } + else { + k <- which(x$spline == select.spline) + } + if(length(k) == 0 | length(x$spline) < k) stop('select.spline name/number is incorrect') + } + } + else { + k <- 1 + } + + spl <- c('spline.seq.A', 'spline.seq.B', 'spline.seq.C')[k] + est <- gsub("seq", "est", spl) + + ## remove infinite values + # x[[h]] <- rm_inf(est=h) + + # get print levels + if(missing(print.levels)) { + print.levels <- NA + } + pl <- unique(x$spline.est.A[,1]) + if(any( is.null(print.levels), is.na(print.levels))) { + print.levels <- pl + } + pl <- pl[ pl %in% print.levels] + + ## get conf.int + if( !is.logical(conf.int) ) stop('conf.int is not logical') + n <- c(2,4)[c(!conf.int, conf.int)] + + + ## draw lines + for( l in pl ){ + # loop through print.levels + index <- which(x$spline.est.A$i == l) + + for(m in 2:n) { + # loop through estiamte and confidence intervals + lines(x = x[[spl]], y = x[[est]][index, m], ...) + } + } +} + +#' @title Print an rate object +#' @author Matti Rantanen +#' @description Print method function for \code{rate} objects; see +#' \code{\link{rate}}. +#' @param x an \code{rate} object +#' @param subset a logical condition to subset results table by +#' before printing; use this to limit to a certain stratum. E.g. +#' \code{subset = sex == "female"} +#' @param ... arguments for data.tables print method, e.g. row.names = FALSE suppresses row numbers. +#' @export +print.rate <- function(x, subset = NULL, ...) { + + ra <- attributes(x)$rate.meta + PF <- parent.frame(1L) + TF <- environment() + subset <- evalLogicalSubset(x, substitute(subset), enclos = PF) + x <- x[subset, ] + + # pre texts: + cat('\n') + if(!is.null(ra$adjust)){ + if(is.character(ra$weights)) { + a <- paste(ra$weights, collapse = ',') + } + if(all(is.numeric(ra$weights))) { + a <- length(ra$weights) + } + if(is.list(ra$weights)) { + a <- sapply(ra$weights, length) + } + + b <- paste(ra$adjust,a, collapse = ', ', sep = '; ') + cat('Adjusted rates (', b,') ', sep = '') + } + else{ + cat('Crude rates ') + } + cat('and', '95%', 'confidence intervals:', fill=TRUE) + cat('\n') + # table itself + + + setDT(x) + print(x, ...) +} + + +#' @title plot method for rate object +#' @description Plot rate estimates with confidence intervals lines using R base graphics +#' @author Matti Rantanen +#' +#' @param x a rate object (see \code{\link{rate}}) +#' @param conf.int logical; default TRUE draws the confidence intervals +#' @param eps is the height of the ending of the error bars +#' @param left.margin set a custom left margin for long variable names. Function +#' tries to do it by default. +#' @param xlim change the x-axis location +#' @param ... arguments passed on to graphical functions points and segment +#' (e.g. \code{col}, \code{lwd}, \code{pch} and \code{cex}) +#' +#' @details This is limited explanatory tool but most graphical +#' parameters are user adjustable. +#' +#' @import graphics +#' @export +plot.rate <- function(x, conf.int = TRUE, eps = 0.2, left.margin, xlim, ...) { + + ra <- attributes(x)$rate.meta + varcol <- ra$print + + if(is.null(varcol)) { + lvl.name <- 'Crude' + } + else { + pp <- paste0('paste(', paste(varcol, collapse=','),',sep = ":")') + q <- parse(text=pp) + lvl.name <- x[,eval(q)] + } + lvls <- 1:length(lvl.name) + + # WHICH RATE: + if('rate.adj' %in% names(x)) { + r <- x$rate.adj + hi <- x$rate.adj.hi + lo <- x$rate.adj.lo + } + else { + r <- x$rate + hi <- x$rate.hi + lo <- x$rate.lo + } + # X-AXIS LIMITs + if(missing(xlim)) { + t <- range(na.omit(c(lo , r, hi))) + t0 <- (t[2]-t[1])/4 + xlimit <- c(pmax(t[1]-t0, 0), t[2] + t0) + } + else { + xlimit <- xlim + } + + # MARGINS + if(missing(left.margin)) { + old.margin <- new.margin <- par("mar") + new.margin[2] <- 4.1 + sqrt( max(nchar(as.character(lvl.name))) )*2 + } + else { + new.margin[2] <- left.margin + } + par(mar = new.margin) + + plot(c(xlimit), c(min(lvls)-0.5, max(lvls)+0.5), type='n', yaxt = 'n', ylab = '', xlab='') + axis(side = 2, at = lvls, labels = lvl.name, las = 1) + points(r, lvls, ...) + + if(conf.int) { + segments(lo, lvls, hi, lvls, ...) + segments(lo, lvls - eps, lo, lvls + eps, ...) + segments(hi, lvls - eps, hi, lvls + eps, ...) + } + par(mar = old.margin) # revert margins +} + + +#' @export +print.yrs <- function(x, ...) { + print(as.numeric(x)) +} + + +#' @export +`[.yrs` <- function(x, ...) { + yl <- attr(x, "year.length") + structure(NextMethod(), year.length = yl, class = c("yrs", "numeric")) +} + + +#' @export +`[.aggre` <- function(x, ...) { + xa <- attributes(x) + y <- NextMethod() + if (is.data.frame(y)) { + setattr(y, "class", xa$class) + setattr(y, "aggre.meta", xa$aggre.meta) + setattr(y, "breaks", xa$breaks) + } + y +} + +#' @export +subset.aggre <- function(x, ...) { + y <- NextMethod() + if (is.data.frame(y)) { + setattr(y, "class", class(x)) + setattr(y, "aggre.meta", attr(x, "aggre.meta")) + setattr(y, "breaks", attr(x, "breaks")) + } + y +} + +preface_survtab.print <- function(x) { + surv.int <- NULL ## APPEASE R CMD CHECK + at <- attributes(x)$survtab.meta + arg <- at$arguments + + cat("\n") + cat("Call: \n", oneWhitespace(deparse(at$call)), "\n") + cat("\n") + cat("Type arguments: \n surv.type:", as.character(arg$surv.type), + "--- surv.method:", as.character(arg$surv.method)) + if (as.character(arg$surv.type) == "surv.rel") + cat(" --- relsurv.method:", as.character(arg$relsurv.method)) + cat("\n \n") + cat("Confidence interval arguments: \n level:", + as.character(arg$conf.level*100), "%") + cat(" --- transformation:", + as.character(arg$conf.type)) + cat("\n \n") + cat("Totals:") + totCat <- paste0("\n person-time:", round(sum(x$pyrs))) + if (arg$surv.method == "lifetable") { + totCat <- paste0("\n at-risk at T=0: ", round(sum(x[surv.int == 1L]$n))) + } + + cat(totCat) + cat(" --- events:", sum(x$d)) + cat("\n \n") + if (length(at$print.vars) > 0L) { + cat("Stratified by:", paste0("'", at$print.vars, "'", collapse = ", ")) + if (length(at$adjust.vars) > 0L) cat(" --- ") + } + if (length(at$adjust.vars) > 0L) { + cat("Adjusted by:", paste0("'", at$adjust.vars, "'", collapse = ", ")) + } + cat("\n") + invisible() +} + + +#' @title Print an \code{aggre} Object +#' @author Joonas Miettinen +#' @description Print method function for \code{aggre} objects; see +#' \code{\link{as.aggre}} and \code{\link{aggre}}. +#' @param x an \code{aggre} object +#' @param subset a logical condition to subset results table by +#' before printing; use this to limit to a certain stratum. E.g. +#' \code{subset = sex == "male"} +#' @param ... arguments passed to \code{print.data.table}; try e.g. +#' \code{top = 2} for numbers of rows in head and tail printed +#' if the table is large, +#' \code{nrow = 100} for number of rows to print, etc. +#' @export +print.aggre <- function(x, subset = NULL, ...) { + + PF <- parent.frame(1L) + TF <- environment() + sa <- attributes(x)$aggre.meta + + subset <- evalLogicalSubset(x, substitute(subset), enclos = PF) + x <- x[subset, ] + setDT(x) + + print(x, ...) + +} + +#' @title Summarize an \code{aggre} Object +#' @author Joonas Miettinen +#' @description \code{summary} method function for \code{aggre} objects; see +#' \code{\link{as.aggre}} and \code{\link{aggre}}. +#' @param object an \code{aggre} object +#' @param by list of columns to summarize by - e.g. \code{list(V1, V2)} +#' where \code{V1} and \code{V2} are columns in the data. +#' @param subset a logical condition to subset results table by +#' before summarizing; use this to limit to a certain stratum. E.g. +#' \code{subset = sex == "male"} +#' @param ... unused +#' @export +#' @family aggregation functions +summary.aggre <- function(object, by = NULL, subset = NULL, ...) { + + PF <- parent.frame(1L) + TF <- environment() + x <- object + sa <- attributes(x)$aggre.meta + + subset <- evalLogicalSubset(x, substitute(subset), enclos = PF) + x <- x[subset, ] + setDT(x) + + bys <- substitute(by) + bye <- evalPopArg(x, bys, enclos = environment(), types = c("list", "NULL")) + + vals <- sa$values + vals <- intersect(names(x), vals) + if (!length(vals)) { + cat("No originally created value columns appear to be left in data.") + } + r <- x[, lapply(.SD, sum), by = eval(bye), .SDcols = vals] + r +} + +#' @title Print a survtab Object +#' @author Joonas Miettinen +#' @description Print method function for \code{survtab} objects; see +#' \code{\link{survtab_ag}}. +#' @param x a \code{survtab} object +#' @param subset a logical condition to subset results table by +#' before printing; use this to limit to a certain stratum. E.g. +#' \code{subset = sex == "male"} +#' @param ... arguments passed to \code{print.data.table}; try e.g. +#' \code{top = 2} for numbers of rows in head and tail printed +#' if the table is large, +#' \code{nrow = 100} for number of rows to print, etc. +#' @export +#' @family survtab functions +print.survtab <- function(x, subset = NULL, ...) { + + Tstart <- Tstop <- NULL ## APPEASE R CMD CHECK + + PF <- parent.frame(1L) + TF <- environment() + sa <- attributes(x)$survtab.meta + + subset <- evalLogicalSubset(x, substitute(subset), enclos = PF) + x <- x[subset, ] + + preface_survtab.print(x) + + setDT(x) + + if (nrow(x) == 0L) { + print(x) + return(invisible()) + } + + pv <- as.character(sa$print.vars) + if (length(pv) == 0L) pv <- NULL + + magicMedian <- function(x) { + if (length(x) %% 2L == 0L) median(x[-1L], na.rm = TRUE) else + median(x, na.rm = TRUE) + } + + ## to avoid e.g. 'factor(V1, 1:2)' going bonkers + pv_orig <- pv + if (length(pv) > 0L) { + pv <- makeTempVarName(x, pre = paste0("print_", 1:length(pv))) + setnames(x, pv_orig, pv) + } + + medmax <- x[, list(Tstop = c(magicMedian(c(min(Tstart),Tstop)), max(Tstop))), keyby = eval(pv)] + + setkeyv(medmax, c(pv, "Tstop")) + setkeyv(x, c(pv, "Tstop")) + x <- x[medmax] + + rv <- intersect(names(x), c(sa$est.vars, sa$CI.vars, sa$misc.vars)) + if (length(rv)) { + x[, (rv) := lapply(.SD, round, digits = 4L), .SDcols = rv] + } + + sv <- intersect(names(x), sa$SE.vars) + if (length(sv > 0L)) { + x[, c(sv) := lapply(.SD, signif, digits = 4L), .SDcols = sv] + } + + + setcolsnull(x, keep = c(pv, "Tstop", sa$surv.vars), colorder = TRUE) + if (length(pv)) setnames(x, pv, pv_orig) + print(data.table(x), ...) + invisible() +} + +#' @title Summarize a survtab Object +#' @author Joonas Miettinen +#' @description Summary method function for \code{survtab} objects; see +#' \code{\link{survtab_ag}}. Returns estimates at given time points +#' or all time points if \code{t} and \code{q} are both \code{NULL}. +#' @param object a \code{survtab} object +#' @param t a vector of times at which time points (actually intervals that +#' contain t) to print summary table of survival function estimates by strata; +#' values not existing in any interval cause rows containing only \code{NAs} to +#' be returned. +#' @param q a named \code{list} of quantiles to include in returned data set, +#' where names must match to estimates in \code{object}; +#' returns intervals where the quantiles are reached first; +#' e.g. \code{list(surv.obs = 0.5)} finds the interval where \code{surv.obs} +#' is 0.45 and 0.55 at the beginning and end of the interval, respectively; +#' returns rows with \code{NA} values for quantiles not reached in estimates +#' (e.g. if \code{q = list(surv.obs = 0.5)} but lowest estimate is 0.6); +#' see Examples. +#' @param subset a logical condition to subset results table by +#' before printing; use this to limit to a certain stratum. E.g. +#' \code{subset = sex == "male"} +#' @param ... unused; required for congruence with other \code{summary} methods +#' +#' @details +#' Note that this function returns the intervals and NOT the time points +#' corresponding to quantiles / estimates corresponding to time points. +#' If you want precise estimates at time points that are not interval breaks, +#' add the time points as breaks and re-estimate the survival time function. +#' In interval-based estimation, the estimates denote e.g. probability of +#' dying \emph{during} the interval, so time points within the intervals +#' are not usually considered at all. See e.g. Seppa, Dyba, and Hakulinen +#' (2015). +#' +#' @references +#' Seppa K., Dyba T. and Hakulinen T.: Cancer Survival, +#' Reference Module in Biomedical Sciences. Elsevier. 08-Jan-2015 +#' doi: 10.1016/B978-0-12-801238-3.02745-8. +#' +#' @examples +#' +#' library(Epi) +#' library(survival) +#' +#' ## NOTE: recommended to use factor status variable +#' x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), +#' exit = list(CAL = get.yrs(ex_date)), +#' data = sire[sire$dg_date < sire$ex_date, ], +#' exit.status = factor(status, levels = 0:2, +#' labels = c("alive", "canD", "othD")), +#' merge = TRUE) +#' ## pretend some are male +#' set.seed(1L) +#' x$sex <- rbinom(nrow(x), 1, 0.5) +#' ## observed survival +#' st <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, +#' surv.type = "cif.obs", +#' breaks = list(FUT = seq(0, 5, 1/12))) +#' +#' ## estimates at full years of follow-up +#' summary(st, t = 1:5) +#' +#' ## interval estimate closest to 75th percentile, i.e. +#' ## first interval where surv.obs < 0.75 at end +#' ## (just switch 0.75 to 0.5 for median survival, etc.) +#' summary(st, q = list(surv.obs = 0.75)) +#' ## multiple quantiles +#' summary(st, q = list(surv.obs = c(0.75, 0.90), CIF_canD = 0.20)) +#' +#' ## if you want all estimates in a new data.frame, you can also simply do +#' +#' x <- as.data.frame(st) +#' +#' @export +#' @family survtab functions +summary.survtab <- function(object, t = NULL, subset = NULL, q = NULL, ...) { + + PF <- parent.frame(1L) + at <- copy(attr(object, "survtab.meta")) + subr <- copy(at$surv.breaks) + + if (!is.null(t) && !is.null(q)) { + stop("Only supply either t or q.") + } + + sb <- substitute(subset) + subset <- evalLogicalSubset(object, sb, enclos = PF) + x <- object[subset, ] + + ## to avoid e.g. 'factor(V1, 1:2)' going bonkers + pv_orig <- pv <- at$print.vars + if (length(pv) > 0L) { + pv <- makeTempVarName(x, pre = paste0("print_", 1:length(pv))) + setnames(x, pv_orig, pv) + } + + setDT(x) + + ## quantile detection -------------------------------------------------------- + if (!is.null(q)) { + bn <- setdiff(names(q), at$est.vars) + if (length(bn) > 0L) { + stop("No survival time function estimates named ", + paste0("'", bn, "'", collapse = ", "), + " found in supplied survtab object. Available ", + "survival time function estimates: ", + paste0("'", at$est.vars, "'", collapse = ", ")) + } + + lapply(q, function(x) { + if (min(x <= 0L) || max(x >= 1L)) { + stop("Quantiles must be expressed as numbers between 0 and 1, ", + "e.g. surv.obs = 0.5.") + } + }) + + + m <- x[, .SD[1, ], keyby = eval(pv)][, c(pv, "Tstop"), with = FALSE] + setDF(m) + + rollVars <- makeTempVarName(x, pre = names(q)) + x[, c(rollVars) := lapply(.SD, copy), .SDcols = names(q)] + + m <- lapply(seq_along(q), function(i) { + m <- merge(m, q[[i]]) + setnames(m, "y", rollVars[i]) + if (length(pv)) setorderv(m, pv) + m[, c(pv, rollVars[i]), drop = FALSE] + }) + names(m) <- names(q) + + l <- vector("list", length(q)) + names(l) <- names(q) + for (k in names(q)) { + + l[[k]] <- setDT(x[m[[k]], on = names(m[[k]]), roll = 1L]) + + } + l <- rbindlist(l) + set(l, j = rollVars, value = NULL) + if (length(pv)) setkeyv(l, pv) + x <- l + } + + ## time point detection ------------------------------------------------------ + + if (!is.null(t)) { + + tcutv <- makeTempVarName(x, pre = "cut_time_") + + set(x, j = tcutv, value = cut(x$Tstop, breaks = subr, right = T, + include.lowest = F)) + cutt <- cut(t, breaks = subr, right = T, include.lowest = F) + + l <- list(cutt) + names(l) <- tcutv + if (length(pv)) { + pvdt <- setDF(unique(x, by = pv))[, pv, drop = FALSE] + l <- setDT(merge(pvdt, as.data.frame(l))) + setkeyv(l, pv) + } + + x <- x[l, on = c(pv, tcutv)] + + set(x, j = tcutv, value = NULL) + if (length(pv)) setkeyv(x, pv) + } + + + ## final touches ------------------------------------------------------------- + if (length(pv) > 0L) setnames(x, pv, pv_orig) + + if (!return_DT()) setDFpe(x) + + x +} + +#' @export +`[.survtab` <- function(x, ...) { + y <- NextMethod() + if (is.data.frame(y)) { + setattr(y, "class", class(x)) + setattr(y, "survtab.meta", attr(x, "survtab.meta")) + } + y +} + +#' @export +subset.survtab <- function(x, ...) { + y <- NextMethod() + if (is.data.frame(y)) { + setattr(y, "class", class(x)) + setattr(y, "survtab.meta", attr(x, "survtab.meta")) + } + y +} + + +#' @export +`[.survmean` <- function(x, ...) { + y <- NextMethod() + if (is.data.frame(y)) { + setattr(y, "class", class(x)) + setattr(y, "survmean.mean", attr(x, "survmean.mean")) + } + y +} + + +#' @export +subset.survmean <- function(x, ...) { + y <- NextMethod() + if (is.data.frame(y)) { + setattr(y, "class", class(x)) + setattr(y, "survmean.mean", attr(x, "survmean.mean")) + } + y +} + +#' @export +`[.rate` <- function(x, ...) { + y <- NextMethod() + if (is.data.frame(y)) { + setattr(y, "class", class(x)) + setattr(y, "rate.meta", attr(x, "rate.meta")) + } + y +} + +#' @export +subset.rate <- function(x, ...) { + y <- NextMethod() + if (is.data.frame(y)) { + setattr(y, "class", class(x)) + setattr(y, "rate.meta", attr(x, "rate.meta")) + } + y +} + + + + + +prep_plot_survtab <- function(x, + y = NULL, + subset = NULL, + conf.int = TRUE, + enclos = parent.frame(1L), + ...) { + + ## subsetting ---------------------------------------------------------------- + subset <- evalLogicalSubset(data = x, substiset = substitute(subset), + enclos = environment()) + + attrs <- attributes(x) + + if (!inherits(x, "survtab")) stop("x is not a survtab object") + if (is.null(attrs$survtab.meta)) { + stop("Missing meta information (attributes) in survtab object; ", + "have you tampered with it after estimation?") + } + strata.vars <- attrs$survtab.meta$print.vars + x <- copy(x) + setDT(x) + x <- x[subset, ] + + ## detect survival variables in data ----------------------------------------- + surv_vars <- c("surv.obs","CIF.rel","CIF_","r.e2","r.pp") + wh <- NULL + for (k in surv_vars) { + wh <- c(wh, which(substr(names(x), 1, nchar(k)) == k)) + } + surv_vars <- names(x)[wh] + surv_vars <- surv_vars[!substr(surv_vars, nchar(surv_vars)-1, nchar(surv_vars)) %in% c("hi","lo")] + if (length(surv_vars) == 0) { + stop("x does not appear to have any survival variables; ", + "did you tamper with it after estimation?") + } + + + ## getting y ----------------------------------------------------------------- + if (!is.null(y)) { + if (!is.character(y)) { + stop("please supply y as a character string indicating ", + "the name of a variable in x") + } + if (length(y) > 1) stop("y must be of length 1 or NULL") + if (!all_names_present(x, y, stops = FALSE)) { + stop("Given survival variable in argument 'y' ", + "not present in survtab object ('", y, "')") + } + } else { + y <- surv_vars[length(surv_vars)] + if (length(surv_vars) > 1L) message("y was NULL; chose ", y, " automatically") + } + rm(surv_vars) + + if (substr(y, 1, 3) == "CIF" && conf.int) { + stop("No confidence intervals currently supported for CIFs. ", + "Hopefully they will be added in a future version; ", + "meanwhile use conf.int = FALSE when plotting CIFs.") + } + + + ## confidence intervals ------------------------------------------------------ + y.lo <- y.hi <- y.ci <- NULL + if (conf.int) { + + y.lo <- paste0(y, ".lo") + y.hi <- paste0(y, ".hi") + y.ci <- c(y.lo, y.hi) + + badCIvars <- setdiff(y.ci, names(x)) + if (sum(length(badCIvars))) { + stop("conf.int = TRUE, but missing confidence interval ", + "variables in data for y = '", y, "' (could not detect ", + "variables named", paste0("'", badCIvars, "'", collapse = ", ") ,")") + } + + } + + list(x = x, y = y, y.ci = y.ci, y.lo = y.lo, y.hi = y.hi, + strata = strata.vars, attrs = attrs) + +} + + + + + +#' \code{plot} method for survtab objects +#' +#' Plotting for \code{survtab} objects +#' +#' @import graphics +#' +#' @author Joonas Miettinen +#' +#' @param x a \code{survtab} output object +#' @param y survival a character vector of a variable names to plot; +#' e.g. \code{y = "r.e2"} +#' @param subset a logical condition; \code{obj} is subset accordingly +#' before plotting; use this for limiting to specific strata, +#' e.g. \code{subset = sex == "male"} +#' @param conf.int logical; if \code{TRUE}, also plots any confidence intervals +#' present in \code{obj} for variables in \code{y} +#' @param col line colour; one value for each stratum; will be recycled +#' @param lty line type; one value for each stratum; will be recycled +#' @param ylab label for Y-axis +#' @param xlab label for X-axis +#' @param ... additional arguments passed on to \code{plot} and +#' \code{lines.survtab}; e.g. \code{ylim} can be defined this way +#' @examples +#' data(sire) +#' data(sibr) +#' si <- rbind(sire, sibr) +#' si$period <- cut(si$dg_date, as.Date(c("1993-01-01", "2004-01-01", "2013-01-01")), right = FALSE) +#' si$cancer <- c(rep("rectal", nrow(sire)), rep("breast", nrow(sibr))) +#' x <- lexpand(si, birth = bi_date, entry = dg_date, exit = ex_date, +#' status = status %in% 1:2, +#' fot = 0:5, aggre = list(cancer, period, fot)) +#' st <- survtab_ag(fot ~ cancer + period, data = x, +#' surv.method = "lifetable", surv.type = "surv.obs") +#' +#' plot(st, "surv.obs", subset = cancer == "breast", ylim = c(0.5, 1), col = "blue") +#' lines(st, "surv.obs", subset = cancer == "rectal", col = "red") +#' +#' ## or +#' plot(st, "surv.obs", col = c(2,2,4,4), lty = c(1, 2, 1, 2)) +#' @export +#' @family survtab functions +plot.survtab <- function(x, y = NULL, subset=NULL, conf.int=TRUE, col=NULL,lty=NULL, ylab = NULL, xlab = NULL, ...) { + + Tstop <- delta <- NULL ## APPEASE R CMD CHECK + ## prep ---------------------------------------------------------------------- + PF <- parent.frame(1L) + subset <- substitute(subset) + subset <- evalLogicalSubset(data = x, subset, enclos = PF) + + l <- prep_plot_survtab(x = x, y = y, subset = subset, + conf.int = conf.int, enclos = PF) + x <- l$x + y <- l$y + y.ci <- l$y.ci + y.lo <- l$y.lo + y.hi <- l$y.hi + + ## figure out limits, etc. to pass to plot() --------------------------------- + + min_y <- do.call("min", c(mget(c(y, y.lo), as.environment(x)), na.rm = TRUE)) + min_y <- max(min_y, 0) + max_y <- max(x[[y]], na.rm=TRUE) + + if (substr(y, 1, 3) == "CIF") { + min_y <- 0.0 + } else { + max_y <- max(1.0, max_y) + } + + max_x <- max(x[, Tstop]) + min_x <- min(x[, Tstop-delta]) + + if (is.null(ylab)) { + ylab <- "Observed survival" + if (substr(y[1], 1,4) %in% c("r.e2", "r.pp")) ylab <- "Net survival" + if (substr(y[1], 1,4) == "CIF_") ylab <- "Absolute risk" + if (substr(y[1], 1,6) == "CIF.rel") ylab <- "Absolute risk" + } + if (is.null(xlab)) xlab <- "Time from entry" + + ## attributes insurance to pass to lines.survtab + setattr(x, "survtab.meta", l$attrs$survtab.meta) + setattr(x, "class", c("survtab", "data.table", "data.frame")) + + ## plotting ------------------------------------------------------------------ + plot(I(c(min_y,max_y))~I(c(min_x,max_x)), data=x, type="n", + xlab = xlab, ylab = ylab, ...) + + + lines.survtab(x, subset = NULL, y = y, conf.int=conf.int, + col=col, lty=lty, ...) + + +} + + +#' \code{lines} method for survtab objects +#' +#' Plot \code{lines} from a \code{survtab} object +#' +#' @import graphics +#' +#' @author Joonas Miettinen +#' +#' @param x a \code{survtab} output object +#' @param y a variable to plot; a quoted name of a variable +#' in \code{x}; e.g. \code{y = "surv.obs"}; +#' if \code{NULL}, picks last survival variable column in order in \code{x} +#' @param subset a logical condition; \code{obj} is subset accordingly +#' before plotting; use this for limiting to specific strata, +#' e.g. \code{subset = sex == "male"} +#' @param conf.int logical; if \code{TRUE}, also plots any confidence intervals +#' present in \code{obj} for variables in \code{y} +#' @param col line colour passed to \code{matlines} +#' @param lty line type passed to \code{matlines} +#' @param ... additional arguments passed on to to a \code{matlines} call; +#' e.g. \code{lwd} can be defined this way +#' @examples +#' data(sire) +#' data(sibr) +#' si <- rbind(sire, sibr) +#' si$period <- cut(si$dg_date, as.Date(c("1993-01-01", "2004-01-01", "2013-01-01")), right = FALSE) +#' si$cancer <- c(rep("rectal", nrow(sire)), rep("breast", nrow(sibr))) +#' x <- lexpand(si, birth = bi_date, entry = dg_date, exit = ex_date, +#' status = status %in% 1:2, +#' fot = 0:5, aggre = list(cancer, period, fot)) +#' st <- survtab_ag(fot ~ cancer + period, data = x, +#' surv.method = "lifetable", surv.type = "surv.obs") +#' +#' plot(st, "surv.obs", subset = cancer == "breast", ylim = c(0.5, 1), col = "blue") +#' lines(st, "surv.obs", subset = cancer == "rectal", col = "red") +#' +#' ## or +#' plot(st, "surv.obs", col = c(2,2,4,4), lty = c(1, 2, 1, 2)) +#' @export +#' @family survtab functions +lines.survtab <- function(x, y = NULL, subset = NULL, + conf.int = TRUE, col=NULL, lty=NULL, ...) { + Tstop <- NULL ## APPEASE R CMD CHECK + ## prep ---------------------------------------------------------------------- + PF <- parent.frame(1L) + global_breaks <- attr(x, "survtab.meta")$surv.breaks + + subset <- substitute(subset) + subset <- evalLogicalSubset(data = x, subset, enclos = PF) + + + l <- prep_plot_survtab(x = x, y = y, subset = subset, + conf.int = conf.int, enclos = environment()) + x <- l$x + y <- l$y + y.ci <- l$y.ci + y.lo <- l$y.lo + y.hi <- l$y.hi + strata <- l$strata ## character vector of var names + + + ## impute first values (time = 0, surv = 1 / cif = 0) ------------------------ + + is_CIF <- if (substr(y, 1, 3) == "CIF") TRUE else FALSE + setkeyv(x, c(strata, "Tstop")) + first <- x[1, ] + if (length(strata)) first <- unique(x, by = strata) + first[, c(y) := ifelse(is_CIF, 0, 1)] + first$Tstop <- min(global_breaks) + + if (length(y.ci) > 0) first[, (y.ci) := get(y) ] + x <- rbindlist(list(first, x[, ]), use.names = TRUE) + setkeyv(x, c(strata, "Tstop")) + + ## plotting ------------------------------------------------------------------ + + if (is.null(lty)) { + lty <- list(c(1,2,2)) + if (!length(y.ci)) lty <- list(1) + } + + lines_by(x = "Tstop", y = c(y, y.ci), + strata.vars = strata, + data = x, col = col, lty = lty, ...) + + +} + + + +lines_by <- function(x, y, strata.vars = NULL, data, col, lty, ...) { + ## INTENTION: plots lines separately by strata, + ## which may have different colours / linetypes. + ## @param x a variable to plot y by; a character string + ## @param y a character vector of variables to plot by x; + ## e.g. the estimate and confidence interval variables + ## @param strata.vars a character string vector; variables + ## to add lines by, which may have different colours etc for identification + ## @param data a data.frame where x, y, and strata.vars are found + ## @param col a vector of colors passed to lines(); if vector length 1, + ## used for each level of strata. If vector length > 1, + ## has to match to total number of strata. If list, must match + ## to number of strata by length and contain elements of length + ## length(y). + ## @param see col; line type passed to lines(). + ## @param ... other arguments passed on to lines(). + + TF <- environment() + PF <- parent.frame(1L) + + stopifnot(is.data.frame(data)) + stopifnot(is.character(x) && length(x) == 1L) + stopifnot(is.character(y) && length(y) > 0L) + stopifnot(is.character(strata.vars) || is.null(strata.vars)) + all_names_present(data, c(x,y,strata.vars)) + + d <- mget(c(strata.vars, y, x), envir = as.environment(data)) + setDT(d) + setkeyv(d, c(strata.vars, x)) + + ## create list of datas + l <- list(d) + inter <- 1L + if (length(strata.vars)) { + inter <- do.call(interaction, d[, strata.vars, with = FALSE]) + l <- vector("list", uniqueN(inter)) + l <- split(d, f = inter, drop = TRUE) + } + + l <- lapply(l, function(tab) { + setDT(tab) + setcolsnull(tab, keep = c(x, y)) + tab + }) + + + ## figure out colours and ltys + for (objname in c("col", "lty")) { + obj <- TF[[objname]] + + if (missing(obj) || !length(obj)) obj <- 1 + if (!length(obj) %in% c(1, length(l))) { + stop("Argument ", objname, " is not of length 1 or ", + "of length equal to total number of strata (", + length(l), ").") + } + + ol <- unlist(lapply(obj, length)) + if (length(y) > 1 && is.list(obj) && !all(ol %in% c(1, length(y)))) { + stop("Argument y is of length > 1, and you passed ", + objname, " as a list of values, but at least one element is not ", + "of length 1 or length(y).") + } + + ## NOTE: rep works for vector and list just the same + if (length(obj) == 1) obj <- rep(obj, length(l)) + obj <- as.list(obj) + + assign(x = objname, value = obj) + } + + lapply(seq_along(l), function(i) { + + tab <- l[[i]] + cols <- col[[i]] + ltys <- lty[[i]] + + matlines(x = tab[[x]], y = tab[, y, with = FALSE], + col = cols, lty = ltys, ...) + + }) + + invisible(NULL) +} + + + + +#' @title Graphically Inspect Curves Used in Mean Survival Computation +#' @description Plots the observed (with extrapolation) and expected survival +#' curves for all strata in an object created by \code{\link{survmean}} +#' @author Joonas Miettinen +#' @param x a \code{survmean} object +#' @param ... arguments passed (ultimately) to \code{matlines}; you +#' may, therefore, supply e.g. \code{xlab} through this, though arguments +#' such as \code{lty} and \code{col} will not work +#' @details +#' +#' For examples see \code{\link{survmean}}. This function is intended only +#' for graphically inspecting that the observed survival curves with extrapolation +#' and the expected survival curves have been sensibly computed in \code{survmean}. +#' +#' If you want finer control over the plotted curves, extract the curves from +#' the \code{survmean} output using +#' +#' \code{attr(x, "curves")} +#' +#' where \code{x} is a \code{survmean} object. +#' @export +#' @family survmean functions +plot.survmean <- function(x, ...) { + at <- attr(x, "survmean.meta") + curves <- at$curves + if (is.null(curves)) { + stop("no curves information in x; sometimes lost if x ", + "altered after using survmean") + } + + by.vars <- at$tprint + by.vars <- c(by.vars, at$tadjust) + by.vars <- intersect(by.vars, names(curves)) + if (!length(by.vars)) by.vars <- NULL + + plot(curves$surv ~ curves$Tstop, type="n", + xlab = "Time from entry", ylab = "Survival") + lines.survmean(x, ...) + + subr <- at$breaks[[at$survScale]] + abline(v = max(subr), lty=2, col="grey") + + if (length(by.vars)) { + ## add legend denoting colors + Stratum <- curves[, unique(interaction(.SD)), .SDcols = eval(by.vars)] + legend(x = "topright", legend = Stratum, col = seq_along(Stratum), lty = 1) + } + +} + +#' @title Graphically Inspect Curves Used in Mean Survival Computation +#' @description Plots the observed (with extrapolation) and expected survival +#' curves for all strata in an object created by \code{\link{survmean}} +#' @author Joonas Miettinen +#' @param x a \code{survmean} object +#' @param ... arguments passed (ultimately) to \code{matlines}; you +#' may, therefore, supply e.g. \code{lwd} through this, though arguments +#' such as \code{lty} and \code{col} will not work +#' @details +#' +#' This function is intended to be a workhorse for \code{\link{plot.survmean}}. +#' If you want finer control over the plotted curves, extract the curves from +#' the \code{survmean} output using +#' +#' \code{attr(x, "curves")} +#' +#' where \code{x} is a \code{survmean} object. +#' @export +#' @family survmean functions +lines.survmean <- function(x, ...) { + at <- copy(attr(x, "survmean.meta")) + curves <- at$curves + if (is.null(curves)) stop("no curves information in x; usually lost if x altered after using survmean") + + by.vars <- at$tprint + by.vars <- c(by.vars, at$tadjust) + by.vars <- c("survmean_type", by.vars) + by.vars <- intersect(by.vars, names(curves)) + if (!length(by.vars)) by.vars <- NULL + + curves <- data.table(curves) + setkeyv(curves, c(by.vars, "Tstop")) + + type_levs <- length(levels(interaction(curves[, c(by.vars), with=FALSE])))/2L + other_levs <- 1L + if (length(by.vars) > 1) { + other_levs <- length(levels(interaction(curves[, setdiff(by.vars, "survmean_type"), with=FALSE]))) + } + + curves <- cast_simple(curves, columns = by.vars, rows = "Tstop", values = "surv") + matlines(x=curves$Tstop, y=curves[, setdiff(names(curves), "Tstop"), with=FALSE], + lty = rep(1:2, each=type_levs), col = 1:other_levs, ...) +} + + + + + + + +#' @export +getCall.survtab <- function(x, ...) { + attributes(x)$survtab.meta$call +} + + +#' @export +formula.survtab <- function(x, ...) { + attr(x, "survtab.meta")$arguments$formula +} + + + + + +#' @export +getCall.survmean <- function(x, ...) { + attributes(x)$survmean.meta$call +} + + +#' @export +formula.survmean <- function(x, ...) { + attr(x, "survmean.meta")$formula +} + + + diff --git a/R/aggregating.R b/R/aggregating.R index a44a465..11771a5 100644 --- a/R/aggregating.R +++ b/R/aggregating.R @@ -1,616 +1,616 @@ - -#' @title Set \code{aggre} attributes to an object by modifying in place -#' @author Joonas Miettinen -#' @description Coerces an R object to an \code{aggre} object, identifying -#' the object as one containing aggregated counts, person-years and other -#' information. \code{setaggre} modifies in place without taking any copies. -#' Retains all other attributes. -#' @param x a \code{data.frame} or \code{data.table} -#' @param values a character string vector; the names of value variables -#' @param by a character string vector; the names of variables by which -#' \code{values} have been tabulated -#' @param breaks a list of breaks, where each element is a breaks vector -#' as usually passed to e.g. \code{\link{splitLexisDT}}. The list must be -#' fully named, with the names corresponding to time scales at the aggregate -#' level in your data. Every unique value in a time scale variable in data must -#' also exist in the corresponding vector in the breaks list. -#' @details -#' -#' \code{setaggre} sets \code{x} to the \code{aggre} class in place -#' without taking a copy as e.g. \code{as.data.frame.XXX} functions do; see e.g. -#' \code{\link[data.table]{setDT}}. -#' -#' @family aggregation functions -#' -#' @export setaggre -#' @examples -#' df <- data.frame(sex = rep(c("male", "female"), each = 5), -#' obs = rpois(10, rep(7,5, each=5)), -#' pyrs = rpois(10, lambda = 10000)) -#' ## without any breaks -#' setaggre(df, values = c("obs", "pyrs"), by = "sex") -#' df <- data.frame(df) -#' df$FUT <- 0:4 -#' ## with breaks list -#' setaggre(df, values = c("obs", "pyrs"), by = "sex", breaks = list(FUT = 0:5)) -setaggre <- function(x, values = NULL, by = NULL, breaks = NULL) { - ## input: aggregated data in data.frame or data.table format - ## intention: any user can define their data as an aggregated data set - ## which will be usable by survtab / sir / other - ## output: no need to do x <- setaggre(x); instead modifies attributes in place; - ## sets "aggre.meta" attribute, a list of names of various variables. - ## survtab for aggregated data will need this attribute to work. - all_names_present(x, c(values, by)) - - if (!length(by) && length(values)) by <- setdiff(names(x), values) - if (length(by) && !length(values)) values <- setdiff(names(x), by) - - if (!inherits(x, "aggre")) { - cl <- class(x) - wh <- which(cl %in% c("data.table", "data.frame")) - wh <- min(wh) - - ## yes, from zero: in case only one class - cl <- c(cl[0:(wh-1)], "aggre", cl[wh:length(cl)]) - setattr(x, "class", cl) - } - - - setattr(x, "aggre.meta", list(values = values, by = by, breaks = breaks)) - setattr(x, "breaks", breaks) - invisible(x) -} - -#' @title Coercion to Class \code{aggre} -#' @author Joonas Miettinen -#' @description Coerces an R object to an \code{aggre} object, identifying -#' the object as one containing aggregated counts, person-years and other -#' information. -#' @inheritParams setaggre -#' @param ... arguments passed to or from methods -#' @family aggregation functions -#' -#' -#' @examples -#' library("data.table") -#' df <- data.frame(sex = rep(c("male", "female"), each = 5), -#' obs = rpois(10, rep(7,5, each=5)), -#' pyrs = rpois(10, lambda = 10000)) -#' dt <- as.data.table(df) -#' -#' df <- as.aggre(df, values = c("pyrs", "obs"), by = "sex") -#' dt <- as.aggre(dt, values = c("pyrs", "obs"), by = "sex") -#' -#' class(df) -#' class(dt) -#' -#' BL <- list(fot = 0:5) -#' df <- data.frame(df) -#' df <- as.aggre(df, values = c("pyrs", "obs"), by = "sex", breaks = BL) -#' -#' @export -as.aggre <- function(x, values = NULL, by = NULL, breaks = NULL, ...) { - UseMethod("as.aggre", x) -} - -#' @describeIn as.aggre Coerces a \code{data.frame} to an \code{aggre} object -#' @export -as.aggre.data.frame <- function(x, values = NULL, by = NULL, breaks = NULL, ...) { - x <- copy(x) - setaggre(x, values = values, by = by, breaks = breaks, ...) - setattr(x, "class", c("aggre", "data.frame")) - x[] -} - -#' @describeIn as.aggre Coerces a \code{data.table} to an \code{aggre} object -#' @export -as.aggre.data.table <- function(x, values = NULL, by = NULL, breaks = NULL, ...) { - x <- copy(x) - setaggre(x, values = values, by = by, breaks = breaks, ...) - setattr(x, "class", c("aggre", "data.table", "data.frame")) - x[] -} - -#' @describeIn as.aggre Default method for \code{as.aggre} (stops computations -#' if no class-specific method found) -#' @export -as.aggre.default <- function(x, ...) { - stop(gettextf("cannot coerce class \"%s\" to 'aggre'", deparse(class(x))), - domain = NA) -} - - - -#' @title Aggregation of split \code{Lexis} data -#' @author Joonas Miettinen -#' @description Aggregates a split \code{Lexis} object by given variables -#' and / or expressions into a long-format table of person-years and -#' transitions / end-points. Automatic aggregation over time scales -#' by which data has been split if the respective time scales are mentioned -#' in the aggregation argument to e.g. intervals of calendar time, follow-up time -#' and/or age. -#' @param lex a \code{Lexis} object split with e.g. -#' \code{\link[Epi]{splitLexis}} or \code{\link{splitMulti}} -#' @param by variables to tabulate (aggregate) by. -#' \link[=flexible_argument]{Flexible input}, typically e.g. -#' \code{by = c("V1", "V2")}. See Details and Examples. -#' @param type determines output levels to which data is aggregated varying -#' from returning only rows with \code{pyrs > 0} (\code{"unique"}) to -#' returning all possible combinations of variables given in \code{aggre} even -#' if those combinations are not represented in data (\code{"full"}); -#' see Details -#' @param sum.values optional: additional variables to sum by argument -#' \code{by}. \link[=flexible_argument]{Flexible input}, typically e.g. -#' \code{sum.values = c("V1", "V2")} -#' @param subset a logical condition to subset by before computations; -#' e.g. \code{subset = area \%in\% c("A", "B")} -#' @param verbose \code{logical}; if \code{TRUE}, the function returns timings -#' and some information useful for debugging along the aggregation process -#' @details -#' -#' \strong{Basics} -#' -#' \code{aggre} is intended for aggregation of split \code{Lexis} data only. -#' See \code{\link[Epi]{Lexis}} for forming \code{Lexis} objects by hand -#' and e.g. \code{\link[Epi]{splitLexis}}, \code{\link{splitLexisDT}}, and -#' \code{\link{splitMulti}} for splitting the data. \code{\link{lexpand}} -#' may be used for simple data sets to do both steps as well as aggregation -#' in the same function call. -#' -#' Here aggregation refers to computing person-years and the appropriate events -#' (state transitions and end points in status) for the subjects in the data. -#' Hence, it computes e.g. deaths (end-point and state transition) and -#' censorings (end-point) as well as events in a multi-state setting -#' (state transitions). -#' -#' The result is a long-format \code{data.frame} or \code{data.table} -#' (depending on \code{options("popEpi.datatable")}; see \code{?popEpi}) -#' with the columns \code{pyrs} and the appropriate transitions named as -#' \code{fromXtoY}, e.g. \code{from0to0} and \code{from0to1} depending -#' on the values of \code{lex.Cst} and \code{lex.Xst}. -#' -#' -#' \strong{The by argument} -#' -#' The \code{by} argument determines the length of the table, i.e. -#' the combinations of variables to which data is aggregated. -#' \code{by} is relatively flexible, as it can be supplied as -#' -#' \itemize{ -#' \item{a character string vector, e.g. \code{c("sex", "area")}, -#' naming variables existing in \code{lex}} -#' \item{an expression, e.g. \code{factor(sex, 0:1, c("m", "f"))} -#' using any variable found in \code{lex}} -#' \item{a list (fully or partially named) of expressions, e.g. -#' \code{list(gender = factor(sex, 0:1, c("m", "f"), area)}} -#' } -#' -#' Note that expressions effectively allow a variable to be supplied simply as -#' e.g. \code{by = sex} (as a symbol/name in R lingo). -#' -#' The data is then aggregated to the levels of the given variables -#' or expression(s). Variables defined to be time scales in the supplied -#' \code{Lexis} are processed in a special way: If any are mentioned in the -#' \code{by} argument, intervals of them are formed based on the breaks -#' used to split the data: e.g. if \code{age} was split using the breaks -#' \code{c(0, 50, Inf)}, mentioning \code{age} in \code{by} leads to -#' creating the \code{age} intervals \code{[0, 50)} and \code{[50, Inf)} -#' and aggregating to them. The intervals are identified in the output -#' as the lower bounds of the appropriate intervals. -#' -#' The order of multiple time scales mentioned in \code{by} matters, -#' as the last mentioned time scale is assumed to be a survival time scale -#' for when computing event counts. E.g. when the data is split by the breaks -#' \code{list(FUT = 0:5, CAL = c(2008,2010))}, time lines cut short at -#' \code{CAL = 2010} are considered to be censored, but time lines cut short at -#' \code{FUT = 5} are not. See Return. -#' -#' \strong{Aggregation types (styles)} -#' -#' It is almost always enough to aggregate the data to variable levels -#' that are actually represented in the data -#' (default \code{aggre = "unique"}; alias \code{"non-empty"}). -#' For certain uses it may be useful -#' to have also "empty" levels represented (resulting in some rows in output -#' with zero person-years and events); in these cases supplying -#' \code{aggre = "full"} (alias \code{"cartesian"}) causes \code{aggre} -#' to determine the Cartesian product of all the levels of the supplied -#' \code{by} variables or expressions and aggregate to them. As an example -#' of a Cartesian product, try -#' -#' \code{merge(1:2, 1:5)}. -#' -#' @return -#' A long \code{data.frame} or \code{data.table} of aggregated person-years -#' (\code{pyrs}), numbers of subjects at risk (\code{at.risk}), and events -#' formatted \code{fromXtoY}, where \code{X} and \code{X} are states -#' transitioning from and to or states at the end of each \code{lex.id}'s -#' follow-up (implying \code{X} = \code{Y}). Subjects at risk are computed -#' in the beginning of an interval defined by any Lexis time scales and -#' mentioned in \code{by}, but events occur at any point within an interval. -#' -#' When the data has been split along multiple time scales, the last -#' time scale mentioned in \code{by} is considered to be the survival time -#' scale with regard to computing events. Time lines cut short by the -#' extrema of non-survival-time-scales are considered to be censored -#' ("transitions" from the current state to the current state). -#' -#' @seealso \code{\link{aggregate}} for a similar base R solution, -#' and \code{\link{ltable}} for a \code{data.table} based aggregator. Neither -#' are directly applicable to split \code{Lexis} data. -#' -#' @family aggregation functions -#' -#' @examples -#' -#' ## form a Lexis object -#' library(Epi) -#' data(sibr) -#' x <- sibr[1:10,] -#' x[1:5,]$sex <- 0 ## pretend some are male -#' x <- Lexis(data = x, -#' entry = list(AGE = dg_age, CAL = get.yrs(dg_date)), -#' exit = list(CAL = get.yrs(ex_date)), -#' entry.status=0, exit.status = status) -#' x <- splitMulti(x, breaks = list(CAL = seq(1993, 2013, 5), -#' AGE = seq(0, 100, 50))) -#' -#' ## these produce the same results (with differing ways of determining aggre) -#' a1 <- aggre(x, by = list(gender = factor(sex, 0:1, c("m", "f")), -#' agegroup = AGE, period = CAL)) -#' -#' a2 <- aggre(x, by = c("sex", "AGE", "CAL")) -#' -#' a3 <- aggre(x, by = list(sex, agegroup = AGE, CAL)) -#' -#' ## returning also empty levels -#' a4 <- aggre(x, by = c("sex", "AGE", "CAL"), type = "full") -#' -#' ## computing also expected numbers of cases -#' x <- lexpand(sibr[1:10,], birth = bi_date, entry = dg_date, -#' exit = ex_date, status = status %in% 1:2, -#' pophaz = popmort, fot = 0:5, age = c(0, 50, 100)) -#' x$d.exp <- with(x, lex.dur*pop.haz) -#' ## these produce the same result -#' a5 <- aggre(x, by = c("sex", "age", "fot"), sum.values = list(d.exp)) -#' a5 <- aggre(x, by = c("sex", "age", "fot"), sum.values = "d.exp") -#' a5 <- aggre(x, by = c("sex", "age", "fot"), sum.values = d.exp) -#' ## same result here with custom name -#' a5 <- aggre(x, by = c("sex", "age", "fot"), -#' sum.values = list(expCases = d.exp)) -#' -#' ## computing pohar-perme weighted figures -#' x$d.exp.pp <- with(x, lex.dur*pop.haz*pp) -#' a6 <- aggre(x, by = c("sex", "age", "fot"), -#' sum.values = c("d.exp", "d.exp.pp")) -#' ## or equivalently e.g. sum.values = list(expCases = d.exp, expCases.p = d.exp.pp). -#' @export -aggre <- function(lex, by = NULL, type = c("unique", "full"), sum.values = NULL, subset = NULL, verbose = FALSE) { - - allTime <- proc.time() - - lex.Cst <- lex.Xst <- lex.id <- at.risk <- NULL ## APPEASE R CMD CHECK - - PF <- parent.frame(1L) - TF <- environment() - - type <- match.arg(type[1], c("non-empty", "unique", "full", "cartesian")) - if (type == "cartesian") type <- "full" - if (type == "non-empty") type <- "unique" - - if (verbose) cat("Aggregation type: '", type, "' \n", sep = "") - - checkLexisData(lex) - - breaks <- copy(attr(lex, "breaks")) - checkBreaksList(lex, breaks) - - allScales <- copy(attr(lex, "time.scales")) - if (length(allScales) == 0 ) { - stop("could not determine names of time scales; ", - "is the data a Lexis object?") - } - - ## subset -------------------------------------------------------------------- - subset <- substitute(subset) - subset <- evalLogicalSubset(lex, subset) - - ## check sum.values ---------------------------------------------------------- - sumSub <- substitute(sum.values) - sum.values <- evalPopArg(lex[1:min(nrow(lex), 20L), ], arg = sumSub, - enclos = PF, recursive = TRUE, DT = TRUE) - sumType <- attr(sum.values, "arg.type") - sumVars <- attr(sum.values, "all.vars") - sumSub <- attr(sum.values, "quoted.arg") - if (is.null(sum.values)) { - sumType <- "NULL" - sumVars <- NULL - sumSub <- quote(list()) - } - badSum <- names(sum.values)[!sapply(sum.values, is.numeric)] - if (length(badSum) > 0L) { - badSum <- paste0("'", badSum, "'", collapse = ", ") - stop("Following variables resulting from evaluating supplied sum.values ", - "argument are not numeric and cannot be summed: ", badSum, - ". Evaluated sum.values: ", deparse(sumSub)) - } - - - ## by argument type ------------------------------------------------------- - ## NOTE: need to eval by AFTER cutting time scales! - - ags <- substitute(by) - if (verbose) cat("Used by argument:", paste0(deparse(ags)),"\n") - - ## NOTE: with recursive = TRUE, evalPopArg digs deep enough to find - ## the actual expression (substituted only once) and returns that and other - ## things in attributes. Useful if arg substituted multiple times. - by <- evalPopArg(data = lex[1:min(nrow(lex), 20),], - arg = ags, DT = TRUE, enclos = PF, recursive = TRUE) - ags <- attr(by, "quoted.arg") - av <- attr(by, "all.vars") - argType <- attr(by, "arg.type") - - if (is.null(by)) { - ags <- substitute(list()) - av <- NULL - argType <- "NULL" - type <- "unique" - } - if (verbose) cat("Type of by argument:", argType, "\n") - - ## take copy of lex ---------------------------------------------------------- - ## if lex is a data.table, this function gets really complicated. - ## if copy is taken only of necessary vars, it should be fine. - keepVars <- unique(c("lex.id", allScales, "lex.dur", - "lex.Cst", "lex.Xst", av, sumVars)) - lex.orig <- lex - lex <- subsetDTorDF(lex, subset = subset, select = keepVars) - lex <- data.table(lex) - forceLexisDT(lex, breaks = breaks, allScales = allScales, key = FALSE) - - - ## ensure no observations outside breaks limits are left in - lex <- intelliDrop(lex, breaks = breaks) - - setkeyv(lex, c("lex.id", allScales[1])) - setcolsnull(lex, delete = setdiff(allScales, names(breaks))) - - - ## cut time scales for aggregating if needed --------------------------------- - aggScales <- intersect(av, allScales) - if (any(!aggScales %in% names(breaks))) { - aggScales <- paste0("'", setdiff(aggScales, names(breaks)), "'", collapse = ", ") - stop("Requested aggregating by time scale(s) by which data ", - "has not been split: ", aggScales) - } - - ## before cutting, find out which rows count towards "at.risk" figure: - ## of all scales in aggScales, the last one (or the only one) is assumed - ## to be the survival time scale. - tmpAtRisk <- makeTempVarName(lex, pre = "at.risk_") - set(lex, j = tmpAtRisk, value = TRUE) - survScale <- NULL - - - if (length(aggScales) > 0) { - cutTime <- proc.time() - ## "at.risk" counts subjects at risk in the beginning of the survival - ## time scale interval. - survScale <- aggScales[length(aggScales)] - lex[, c(tmpAtRisk) := lex[[survScale]] %in% breaks[[survScale]] ] - catAggScales <- paste0("'", aggScales, "'", collapse = ", ") - if (verbose) { - cat("Following time scales mentioned in by argument and will be", - "categorized into intervals (defined by breaks in object", - "attributes) for aggregation:", catAggScales, "\n") - } - - ## NEW METHOD: use a copy of lex and just modify in place. - - for (sc in aggScales) { - set(lex, j = sc, value = cutLow(lex[[sc]], breaks = breaks[[sc]])) - } - - if (verbose) cat("Time taken by cut()'ting time scales: ", timetaken(cutTime), "\n") - } - - othVars <- setdiff(av, aggScales) - if (verbose && length(othVars) > 0) { - catOthVars <- paste0("'", othVars, "'", collapse = ", ") - cat("Detected the following non-time-scale variables to be utilized in aggregating:", catOthVars, "\n") - } - - ## eval by ------------------------------------------------------------------- - ## NOTE: needed to eval by AFTER cutting time scales! - by <- evalPopArg(data = lex, arg = ags, DT = TRUE, enclos = PF, recursive = TRUE) - byNames <- names(by) - - ## computing pyrs ------------------------------------------------------------ - ## final step in determining at.risk: - ## a lex.id is at.risk only once per by-level - pyrsTime <- proc.time() - vdt <- data.table(pyrs = lex$lex.dur, at.risk = lex[[tmpAtRisk]], - lex.id = lex$lex.id) - pyrs <- vdt[, .(pyrs = sum(pyrs), - at.risk = sum(!duplicated(lex.id) & at.risk)), - keyby = by] - setDT(pyrs) - - rm(vdt) - sumNames <- NULL - if (sumType != "NULL") { - if (sumType == "character") { - sumNames <- evalPopArg(lex, sumSub, n = 1L, DT = FALSE, recursive = TRUE, enclos = PF) - sum.values <- lex[, lapply(.SD, sum), keyby = by, .SDcols = c(sumNames)] - } else { - sum.values <- evalPopArg(lex, sumSub, n = 1L, enclos = PF) - sumNames <- names(sum.values) - sumTmpNames <- makeTempVarName(lex, pre = sumNames) - set(lex, j = sumTmpNames, value = sum.values) - sum.values <- lex[, lapply(.SD, sum), keyby = by, .SDcols = sumTmpNames] - setnames(sum.values, sumTmpNames, sumNames) - setcolsnull(lex, sumTmpNames) - } - - setDT(sum.values) - pyrs <- merge(pyrs, sum.values, all = TRUE) - rm(sum.values) - } - - - if (verbose) cat("Time taken by aggregating pyrs: ", timetaken(pyrsTime), "\n") - - valVars <- setdiff(names(pyrs), byNames) ## includes pyrs and anything created by sum - - pyrs[is.na(pyrs), pyrs := 0] - pyrs <- pyrs[pyrs > 0] - - aggPyrs <- pyrs[, sum(pyrs)] - lexPyrs <- sum(lex.orig$lex.dur[subset]) - pyrsDiff <- aggPyrs - lexPyrs - if (!isTRUE(all.equal(aggPyrs, lexPyrs, scale = NULL))) { - warning("Found discrepancy of ", abs(round(pyrsDiff, 4)), " ", - "in total aggregated pyrs compared to ", - "sum(lex$lex.dur); compare results by hand and make sure ", - "settings are right \n") - } - rm(subset, aggPyrs, lexPyrs) - - ## cartesian output ---------------------------------------------------------- - if (type == "full") { - carTime <- proc.time() - - varsUsingScales <- NULL - - ## which variables used one time scale? and which one? - ## will only be used in cartesian stuff. - if (argType == "character") { - varsUsingScales <- intersect(by, aggScales) - whScaleUsed <- varsUsingScales - } else if (argType != "NULL") { - ## note: ags a substitute()'d list at this point always if not char - whScaleUsed <- lapply(ags[-1], function(x) intersect(all.vars(x), aggScales)) - ## only one time scale should be used in a variable! - oneScaleTest <- any(sapply(whScaleUsed, function(x) length(x) > 1L)) - if (oneScaleTest) stop("Only one Lexis time scale can be used in any one variable in by argument!") - varsUsingScales <- byNames[sapply(whScaleUsed, function (x) length(x) == 1L)] - whScaleUsed <- unlist(whScaleUsed) - } - - ceejay <- lapply(by, function(x) if (is.factor(x)) levels(x) else sort(unique(x))) - if (length(aggScales) > 0) { - ## which variables in ceejay used the Lexis time scales from lex? - - ceejay[varsUsingScales] <- lapply(breaks[whScaleUsed], function(x) x[-length(x)]) - } - - ceejay <- do.call(CJ, ceejay) - setkeyv(ceejay, byNames) - setkeyv(pyrs, byNames) - - pyrs <- pyrs[ceejay] - rm(ceejay) - - if (verbose) cat("Time taken by making aggregated data large in the cartesian product sense: ", timetaken(carTime), "\n") - } - - - ## computing events ---------------------------------------------------------- - - transTime <- proc.time() - - if (is.null(by) || (is.data.table(by) && nrow(by) == 0L)) { - - by <- quote(list(lex.Cst, lex.Xst)) - - } else { - for (var in c("lex.Cst", "lex.Xst")) { - set(by, j = var, value = lex[[var]]) - } - } - - - ## NOTE: this will ensure correct detection of censorings: - ## observations cut short by e.g. period window's edge - ## will be considered a censoring if the breaks along that time scale - ## are not passed to detectEvents (assuming the survival time scale is - ## used in by). If no time scale mentioned in by, then all endings - ## of observations are either censorings or events. - detBr <- breaks[survScale] - if (!length(survScale)) detBr <- NULL - hasEvent <- detectEvents(lex, breaks = detBr, by = "lex.id") %in% 1:2 - ## is language if user supplied by = NULL - if (!is.language(by)) by <- by[hasEvent] - - trans <- lex[hasEvent, list(obs = .N), keyby = by] - - rm(by, lex) - - - if (verbose) cat("Time taken by aggregating events: ", timetaken(transTime), "\n") - - ## casting & merging --------------------------------------------------------- - - mergeTime <- proc.time() - setDT(trans) - setDT(pyrs) - - ## tmpTr to be used in casting - tmpTr <- makeTempVarName(trans, pre = "trans_") - trans[, c(tmpTr) := paste0("from", lex.Cst, "to", lex.Xst)] - transitions <- sort(unique(trans[[tmpTr]])) - trans[, c("lex.Cst", "lex.Xst") := NULL] - - ## note: need tmpDum if by = NULL for correct casting & merging - tmpDum <- makeTempVarName(trans) - byNames <- c(byNames, tmpDum) - byNames <- setdiff(byNames, c("lex.Cst", "lex.Xst")) - trans[, c(tmpDum) := 1L] - pyrs[, c(tmpDum) := 1L] - - valVars <- unique(c(valVars, transitions)) - - trans <- cast_simple(trans, rows = byNames, columns = tmpTr, values = "obs") - - setkeyv(trans, NULL); setkeyv(pyrs, NULL) ## dcast.data.table seems to keep key but row order may be funky; this avoids a warning - setkeyv(trans, byNames); setkeyv(pyrs, byNames) - trans <- trans[pyrs]; rm(pyrs) - - trans[, c(tmpDum) := NULL] - byNames <- setdiff(byNames, tmpDum) - setcolorder(trans, c(byNames, valVars)) - - if (verbose) cat("Time taken by merging pyrs & transitions: ", timetaken(mergeTime), "\n") - - if (length(valVars) > 0L) { - trans[, c(valVars) := lapply(.SD, function(x) { - x[is.na(x)] <- 0 - x - }), .SDcols = c(valVars)] - } - - - ## final touch --------------------------------------------------------------- - trans <- data.table(trans) - setaggre(trans, values = c("pyrs", "at.risk", transitions, sumNames), - by = byNames, breaks = breaks) - if (!return_DT()) setDFpe(trans) - if (verbose) cat("Time taken by aggre(): ", timetaken(allTime), "\n") - - - - trans[] -} - - - - - - - - - - - - + +#' @title Set \code{aggre} attributes to an object by modifying in place +#' @author Joonas Miettinen +#' @description Coerces an R object to an \code{aggre} object, identifying +#' the object as one containing aggregated counts, person-years and other +#' information. \code{setaggre} modifies in place without taking any copies. +#' Retains all other attributes. +#' @param x a \code{data.frame} or \code{data.table} +#' @param values a character string vector; the names of value variables +#' @param by a character string vector; the names of variables by which +#' \code{values} have been tabulated +#' @param breaks a list of breaks, where each element is a breaks vector +#' as usually passed to e.g. \code{\link{splitLexisDT}}. The list must be +#' fully named, with the names corresponding to time scales at the aggregate +#' level in your data. Every unique value in a time scale variable in data must +#' also exist in the corresponding vector in the breaks list. +#' @details +#' +#' \code{setaggre} sets \code{x} to the \code{aggre} class in place +#' without taking a copy as e.g. \code{as.data.frame.XXX} functions do; see e.g. +#' \code{\link[data.table]{setDT}}. +#' +#' @family aggregation functions +#' +#' @export setaggre +#' @examples +#' df <- data.frame(sex = rep(c("male", "female"), each = 5), +#' obs = rpois(10, rep(7,5, each=5)), +#' pyrs = rpois(10, lambda = 10000)) +#' ## without any breaks +#' setaggre(df, values = c("obs", "pyrs"), by = "sex") +#' df <- data.frame(df) +#' df$FUT <- 0:4 +#' ## with breaks list +#' setaggre(df, values = c("obs", "pyrs"), by = "sex", breaks = list(FUT = 0:5)) +setaggre <- function(x, values = NULL, by = NULL, breaks = NULL) { + ## input: aggregated data in data.frame or data.table format + ## intention: any user can define their data as an aggregated data set + ## which will be usable by survtab / sir / other + ## output: no need to do x <- setaggre(x); instead modifies attributes in place; + ## sets "aggre.meta" attribute, a list of names of various variables. + ## survtab for aggregated data will need this attribute to work. + all_names_present(x, c(values, by)) + + if (!length(by) && length(values)) by <- setdiff(names(x), values) + if (length(by) && !length(values)) values <- setdiff(names(x), by) + + if (!inherits(x, "aggre")) { + cl <- class(x) + wh <- which(cl %in% c("data.table", "data.frame")) + wh <- min(wh) + + ## yes, from zero: in case only one class + cl <- c(cl[0:(wh-1)], "aggre", cl[wh:length(cl)]) + setattr(x, "class", cl) + } + + + setattr(x, "aggre.meta", list(values = values, by = by, breaks = breaks)) + setattr(x, "breaks", breaks) + invisible(x) +} + +#' @title Coercion to Class \code{aggre} +#' @author Joonas Miettinen +#' @description Coerces an R object to an \code{aggre} object, identifying +#' the object as one containing aggregated counts, person-years and other +#' information. +#' @inheritParams setaggre +#' @param ... arguments passed to or from methods +#' @family aggregation functions +#' +#' +#' @examples +#' library("data.table") +#' df <- data.frame(sex = rep(c("male", "female"), each = 5), +#' obs = rpois(10, rep(7,5, each=5)), +#' pyrs = rpois(10, lambda = 10000)) +#' dt <- as.data.table(df) +#' +#' df <- as.aggre(df, values = c("pyrs", "obs"), by = "sex") +#' dt <- as.aggre(dt, values = c("pyrs", "obs"), by = "sex") +#' +#' class(df) +#' class(dt) +#' +#' BL <- list(fot = 0:5) +#' df <- data.frame(df) +#' df <- as.aggre(df, values = c("pyrs", "obs"), by = "sex", breaks = BL) +#' +#' @export +as.aggre <- function(x, values = NULL, by = NULL, breaks = NULL, ...) { + UseMethod("as.aggre", x) +} + +#' @describeIn as.aggre Coerces a \code{data.frame} to an \code{aggre} object +#' @export +as.aggre.data.frame <- function(x, values = NULL, by = NULL, breaks = NULL, ...) { + x <- copy(x) + setaggre(x, values = values, by = by, breaks = breaks, ...) + setattr(x, "class", c("aggre", "data.frame")) + x[] +} + +#' @describeIn as.aggre Coerces a \code{data.table} to an \code{aggre} object +#' @export +as.aggre.data.table <- function(x, values = NULL, by = NULL, breaks = NULL, ...) { + x <- copy(x) + setaggre(x, values = values, by = by, breaks = breaks, ...) + setattr(x, "class", c("aggre", "data.table", "data.frame")) + x[] +} + +#' @describeIn as.aggre Default method for \code{as.aggre} (stops computations +#' if no class-specific method found) +#' @export +as.aggre.default <- function(x, ...) { + stop(gettextf("cannot coerce class \"%s\" to 'aggre'", deparse(class(x))), + domain = NA) +} + + + +#' @title Aggregation of split \code{Lexis} data +#' @author Joonas Miettinen +#' @description Aggregates a split \code{Lexis} object by given variables +#' and / or expressions into a long-format table of person-years and +#' transitions / end-points. Automatic aggregation over time scales +#' by which data has been split if the respective time scales are mentioned +#' in the aggregation argument to e.g. intervals of calendar time, follow-up time +#' and/or age. +#' @param lex a \code{Lexis} object split with e.g. +#' \code{\link[Epi]{splitLexis}} or \code{\link{splitMulti}} +#' @param by variables to tabulate (aggregate) by. +#' \link[=flexible_argument]{Flexible input}, typically e.g. +#' \code{by = c("V1", "V2")}. See Details and Examples. +#' @param type determines output levels to which data is aggregated varying +#' from returning only rows with \code{pyrs > 0} (\code{"unique"}) to +#' returning all possible combinations of variables given in \code{aggre} even +#' if those combinations are not represented in data (\code{"full"}); +#' see Details +#' @param sum.values optional: additional variables to sum by argument +#' \code{by}. \link[=flexible_argument]{Flexible input}, typically e.g. +#' \code{sum.values = c("V1", "V2")} +#' @param subset a logical condition to subset by before computations; +#' e.g. \code{subset = area \%in\% c("A", "B")} +#' @param verbose \code{logical}; if \code{TRUE}, the function returns timings +#' and some information useful for debugging along the aggregation process +#' @details +#' +#' \strong{Basics} +#' +#' \code{aggre} is intended for aggregation of split \code{Lexis} data only. +#' See \code{\link[Epi]{Lexis}} for forming \code{Lexis} objects by hand +#' and e.g. \code{\link[Epi]{splitLexis}}, \code{\link{splitLexisDT}}, and +#' \code{\link{splitMulti}} for splitting the data. \code{\link{lexpand}} +#' may be used for simple data sets to do both steps as well as aggregation +#' in the same function call. +#' +#' Here aggregation refers to computing person-years and the appropriate events +#' (state transitions and end points in status) for the subjects in the data. +#' Hence, it computes e.g. deaths (end-point and state transition) and +#' censorings (end-point) as well as events in a multi-state setting +#' (state transitions). +#' +#' The result is a long-format \code{data.frame} or \code{data.table} +#' (depending on \code{options("popEpi.datatable")}; see \code{?popEpi}) +#' with the columns \code{pyrs} and the appropriate transitions named as +#' \code{fromXtoY}, e.g. \code{from0to0} and \code{from0to1} depending +#' on the values of \code{lex.Cst} and \code{lex.Xst}. +#' +#' +#' \strong{The by argument} +#' +#' The \code{by} argument determines the length of the table, i.e. +#' the combinations of variables to which data is aggregated. +#' \code{by} is relatively flexible, as it can be supplied as +#' +#' \itemize{ +#' \item{a character string vector, e.g. \code{c("sex", "area")}, +#' naming variables existing in \code{lex}} +#' \item{an expression, e.g. \code{factor(sex, 0:1, c("m", "f"))} +#' using any variable found in \code{lex}} +#' \item{a list (fully or partially named) of expressions, e.g. +#' \code{list(gender = factor(sex, 0:1, c("m", "f"), area)}} +#' } +#' +#' Note that expressions effectively allow a variable to be supplied simply as +#' e.g. \code{by = sex} (as a symbol/name in R lingo). +#' +#' The data is then aggregated to the levels of the given variables +#' or expression(s). Variables defined to be time scales in the supplied +#' \code{Lexis} are processed in a special way: If any are mentioned in the +#' \code{by} argument, intervals of them are formed based on the breaks +#' used to split the data: e.g. if \code{age} was split using the breaks +#' \code{c(0, 50, Inf)}, mentioning \code{age} in \code{by} leads to +#' creating the \code{age} intervals \code{[0, 50)} and \code{[50, Inf)} +#' and aggregating to them. The intervals are identified in the output +#' as the lower bounds of the appropriate intervals. +#' +#' The order of multiple time scales mentioned in \code{by} matters, +#' as the last mentioned time scale is assumed to be a survival time scale +#' for when computing event counts. E.g. when the data is split by the breaks +#' \code{list(FUT = 0:5, CAL = c(2008,2010))}, time lines cut short at +#' \code{CAL = 2010} are considered to be censored, but time lines cut short at +#' \code{FUT = 5} are not. See Return. +#' +#' \strong{Aggregation types (styles)} +#' +#' It is almost always enough to aggregate the data to variable levels +#' that are actually represented in the data +#' (default \code{aggre = "unique"}; alias \code{"non-empty"}). +#' For certain uses it may be useful +#' to have also "empty" levels represented (resulting in some rows in output +#' with zero person-years and events); in these cases supplying +#' \code{aggre = "full"} (alias \code{"cartesian"}) causes \code{aggre} +#' to determine the Cartesian product of all the levels of the supplied +#' \code{by} variables or expressions and aggregate to them. As an example +#' of a Cartesian product, try +#' +#' \code{merge(1:2, 1:5)}. +#' +#' @return +#' A long \code{data.frame} or \code{data.table} of aggregated person-years +#' (\code{pyrs}), numbers of subjects at risk (\code{at.risk}), and events +#' formatted \code{fromXtoY}, where \code{X} and \code{X} are states +#' transitioning from and to or states at the end of each \code{lex.id}'s +#' follow-up (implying \code{X} = \code{Y}). Subjects at risk are computed +#' in the beginning of an interval defined by any Lexis time scales and +#' mentioned in \code{by}, but events occur at any point within an interval. +#' +#' When the data has been split along multiple time scales, the last +#' time scale mentioned in \code{by} is considered to be the survival time +#' scale with regard to computing events. Time lines cut short by the +#' extrema of non-survival-time-scales are considered to be censored +#' ("transitions" from the current state to the current state). +#' +#' @seealso \code{\link{aggregate}} for a similar base R solution, +#' and \code{\link{ltable}} for a \code{data.table} based aggregator. Neither +#' are directly applicable to split \code{Lexis} data. +#' +#' @family aggregation functions +#' +#' @examples +#' +#' ## form a Lexis object +#' library(Epi) +#' data(sibr) +#' x <- sibr[1:10,] +#' x[1:5,]$sex <- 0 ## pretend some are male +#' x <- Lexis(data = x, +#' entry = list(AGE = dg_age, CAL = get.yrs(dg_date)), +#' exit = list(CAL = get.yrs(ex_date)), +#' entry.status=0, exit.status = status) +#' x <- splitMulti(x, breaks = list(CAL = seq(1993, 2013, 5), +#' AGE = seq(0, 100, 50))) +#' +#' ## these produce the same results (with differing ways of determining aggre) +#' a1 <- aggre(x, by = list(gender = factor(sex, 0:1, c("m", "f")), +#' agegroup = AGE, period = CAL)) +#' +#' a2 <- aggre(x, by = c("sex", "AGE", "CAL")) +#' +#' a3 <- aggre(x, by = list(sex, agegroup = AGE, CAL)) +#' +#' ## returning also empty levels +#' a4 <- aggre(x, by = c("sex", "AGE", "CAL"), type = "full") +#' +#' ## computing also expected numbers of cases +#' x <- lexpand(sibr[1:10,], birth = bi_date, entry = dg_date, +#' exit = ex_date, status = status %in% 1:2, +#' pophaz = popmort, fot = 0:5, age = c(0, 50, 100)) +#' x$d.exp <- with(x, lex.dur*pop.haz) +#' ## these produce the same result +#' a5 <- aggre(x, by = c("sex", "age", "fot"), sum.values = list(d.exp)) +#' a5 <- aggre(x, by = c("sex", "age", "fot"), sum.values = "d.exp") +#' a5 <- aggre(x, by = c("sex", "age", "fot"), sum.values = d.exp) +#' ## same result here with custom name +#' a5 <- aggre(x, by = c("sex", "age", "fot"), +#' sum.values = list(expCases = d.exp)) +#' +#' ## computing pohar-perme weighted figures +#' x$d.exp.pp <- with(x, lex.dur*pop.haz*pp) +#' a6 <- aggre(x, by = c("sex", "age", "fot"), +#' sum.values = c("d.exp", "d.exp.pp")) +#' ## or equivalently e.g. sum.values = list(expCases = d.exp, expCases.p = d.exp.pp). +#' @export +aggre <- function(lex, by = NULL, type = c("unique", "full"), sum.values = NULL, subset = NULL, verbose = FALSE) { + + allTime <- proc.time() + + lex.Cst <- lex.Xst <- lex.id <- at.risk <- NULL ## APPEASE R CMD CHECK + + PF <- parent.frame(1L) + TF <- environment() + + type <- match.arg(type[1], c("non-empty", "unique", "full", "cartesian")) + if (type == "cartesian") type <- "full" + if (type == "non-empty") type <- "unique" + + if (verbose) cat("Aggregation type: '", type, "' \n", sep = "") + + checkLexisData(lex) + + breaks <- copy(attr(lex, "breaks")) + checkBreaksList(lex, breaks) + + allScales <- copy(attr(lex, "time.scales")) + if (length(allScales) == 0 ) { + stop("could not determine names of time scales; ", + "is the data a Lexis object?") + } + + ## subset -------------------------------------------------------------------- + subset <- substitute(subset) + subset <- evalLogicalSubset(lex, subset) + + ## check sum.values ---------------------------------------------------------- + sumSub <- substitute(sum.values) + sum.values <- evalPopArg(lex[1:min(nrow(lex), 20L), ], arg = sumSub, + enclos = PF, recursive = TRUE, DT = TRUE) + sumType <- attr(sum.values, "arg.type") + sumVars <- attr(sum.values, "all.vars") + sumSub <- attr(sum.values, "quoted.arg") + if (is.null(sum.values)) { + sumType <- "NULL" + sumVars <- NULL + sumSub <- quote(list()) + } + badSum <- names(sum.values)[!sapply(sum.values, is.numeric)] + if (length(badSum) > 0L) { + badSum <- paste0("'", badSum, "'", collapse = ", ") + stop("Following variables resulting from evaluating supplied sum.values ", + "argument are not numeric and cannot be summed: ", badSum, + ". Evaluated sum.values: ", deparse(sumSub)) + } + + + ## by argument type ------------------------------------------------------- + ## NOTE: need to eval by AFTER cutting time scales! + + ags <- substitute(by) + if (verbose) cat("Used by argument:", paste0(deparse(ags)),"\n") + + ## NOTE: with recursive = TRUE, evalPopArg digs deep enough to find + ## the actual expression (substituted only once) and returns that and other + ## things in attributes. Useful if arg substituted multiple times. + by <- evalPopArg(data = lex[1:min(nrow(lex), 20),], + arg = ags, DT = TRUE, enclos = PF, recursive = TRUE) + ags <- attr(by, "quoted.arg") + av <- attr(by, "all.vars") + argType <- attr(by, "arg.type") + + if (is.null(by)) { + ags <- substitute(list()) + av <- NULL + argType <- "NULL" + type <- "unique" + } + if (verbose) cat("Type of by argument:", argType, "\n") + + ## take copy of lex ---------------------------------------------------------- + ## if lex is a data.table, this function gets really complicated. + ## if copy is taken only of necessary vars, it should be fine. + keepVars <- unique(c("lex.id", allScales, "lex.dur", + "lex.Cst", "lex.Xst", av, sumVars)) + lex.orig <- lex + lex <- subsetDTorDF(lex, subset = subset, select = keepVars) + lex <- data.table(lex) + forceLexisDT(lex, breaks = breaks, allScales = allScales, key = FALSE) + + + ## ensure no observations outside breaks limits are left in + lex <- intelliDrop(lex, breaks = breaks) + + setkeyv(lex, c("lex.id", allScales[1])) + setcolsnull(lex, delete = setdiff(allScales, names(breaks))) + + + ## cut time scales for aggregating if needed --------------------------------- + aggScales <- intersect(av, allScales) + if (any(!aggScales %in% names(breaks))) { + aggScales <- paste0("'", setdiff(aggScales, names(breaks)), "'", collapse = ", ") + stop("Requested aggregating by time scale(s) by which data ", + "has not been split: ", aggScales) + } + + ## before cutting, find out which rows count towards "at.risk" figure: + ## of all scales in aggScales, the last one (or the only one) is assumed + ## to be the survival time scale. + tmpAtRisk <- makeTempVarName(lex, pre = "at.risk_") + set(lex, j = tmpAtRisk, value = TRUE) + survScale <- NULL + + + if (length(aggScales) > 0) { + cutTime <- proc.time() + ## "at.risk" counts subjects at risk in the beginning of the survival + ## time scale interval. + survScale <- aggScales[length(aggScales)] + lex[, c(tmpAtRisk) := lex[[survScale]] %in% breaks[[survScale]] ] + catAggScales <- paste0("'", aggScales, "'", collapse = ", ") + if (verbose) { + cat("Following time scales mentioned in by argument and will be", + "categorized into intervals (defined by breaks in object", + "attributes) for aggregation:", catAggScales, "\n") + } + + ## NEW METHOD: use a copy of lex and just modify in place. + + for (sc in aggScales) { + set(lex, j = sc, value = cutLow(lex[[sc]], breaks = breaks[[sc]])) + } + + if (verbose) cat("Time taken by cut()'ting time scales: ", timetaken(cutTime), "\n") + } + + othVars <- setdiff(av, aggScales) + if (verbose && length(othVars) > 0) { + catOthVars <- paste0("'", othVars, "'", collapse = ", ") + cat("Detected the following non-time-scale variables to be utilized in aggregating:", catOthVars, "\n") + } + + ## eval by ------------------------------------------------------------------- + ## NOTE: needed to eval by AFTER cutting time scales! + by <- evalPopArg(data = lex, arg = ags, DT = TRUE, enclos = PF, recursive = TRUE) + byNames <- names(by) + + ## computing pyrs ------------------------------------------------------------ + ## final step in determining at.risk: + ## a lex.id is at.risk only once per by-level + pyrsTime <- proc.time() + vdt <- data.table(pyrs = lex$lex.dur, at.risk = lex[[tmpAtRisk]], + lex.id = lex$lex.id) + pyrs <- vdt[, .(pyrs = sum(pyrs), + at.risk = sum(!duplicated(lex.id) & at.risk)), + keyby = by] + setDT(pyrs) + + rm(vdt) + sumNames <- NULL + if (sumType != "NULL") { + if (sumType == "character") { + sumNames <- evalPopArg(lex, sumSub, n = 1L, DT = FALSE, recursive = TRUE, enclos = PF) + sum.values <- lex[, lapply(.SD, sum), keyby = by, .SDcols = c(sumNames)] + } else { + sum.values <- evalPopArg(lex, sumSub, n = 1L, enclos = PF) + sumNames <- names(sum.values) + sumTmpNames <- makeTempVarName(lex, pre = sumNames) + set(lex, j = sumTmpNames, value = sum.values) + sum.values <- lex[, lapply(.SD, sum), keyby = by, .SDcols = sumTmpNames] + setnames(sum.values, sumTmpNames, sumNames) + setcolsnull(lex, sumTmpNames) + } + + setDT(sum.values) + pyrs <- merge(pyrs, sum.values, all = TRUE) + rm(sum.values) + } + + + if (verbose) cat("Time taken by aggregating pyrs: ", timetaken(pyrsTime), "\n") + + valVars <- setdiff(names(pyrs), byNames) ## includes pyrs and anything created by sum + + pyrs[is.na(pyrs), pyrs := 0] + pyrs <- pyrs[pyrs > 0] + + aggPyrs <- pyrs[, sum(pyrs)] + lexPyrs <- sum(lex.orig$lex.dur[subset]) + pyrsDiff <- aggPyrs - lexPyrs + if (!isTRUE(all.equal(aggPyrs, lexPyrs, scale = NULL))) { + warning("Found discrepancy of ", abs(round(pyrsDiff, 4)), " ", + "in total aggregated pyrs compared to ", + "sum(lex$lex.dur); compare results by hand and make sure ", + "settings are right \n") + } + rm(subset, aggPyrs, lexPyrs) + + ## cartesian output ---------------------------------------------------------- + if (type == "full") { + carTime <- proc.time() + + varsUsingScales <- NULL + + ## which variables used one time scale? and which one? + ## will only be used in cartesian stuff. + if (argType == "character") { + varsUsingScales <- intersect(by, aggScales) + whScaleUsed <- varsUsingScales + } else if (argType != "NULL") { + ## note: ags a substitute()'d list at this point always if not char + whScaleUsed <- lapply(ags[-1], function(x) intersect(all.vars(x), aggScales)) + ## only one time scale should be used in a variable! + oneScaleTest <- any(sapply(whScaleUsed, function(x) length(x) > 1L)) + if (oneScaleTest) stop("Only one Lexis time scale can be used in any one variable in by argument!") + varsUsingScales <- byNames[sapply(whScaleUsed, function (x) length(x) == 1L)] + whScaleUsed <- unlist(whScaleUsed) + } + + ceejay <- lapply(by, function(x) if (is.factor(x)) levels(x) else sort(unique(x))) + if (length(aggScales) > 0) { + ## which variables in ceejay used the Lexis time scales from lex? + + ceejay[varsUsingScales] <- lapply(breaks[whScaleUsed], function(x) x[-length(x)]) + } + + ceejay <- do.call(CJ, ceejay) + setkeyv(ceejay, byNames) + setkeyv(pyrs, byNames) + + pyrs <- pyrs[ceejay] + rm(ceejay) + + if (verbose) cat("Time taken by making aggregated data large in the cartesian product sense: ", timetaken(carTime), "\n") + } + + + ## computing events ---------------------------------------------------------- + + transTime <- proc.time() + + if (is.null(by) || (is.data.table(by) && nrow(by) == 0L)) { + + by <- quote(list(lex.Cst, lex.Xst)) + + } else { + for (var in c("lex.Cst", "lex.Xst")) { + set(by, j = var, value = lex[[var]]) + } + } + + + ## NOTE: this will ensure correct detection of censorings: + ## observations cut short by e.g. period window's edge + ## will be considered a censoring if the breaks along that time scale + ## are not passed to detectEvents (assuming the survival time scale is + ## used in by). If no time scale mentioned in by, then all endings + ## of observations are either censorings or events. + detBr <- breaks[survScale] + if (!length(survScale)) detBr <- NULL + hasEvent <- detectEvents(lex, breaks = detBr, by = "lex.id") %in% 1:2 + ## is language if user supplied by = NULL + if (!is.language(by)) by <- by[hasEvent] + + trans <- lex[hasEvent, list(obs = .N), keyby = by] + + rm(by, lex) + + + if (verbose) cat("Time taken by aggregating events: ", timetaken(transTime), "\n") + + ## casting & merging --------------------------------------------------------- + + mergeTime <- proc.time() + setDT(trans) + setDT(pyrs) + + ## tmpTr to be used in casting + tmpTr <- makeTempVarName(trans, pre = "trans_") + trans[, c(tmpTr) := paste0("from", lex.Cst, "to", lex.Xst)] + transitions <- sort(unique(trans[[tmpTr]])) + trans[, c("lex.Cst", "lex.Xst") := NULL] + + ## note: need tmpDum if by = NULL for correct casting & merging + tmpDum <- makeTempVarName(trans) + byNames <- c(byNames, tmpDum) + byNames <- setdiff(byNames, c("lex.Cst", "lex.Xst")) + trans[, c(tmpDum) := 1L] + pyrs[, c(tmpDum) := 1L] + + valVars <- unique(c(valVars, transitions)) + + trans <- cast_simple(trans, rows = byNames, columns = tmpTr, values = "obs") + + setkeyv(trans, NULL); setkeyv(pyrs, NULL) ## dcast.data.table seems to keep key but row order may be funky; this avoids a warning + setkeyv(trans, byNames); setkeyv(pyrs, byNames) + trans <- trans[pyrs]; rm(pyrs) + + trans[, c(tmpDum) := NULL] + byNames <- setdiff(byNames, tmpDum) + setcolorder(trans, c(byNames, valVars)) + + if (verbose) cat("Time taken by merging pyrs & transitions: ", timetaken(mergeTime), "\n") + + if (length(valVars) > 0L) { + trans[, c(valVars) := lapply(.SD, function(x) { + x[is.na(x)] <- 0 + x + }), .SDcols = c(valVars)] + } + + + ## final touch --------------------------------------------------------------- + trans <- data.table(trans) + setaggre(trans, values = c("pyrs", "at.risk", transitions, sumNames), + by = byNames, breaks = breaks) + if (!return_DT()) setDFpe(trans) + if (verbose) cat("Time taken by aggre(): ", timetaken(allTime), "\n") + + + + trans[] +} + + + + + + + + + + + + diff --git a/R/data_document.R b/R/data_document.R index 5bedb4d..1ec1f10 100644 --- a/R/data_document.R +++ b/R/data_document.R @@ -1,174 +1,174 @@ - - - -# sire - simulated survival data ------------------------------------------ - -#' sire - a simulated cohort of Finnish female rectal cancer patients -#' -#' \code{sire} is a simulated cohort pertaining female Finnish rectal cancer patients -#' diagnosed between 1993-2012. Instead of actual original dates, the dates are masked -#' via modest randomization within several time windows. -#' -#' The closing date for the pertinent data was 2012-12-31, meaning status information was -#' available only up to that point --- hence the maximum possible \code{ex_date} is \code{2012-12-31}. -#' -#' @source The Finnish Cancer Registry -#' @format data.table with columns -#' \itemize{ -#' \item sex - gender of the patient (1 = female) -#' \item bi_date - date of birth -#' \item dg_date - date of cancer diagnosis -#' \item ex_date - date of exit from follow-up (death or censoring) -#' \item status - status of the person at exit; 0 alive; 1 dead due to pertinent cancer; 2 dead due to other causes -#' \item dg_age - age at diagnosis expressed as fractional years -#' } -#' @author Karri Seppa -#' @name sire -#' @family popEpi data -#' @family survival data -NULL - - -# sibr - simulated survival data ------------------------------------------ - -#' sibr - a simulated cohort of Finnish female breast cancer patients -#' -#' \code{sibr} is a simulated cohort pertaining female Finnish breast cancer patients -#' diagnosed between 1993-2012. Instead of actual original dates, the dates are masked -#' via modest randomization within several time windows. The dataset is additionally -#' a random sample of 10 000 cases from the pertaining time window. -#' -#' The closing date for the pertinent data was 2012-12-31, meaning status information was -#' available only up to that point --- hence the maximum possible \code{ex_date} is \code{2012-12-31}. -#' -#' @source The Finnish Cancer Registry -#' @format data.table with columns -#' \itemize{ -#' \item sex - gender of the patient (1 = female) -#' \item bi_date - date of birth -#' \item dg_date - date of cancer diagnosis -#' \item ex_date - date of exit from follow-up (death or censoring) -#' \item status - status of the person at exit; 0 alive; 1 dead due to pertinent cancer; 2 dead due to other causes -#' \item dg_age - age at diagnosis expressed as fractional years -#' } -#' @author Karri Seppa -#' @name sibr -#' @family popEpi data -#' @family survival data -NULL - - - - -# International standard weights ------------------------------------------ -#' Age standardisation weights from the ICSS scheme. -#' -#' Contains three sets age-standardisation weights for age-standardized survival (net, relative or observed). -#' -#' -#' @source -#' \href{http://seer.cancer.gov/stdpopulations/survival.html}{ICSS weights (US National Cancer Institute website)} -#' -#' Corazziari, Isabella, Mike Quinn, and Riccardo Capocaccia. "Standard cancer patient population for age standardising survival ratios." European Journal of Cancer 40.15 (2004): 2307-2316. -#' @format data.table with columns -#' \itemize{ -#' \item age - lower bound of the age group -#' \item ICSS1 - first set of weights, sums to 100 000 -#' \item ICSS2 - second set of weights, sums to 100 000 -#' \item ICSS3 - third set of weights, sums to 100 000 -#' } -#' @name ICSS -#' @family popEpi data -#' @family weights -#' @examples -#' ## aggregate weights to a subset of age groups -#' data(ICSS) -#' cut <- c(0, 30, 50, 70, Inf) -#' agegr <- cut(ICSS$age, cut, right = FALSE) -#' aggregate(ICSS1~agegr, data = ICSS, FUN = sum) -NULL - - -# popmort ------------------------------------------------------------------- -#' Population mortality rates in Finland 1951 - 2013 in 101 age groups and -#' by gender -#' -#' -#' @source Statistics Finland -#' @format \code{data.table} with columns -#' \itemize{ -#' \item \code{sex} gender coded as male, female (0, 1) -#' \item \code{year} calendar year -#' \item \code{agegroup} - coded 0 to 100; one-year age groups -#' \item \code{haz} the average population mortality rate per person year -#' } -#' @name popmort -#' @family popEpi data -#' @seealso \code{\link{pophaz}} -NULL - - - - -# stdpop18 ------------------------------------------------------------------ - -#' Standard populations from 2000: world, Europe and Nordic. -#' -#' World, European, and Nordic standard populations by 18 age categories. -#' Sums to 100000. -#' -#' @source Nordcan, 2000 -#' @format data.table with columns -#' \itemize{ -#' \item \code{agegroup}, age group in 18 categories (character) -#' \item \code{world}, World 2000 standard population (numeric) -#' \item \code{europe}, European standard population (numeric) -#' \item \code{nordic}, Nordic standard population (numeric) -#' } -#' @name stdpop18 -#' @family popEpi data -#' @family weights -NULL - - -# stdpop101 ----------------------------------------------------------------- - -#' World standard population by 1 year age groups from 1 to 101. Sums to 100 000. -#' -#' -#' @source Standard population is from: -#' \href{http://seer.cancer.gov/stdpopulations/stdpop.singleages.html}{world standard population "101of1"} -#' -#' @format data.table with columns -#' \itemize{ -#' \item \code{world_std} weight that sums to 100000 (numeric) -#' \item \code{agegroup} age group from 1 to 101 (numeric) -#' } -#' @name stdpop101 -#' @family popEpi data -#' @family weights -NULL - - - - - -# meanpop_fi ------------------------------------------------------------------- -#' Mean population counts in Finland year, sex, and age group. -#' -#' @source Statistics Finland -#' @format \code{data.table} with columns -#' \itemize{ -#' \item \code{sex} gender coded as male, female (0, 1) -#' \item \code{year} calendar year 1981-2016 -#' \item \code{agegroup} - coded 0 to 100; one-year age groups -#' \item \code{meanpop} the mean population count; that is, the mean of the -#' annual population counts of two consecutive years; e.g. for 1990 -#' \code{meanpop} is the mean of population counts for 1990 and 1991 -#' (counted at 1990-01-01 and 1991-01-01, respectively) -#' } -#' @name meanpop_fi -#' @family popEpi data -#' @seealso \href{http://pxnet2.stat.fi/PXWeb/pxweb/fi/StatFin/StatFin__vrm__vaerak/071_vaerak_tau_109.px/?rxid=81efcb98-00c6-46ba-9f8f-8bc6f110895f}{Table on the web} -NULL - + + + +# sire - simulated survival data ------------------------------------------ + +#' sire - a simulated cohort of Finnish female rectal cancer patients +#' +#' \code{sire} is a simulated cohort pertaining female Finnish rectal cancer patients +#' diagnosed between 1993-2012. Instead of actual original dates, the dates are masked +#' via modest randomization within several time windows. +#' +#' The closing date for the pertinent data was 2012-12-31, meaning status information was +#' available only up to that point --- hence the maximum possible \code{ex_date} is \code{2012-12-31}. +#' +#' @source The Finnish Cancer Registry +#' @format data.table with columns +#' \itemize{ +#' \item sex - gender of the patient (1 = female) +#' \item bi_date - date of birth +#' \item dg_date - date of cancer diagnosis +#' \item ex_date - date of exit from follow-up (death or censoring) +#' \item status - status of the person at exit; 0 alive; 1 dead due to pertinent cancer; 2 dead due to other causes +#' \item dg_age - age at diagnosis expressed as fractional years +#' } +#' @author Karri Seppa +#' @name sire +#' @family popEpi data +#' @family survival data +NULL + + +# sibr - simulated survival data ------------------------------------------ + +#' sibr - a simulated cohort of Finnish female breast cancer patients +#' +#' \code{sibr} is a simulated cohort pertaining female Finnish breast cancer patients +#' diagnosed between 1993-2012. Instead of actual original dates, the dates are masked +#' via modest randomization within several time windows. The dataset is additionally +#' a random sample of 10 000 cases from the pertaining time window. +#' +#' The closing date for the pertinent data was 2012-12-31, meaning status information was +#' available only up to that point --- hence the maximum possible \code{ex_date} is \code{2012-12-31}. +#' +#' @source The Finnish Cancer Registry +#' @format data.table with columns +#' \itemize{ +#' \item sex - gender of the patient (1 = female) +#' \item bi_date - date of birth +#' \item dg_date - date of cancer diagnosis +#' \item ex_date - date of exit from follow-up (death or censoring) +#' \item status - status of the person at exit; 0 alive; 1 dead due to pertinent cancer; 2 dead due to other causes +#' \item dg_age - age at diagnosis expressed as fractional years +#' } +#' @author Karri Seppa +#' @name sibr +#' @family popEpi data +#' @family survival data +NULL + + + + +# International standard weights ------------------------------------------ +#' Age standardisation weights from the ICSS scheme. +#' +#' Contains three sets age-standardisation weights for age-standardized survival (net, relative or observed). +#' +#' +#' @source +#' \href{http://seer.cancer.gov/stdpopulations/survival.html}{ICSS weights (US National Cancer Institute website)} +#' +#' Corazziari, Isabella, Mike Quinn, and Riccardo Capocaccia. "Standard cancer patient population for age standardising survival ratios." European Journal of Cancer 40.15 (2004): 2307-2316. +#' @format data.table with columns +#' \itemize{ +#' \item age - lower bound of the age group +#' \item ICSS1 - first set of weights, sums to 100 000 +#' \item ICSS2 - second set of weights, sums to 100 000 +#' \item ICSS3 - third set of weights, sums to 100 000 +#' } +#' @name ICSS +#' @family popEpi data +#' @family weights +#' @examples +#' ## aggregate weights to a subset of age groups +#' data(ICSS) +#' cut <- c(0, 30, 50, 70, Inf) +#' agegr <- cut(ICSS$age, cut, right = FALSE) +#' aggregate(ICSS1~agegr, data = ICSS, FUN = sum) +NULL + + +# popmort ------------------------------------------------------------------- +#' Population mortality rates in Finland 1951 - 2013 in 101 age groups and +#' by gender +#' +#' +#' @source Statistics Finland +#' @format \code{data.table} with columns +#' \itemize{ +#' \item \code{sex} gender coded as male, female (0, 1) +#' \item \code{year} calendar year +#' \item \code{agegroup} - coded 0 to 100; one-year age groups +#' \item \code{haz} the average population mortality rate per person year +#' } +#' @name popmort +#' @family popEpi data +#' @seealso \code{\link{pophaz}} +NULL + + + + +# stdpop18 ------------------------------------------------------------------ + +#' Standard populations from 2000: world, Europe and Nordic. +#' +#' World, European, and Nordic standard populations by 18 age categories. +#' Sums to 100000. +#' +#' @source Nordcan, 2000 +#' @format data.table with columns +#' \itemize{ +#' \item \code{agegroup}, age group in 18 categories (character) +#' \item \code{world}, World 2000 standard population (numeric) +#' \item \code{europe}, European standard population (numeric) +#' \item \code{nordic}, Nordic standard population (numeric) +#' } +#' @name stdpop18 +#' @family popEpi data +#' @family weights +NULL + + +# stdpop101 ----------------------------------------------------------------- + +#' World standard population by 1 year age groups from 1 to 101. Sums to 100 000. +#' +#' +#' @source Standard population is from: +#' \href{http://seer.cancer.gov/stdpopulations/stdpop.singleages.html}{world standard population "101of1"} +#' +#' @format data.table with columns +#' \itemize{ +#' \item \code{world_std} weight that sums to 100000 (numeric) +#' \item \code{agegroup} age group from 1 to 101 (numeric) +#' } +#' @name stdpop101 +#' @family popEpi data +#' @family weights +NULL + + + + + +# meanpop_fi ------------------------------------------------------------------- +#' Mean population counts in Finland year, sex, and age group. +#' +#' @source Statistics Finland +#' @format \code{data.table} with columns +#' \itemize{ +#' \item \code{sex} gender coded as male, female (0, 1) +#' \item \code{year} calendar year 1981-2016 +#' \item \code{agegroup} - coded 0 to 100; one-year age groups +#' \item \code{meanpop} the mean population count; that is, the mean of the +#' annual population counts of two consecutive years; e.g. for 1990 +#' \code{meanpop} is the mean of population counts for 1990 and 1991 +#' (counted at 1990-01-01 and 1991-01-01, respectively) +#' } +#' @name meanpop_fi +#' @family popEpi data +#' @seealso \href{http://pxnet2.stat.fi/PXWeb/pxweb/fi/StatFin/StatFin__vrm__vaerak/071_vaerak_tau_109.px/?rxid=81efcb98-00c6-46ba-9f8f-8bc6f110895f}{Table on the web} +NULL + diff --git a/R/direct_adjusting.R b/R/direct_adjusting.R index 1275b12..9fa438e 100644 --- a/R/direct_adjusting.R +++ b/R/direct_adjusting.R @@ -1,157 +1,157 @@ - - - -#' @title Direct Adjusting in \pkg{popEpi} Using Weights -#' @author Joonas Miettinen -#' @name direct_standardization -#' @aliases direct_adjusting -#' @description -#' -#' Several functions in \pkg{popEpi} have support for direct standardization -#' of estimates. This document explains the usage of weighting with those -#' functions. -#' -#' @details -#' -#' Direct standardization is performed by computing estimates of -#' \code{E} -#' by the set of adjusting variables \code{A}, to which a set of weights -#' \code{W} is applicable. The weighted average over \code{A} is then the -#' direct-adjusted estimate of \code{E} (\code{E*}). -#' -#' To enable both quick and easy as well as more rigorous usage of direct -#' standardization with weights, the weights arguments in \pkg{popEpi} -#' can be supplied in several ways. Ability to use the different -#' ways depends on the number of adjusting variables. -#' -#' The weights are always handled internally to sum to 1, so they do not -#' need to be scaled in this manner when they are supplied. E.g. -#' counts of subjects in strata may be passed. -#' -#' @section Basic usage - one adjusting variable: -#' -#' In the simple case where we are adjusting by only one variable -#' (e.g. by age group), one can simply supply a vector of weights: -#' -#' \code{FUN(weights = c(0.1, 0.25, 0.25, 0.2, 0.2))} -#' -#' which may be stored in advance: -#' -#' \code{w <- c(0.1, 0.25, 0.25, 0.2, 0.2)} -#' -#' \code{FUN(weights = w)} -#' -#' The order of the weights matters. \pkg{popEpi} functions with direct -#' adjusting enabled match the supplied weights to the adjusting variables -#' as follows: If the adjusting variable is a \code{factor}, the order -#' of the levels is used. Otherwise, the alphabetic order of the unique -#' values is used (try \code{sort} to see how it works). For clarity -#' and certainty we recommend using \code{factor} or \code{numeric} variables -#' when possible. \code{character} variables should be avoided: to see why, -#' try \code{sort(15:9)} and \code{sort(as.character(15:9))}. -#' -#' It is also possible to supply a \code{character} string corresponding -#' to one of the age group standardization schemes integrated into \pkg{popEpi}: -#' -#' \itemize{ -#' \item \code{'europe_1976_18of5'} - European std. population (1976), 18 age groups -#' \item \code{'nordic_2000_18of5'} - Nordic std. population (2000), 18 age groups -#' \item \code{'world_1966_18of5'} - world standard (1966), 18 age groups -#' \item \code{'world_2000_18of5'} - world standard (2000), 18 age groups -#' \item \code{'world_2000_20of5'} - world standard (2000), 20 age groups -#' \item \code{'world_2000_101of1'} - world standard (2000), 101 age groups -#' } -#' -#' Additionally, \code{\link{ICSS}} contains international weights used in -#' cancer survival analysis, but they are not currently usable by passing -#' a string to \code{weights} and must be supplied by hand. -#' -#' You may also supply \code{weights = "internal"} to use internally -#' computed weights, i.e. usually simply the counts of subjects / person-time -#' experienced in each stratum. E.g. -#' -#' \code{FUN(weights = "world_2000_18of5")} -#' -#' will use the world standard population from 2000 as -#' weights for 18 age groups, that your adjusting variable is -#' assumed to contain. The adjusting variable must be coded in this case as -#' a numeric variable containing \code{1:18} or as a \code{factor} with -#' 18 levels (coded from the youngest to the oldest age group). -#' -#' @section More than one adjusting variable: -#' -#' In the case that you employ more than one adjusting variable, separate -#' weights should be passed to match to the levels of the different adjusting -#' variables. When supplied correctly, "grand" weights are formed based on -#' the variable-specific weights by multiplying over the variable-specific -#' weights (e.g. if men have \code{w = 0.5} and the age group 0-4 has -#' \code{w = 0.1}, the "grand" weight for men aged 0-4 is \code{0.5*0.1}). -#' The "grand" weights are then used for adjusting after ensuring they -#' sum to one. -#' -#' When using multiple adjusting variables, you -#' are allowed to pass either a named \code{list} of -#' weights or a \code{data.frame} of weights. E.g. -#' -#' \code{WL <- list(agegroup = age_w, sex = sex_w)} -#' -#' \code{FUN(weights = WL)} -#' -#' where \code{age_w} and \code{sex_w} are numeric vectors. Given the -#' conditions explained in the previous section are satisfied, you may also do -#' e.g. -#' -#' \code{WL <- list(agegroup = "world_2000_18of", sex = sex_w)} -#' -#' \code{FUN(weights = WL)} -#' -#' and the world standard pop is used as weights for the age groups as outlined -#' in the previous section. -#' -#' Sometimes using a \code{data.frame} can be clearer (and it is fool-proof -#' as well). To do this, form a \code{data.frame} that repeats the levels -#' of your adjusting variables by each level of every other adjusting variable, -#' and assign the weights as a column named \code{"weights"}. E.g. -#' -#' \code{wdf <- data.frame(sex = rep(0:1, each = 18), agegroup = rep(1:18, 2))} -#' -#' \code{wdf$weights <- rbinom(36, size = 100, prob = 0.25)} -#' -#' \code{FUN(weights = wdf)} -#' -#' If you want to use the counts of subjects in strata as the weights, -#' one way to do this is by e.g. -#' -#' \code{wdf <- as.data.frame(x$V1, x$V2, x$V3)} -#' \code{names(wdf) <- c("V1", "V2", "V3", "weights")} -#' -#' @family weights -#' @family popEpi_argument -#' -#' @references -#' Source of the Nordic standard population in 5-year age groups -#' (also contains European & 1966 world standards): -#' \url{http://www-dep.iarc.fr/NORDCAN/english/glossary.htm} -#' -#' Source of the 1976 European standard population: -#' -#' Waterhouse, J.,Muir, C.S.,Correa, P.,Powell, J., eds (1976). -#' Cancer Incidence in Five Continents, Vol. III. -#' IARC Scientific Publications, No. 15, Lyon, IARC -#' -#' A comparison of the 1966 vs. 2000 world standard populations in 5-year age groups: -#' \url{http://www3.ha.org.hk/cancereg/e_asr.asp} -#' -#' Source of 2000 world standard population in 1-year age groups: -#' \url{http://seer.cancer.gov/stdpopulations/stdpop.singleages.html} - -NULL - - - - - - - - - + + + +#' @title Direct Adjusting in \pkg{popEpi} Using Weights +#' @author Joonas Miettinen +#' @name direct_standardization +#' @aliases direct_adjusting +#' @description +#' +#' Several functions in \pkg{popEpi} have support for direct standardization +#' of estimates. This document explains the usage of weighting with those +#' functions. +#' +#' @details +#' +#' Direct standardization is performed by computing estimates of +#' \code{E} +#' by the set of adjusting variables \code{A}, to which a set of weights +#' \code{W} is applicable. The weighted average over \code{A} is then the +#' direct-adjusted estimate of \code{E} (\code{E*}). +#' +#' To enable both quick and easy as well as more rigorous usage of direct +#' standardization with weights, the weights arguments in \pkg{popEpi} +#' can be supplied in several ways. Ability to use the different +#' ways depends on the number of adjusting variables. +#' +#' The weights are always handled internally to sum to 1, so they do not +#' need to be scaled in this manner when they are supplied. E.g. +#' counts of subjects in strata may be passed. +#' +#' @section Basic usage - one adjusting variable: +#' +#' In the simple case where we are adjusting by only one variable +#' (e.g. by age group), one can simply supply a vector of weights: +#' +#' \code{FUN(weights = c(0.1, 0.25, 0.25, 0.2, 0.2))} +#' +#' which may be stored in advance: +#' +#' \code{w <- c(0.1, 0.25, 0.25, 0.2, 0.2)} +#' +#' \code{FUN(weights = w)} +#' +#' The order of the weights matters. \pkg{popEpi} functions with direct +#' adjusting enabled match the supplied weights to the adjusting variables +#' as follows: If the adjusting variable is a \code{factor}, the order +#' of the levels is used. Otherwise, the alphabetic order of the unique +#' values is used (try \code{sort} to see how it works). For clarity +#' and certainty we recommend using \code{factor} or \code{numeric} variables +#' when possible. \code{character} variables should be avoided: to see why, +#' try \code{sort(15:9)} and \code{sort(as.character(15:9))}. +#' +#' It is also possible to supply a \code{character} string corresponding +#' to one of the age group standardization schemes integrated into \pkg{popEpi}: +#' +#' \itemize{ +#' \item \code{'europe_1976_18of5'} - European std. population (1976), 18 age groups +#' \item \code{'nordic_2000_18of5'} - Nordic std. population (2000), 18 age groups +#' \item \code{'world_1966_18of5'} - world standard (1966), 18 age groups +#' \item \code{'world_2000_18of5'} - world standard (2000), 18 age groups +#' \item \code{'world_2000_20of5'} - world standard (2000), 20 age groups +#' \item \code{'world_2000_101of1'} - world standard (2000), 101 age groups +#' } +#' +#' Additionally, \code{\link{ICSS}} contains international weights used in +#' cancer survival analysis, but they are not currently usable by passing +#' a string to \code{weights} and must be supplied by hand. +#' +#' You may also supply \code{weights = "internal"} to use internally +#' computed weights, i.e. usually simply the counts of subjects / person-time +#' experienced in each stratum. E.g. +#' +#' \code{FUN(weights = "world_2000_18of5")} +#' +#' will use the world standard population from 2000 as +#' weights for 18 age groups, that your adjusting variable is +#' assumed to contain. The adjusting variable must be coded in this case as +#' a numeric variable containing \code{1:18} or as a \code{factor} with +#' 18 levels (coded from the youngest to the oldest age group). +#' +#' @section More than one adjusting variable: +#' +#' In the case that you employ more than one adjusting variable, separate +#' weights should be passed to match to the levels of the different adjusting +#' variables. When supplied correctly, "grand" weights are formed based on +#' the variable-specific weights by multiplying over the variable-specific +#' weights (e.g. if men have \code{w = 0.5} and the age group 0-4 has +#' \code{w = 0.1}, the "grand" weight for men aged 0-4 is \code{0.5*0.1}). +#' The "grand" weights are then used for adjusting after ensuring they +#' sum to one. +#' +#' When using multiple adjusting variables, you +#' are allowed to pass either a named \code{list} of +#' weights or a \code{data.frame} of weights. E.g. +#' +#' \code{WL <- list(agegroup = age_w, sex = sex_w)} +#' +#' \code{FUN(weights = WL)} +#' +#' where \code{age_w} and \code{sex_w} are numeric vectors. Given the +#' conditions explained in the previous section are satisfied, you may also do +#' e.g. +#' +#' \code{WL <- list(agegroup = "world_2000_18of", sex = sex_w)} +#' +#' \code{FUN(weights = WL)} +#' +#' and the world standard pop is used as weights for the age groups as outlined +#' in the previous section. +#' +#' Sometimes using a \code{data.frame} can be clearer (and it is fool-proof +#' as well). To do this, form a \code{data.frame} that repeats the levels +#' of your adjusting variables by each level of every other adjusting variable, +#' and assign the weights as a column named \code{"weights"}. E.g. +#' +#' \code{wdf <- data.frame(sex = rep(0:1, each = 18), agegroup = rep(1:18, 2))} +#' +#' \code{wdf$weights <- rbinom(36, size = 100, prob = 0.25)} +#' +#' \code{FUN(weights = wdf)} +#' +#' If you want to use the counts of subjects in strata as the weights, +#' one way to do this is by e.g. +#' +#' \code{wdf <- as.data.frame(x$V1, x$V2, x$V3)} +#' \code{names(wdf) <- c("V1", "V2", "V3", "weights")} +#' +#' @family weights +#' @family popEpi argument evaluation docs +#' +#' @references +#' Source of the Nordic standard population in 5-year age groups +#' (also contains European & 1966 world standards): +#' \url{http://www-dep.iarc.fr/NORDCAN/english/glossary.htm} +#' +#' Source of the 1976 European standard population: +#' +#' Waterhouse, J.,Muir, C.S.,Correa, P.,Powell, J., eds (1976). +#' Cancer Incidence in Five Continents, Vol. III. +#' IARC Scientific Publications, No. 15, Lyon, IARC +#' +#' A comparison of the 1966 vs. 2000 world standard populations in 5-year age groups: +#' \url{http://www3.ha.org.hk/cancereg/e_asr.asp} +#' +#' Source of 2000 world standard population in 1-year age groups: +#' \url{http://seer.cancer.gov/stdpopulations/stdpop.singleages.html} + +NULL + + + + + + + + + diff --git a/R/evaluation.R b/R/evaluation.R index e691382..1641389 100644 --- a/R/evaluation.R +++ b/R/evaluation.R @@ -1,939 +1,975 @@ - - -is_expression <- function(e) { - - ifelse(any(c("call", "name") %in% class(e)), TRUE, FALSE) - -} - -is_list_expression <- function(e) { - - ifelse(is_expression(e) && deparse(as.list(e)[[1]]) == "list", TRUE, FALSE) - -} - -is_dollar_expression <- function(e) { - - ifelse(is_expression(e) && deparse(as.list(e)[[1]]) == "$", TRUE, FALSE) - -} - - -is_variable <- function(x) { - - varModes <- c("numeric", "complex", "logical", "character", "raw") - - ifelse(mode(x) %in% varModes, TRUE, FALSE) - -} - - -evalArg <- function(arg, env, enc, ...) { - UseMethod("evalArg") -} - - -evalArg.default <- function(arg, env, enc) { - if (is.list(arg)) { - arg <- as.list(arg) - } else if (is_variable(arg)) { - l <- list(arg) - } - return(arg) -} - -evalArg.name <- function(arg, env, enc) { - d <- deparse(arg) - out <- try(get(d, envir = env, inherits = FALSE), silent = TRUE) - if (inherits(out, "try-error")) { - out <- try(get(d, envir = enc, inherits = FALSE), silent = TRUE) - } - if (inherits(out, "try-error")) { - stop("Could not find object ", d, ".") - } - out <- list(out) - names(out) <- d - out -} - -evalArg.call <- function(arg, env, enc) { - - out <- eval(arg, envir = env, enclos = enc) - - if (is.list(out)) { - out <- as.list(out) - } else if (is_variable(out)) { - out <- list(out) - names(out) <- paste0(deparse(arg), collapse = "") - } - out -} - -evalArg.character <- function(arg, env, enc) { - ## NOTE: enc unused - se <- substitute(env) - out <- lapply(arg, function(stri) { - try({ - get(stri, envir = as.environment(env), inherits = FALSE) - }, silent = TRUE) - }) - - notFound <- arg[sapply(out, inherits, "try-error")] - - if (length(notFound)) { - if (length(notFound) > 5) notFound <- notFound[1:5] - stop("Could not find object(s): ", - paste0(notFound, collapse = ", "), ".") - } - names(out) <- arg - out -} - -evalArg.formula <- function(arg, env, enc) { - - rhsl <- as.list(RHS2DT(arg, data = env, enclos = enc)) - - rhsl - -} - - -method_classes <- function(f) { - - stopifnot(is.character(f)) - e <- utils::methods(f) - e <- unlist(lapply(e, as.character)) - e <- unlist(lapply(e, sub, pattern = paste0(f, "."), replacement = "")) - setdiff(e, "default") - -} - - -do_evalPopArg <- function(arg, env, enc) { - eam <- method_classes("evalArg") - - ne <- list(env = env, enc = enc) - de <- list(env = enc, enc = baseenv()) - - r <- arg - tick <- 1L - while (any(class(r) %in% eam)) { - - envs <- if (is_dollar_expression(r)) de else ne - r <- evalArg(arg = r, env = envs$env, enc = envs$enc) - - tick <- tick + 1L - if (tick == 100L) stop("No result after 100 evaluations") - } - - r -} - - - -argType <- function(arg) { - - tl <- list("NULL" = "NULL", character = "character", - list = "call", formula = "formula", - expression = c("call", "name")) - - tl <- sapply(tl, function(ch) { - t <- tryCatch(inherits(arg, ch), - error = function(e) e, - warning = function(w) w) - isTRUE(t) - }) - if (!any(tl)) tl[names(tl) == "expression"] <- TRUE - if (tl["list"]) tl["list"] <- substr(deparse(arg), 1, 5) == "list(" - names(tl)[tl & !duplicated(tl)] - -} - - - - - -evalPopArg2 <- function(data, arg, enclos, DT = TRUE, - types = c("NULL","character", "list", "expression")) { - - allowed_types <- c("NULL", "character", "list", "expression", "formula") - types <- match.arg(types, allowed_types, - several.ok = TRUE) - if (!argType(arg) %in% types) { - stop("Supplied argument not allowed type. Current type: ", - argType(arg), ". Allowed types: ", - paste0(types, collapse = ", ")) - } - - l <- do_evalPopArg(arg = arg, env = data, enc = enclos) - l -} - - - - -evalPopArg <- function(data, arg, n = 1L, DT = TRUE, enclos = NULL, recursive = TRUE, types = c("NULL","character", "list", "expression"), naming = c("DT", "model")) { - ## arg: an unevaluated AND substitute()'d argument within a function, which may be - ## * an expression - ## * a list of expressions - ## * a character vector of variable names (in a given data set) - ## n: steps upstream as in parent.frame(n); 0L refers to calling environment - ## of evalPopArg, 1L to calling environment of e.g. sir which uses evalPopArg, etc. - ## hence n = 1L should be almost always the right way to go. - ## ALTERNATIVELY supply an environment by hand via enclos. - ## enclos will override n. - ## recursive: if TRUE, evals arg as many times as it is of type language. - ## output: - ## * vector as a result of an expression - ## * list as a result of a list - ## * character vector of names - ## OR with DT = TRUE, a data.table based on aforementioned results. - ## intention: output to be used in by argument of data.table. - ## a data.table output is directly usable in by. - ## if column names cannot be easily found, BV1, BV2, ... are imputed - ## for missing names (unrobustly: such names may already exist, resulting in duplicates) - - ## naming: DT style uses first element of all.names() where - ## a name has to be created; model style keeps the whole deparsed - ## expression. Only applied when DT = TRUE - naming <- match.arg(naming[1L], c("DT", "model")) - - ## types: allowed popArg types of arguments. - types <- match.arg(types, c("NULL","character", "list", "expression", "formula"), several.ok = TRUE) - - if (!is.null(enclos) && !is.environment(enclos)) { - stop("enclos must be NULL or an environment") - } - if (!is.environment(enclos)) enclos <- parent.frame(n + 1L) - - ## used data may change if expression uses dollar operator, hence - ## arg should not be evaluated within data but only its surroundings. - use_data <- data - use_enc <- enclos - dataNames <- names(data) - - if (uses_dollar(arg, data.names = dataNames)) { - use_data <- enclos - use_enc <- baseenv() - } - e <- eval(arg, envir = use_data, enclos = use_enc) - if (is.language(e) && !inherits(e, "formula")) { - if (!recursive) stop("arg is of type language after evaluating, and recursive = FALSE") - - tick <- 1L - while (is.language(e) && !inherits(e, "formula") && tick < 100L) { - arg <- e - use_data <- data - use_enc <- enclos - if (uses_dollar(arg, data.names = dataNames)) { - use_data <- enclos - use_enc <- baseenv() - } - e <- eval(arg, envir = use_data, enclos = use_enc) - tick <- tick + 1L - } - if (tick == 100L) stop("arg was of type language even after 100 evaluations. Something went wrong here...") - - - - } - argType <- "NULL" - if (is.list(e)) argType <- "list" else - if (is.character(e)) argType <- "character" else - if (mode(e) == "numeric" || is.vector(e) || is.factor(e)) argType <- "expression" else - if (inherits(e, "formula")) argType <- "formula" - - if (!argType %in% types) stop("popArg type of evaluated arg not one of the allowed types (set via argument types). Detected type: '", argType, "'. Allowed types: ", paste0("'", types, "'", collapse = ", ")) - - if (argType == "NULL") return(NULL) - - av <- all.vars(arg) - if (argType == "character") av <- e - - ## byNames: names of columns resulting from aggre argument, by which - ## pyrs and such are aggregated. same functionality - ## as in results seen in e.g.DT[, .N, by = list(factor(x), y, z = w)] ## factor, y, z - ## note: first object in ags with list or expression aggre is "list" - byNames <- NULL - - if (is.character(e)) byNames <- e - else if (argType == "list" && substr(paste0(deparse(arg)), 1, 5) == "list(") byNames <- sapply(arg[-1], function(x) all.names(x)[1]) - else if (argType == "expression") byNames <- all.names(arg)[1] - - badNames <- c("$", ":") - - byNames[byNames %in% badNames] <- paste0("BV", 1:length(byNames))[byNames %in% badNames] - - if (argType == "formula") { - arg <- e - use_data <- data - use_enc <- enclos - e <- RHS2DT(formula = e, data = use_data, enclos = use_enc) - if (ncol(e) == 0L || nrow(e) == 0L) e <- data.table() ## e.g. y ~ 1 - - } else if (is.character(e)) { - all_names_present(data, e) - if (DT) { - ## note: e contains variable names in character strings, - ## ergo fully named list & DT created - l <- lapply(e, function(x) data[[x]]) - setattr(l, "names", e) - setDT(l) - e <- l; rm(l) - } - } else if (is.list(e)) { - ## note: fully unnamed list has NULL names() - ## partially named list has some "" names - ne <- names(e) - - if (DT && any(sapply(e, is.null))) stop("at least one object in list arg is NULL; cannot form data.table with such list") - - if (is.null(ne)) ne <- rep("", length(e)) - - - wh_bad <- which(ne == "") - if (length(wh_bad) > 0) { - if (is.null(byNames)) { - byNames <- paste0("BV", 1:length(e)) - } - - ne[wh_bad] <- byNames[wh_bad] - setattr(e, "names", ne) - } - - if (DT) { - ## NOTE: used to be setDT, but length of different elements - ## in list may differ, which as.data.table handles correctly - e <- as.data.table(e) - } - } else if (mode(e) == "numeric" || is.vector(e) || is.factor(e)) { - ## is e.g. a numeric vector or a factor - if (DT) { - e <- data.table(V1 = e) - setnames(e, 1, byNames) - } - } - - ## NOTE: e may be of type language at this point if arg was double-quoted - ## and recursive = FALSE - - if (DT) { - setDT(e) - setattr(e, "all.vars", av) - setattr(e, "quoted.arg", arg) - setattr(e, "arg.type", argType) - if (naming == "model" && ncol(e) > 0L) setnames(e, 1:ncol(e), popArg2ModelNames(arg, type = argType)) - } - e -} - - -popArgType <- function(arg, data = NULL, n = 1L, enclos = NULL, recursive = TRUE) { - ## input: a substitute()'d expression / argument - ## NOTE: recursive useful when arg might be quoted twice and want the eventual - ## result; need to supply data for it though - ## output: type of thingie that was substitute()'d - ## * list (of expressions) - ## * character string vector - ## * an expression (includes symbol) - av <- all.vars(arg, unique = TRUE) ## all variables - av <- setdiff(av, c("$", "T", "F")) - an <- all.names(arg, unique = TRUE) ## all variables and functions - af <- setdiff(an, av) ## all functions used - - a <- deparse(arg) - a <- paste0(a, collapse = "") ## lists may somehow produce length > 1 here - if (substr(a, 1, 5) == "list(") return("list") - if (a == "NULL") return("NULL") - ## detection of character arguments is not easy and should not be considered - ## fool proof since user may pass e.g. a vector of character strings as a - ## symbol, which can only really be interpreted as an expression - if (sum(grep('\\"', a)) && length(setdiff(af, "c")) == 0) return("character") - - if (is.data.frame(data)) { - if (is.symbol(arg) && a %in% names(data)) return("expression") - if (length(av) == 1L && av %in% names(data)) return("expression") - e <- eval(arg, envir = data[1:min(nrow(data), 20L), ], - enclos = if (is.environment(enclos)) enclos else parent.frame(n + 1L)) - if (inherits(e, "formula")) return("formula") - if (is.null(e)) return("NULL") - if (is.list(e)) return("list") - if (is.character(e) && all(e %in% names(data))) return("character") - if (is.vector(e) || is.factor(e)) return("expression") - - if (recursive && is.language(e)) return(popArgType(e, data = data, n = n + 1L, enclos = enclos)) - } - - "expression" - -} -popArg2ModelNames <- function(arg, type) { - ## INTENTION: given a quoted/substituted expression, - ## digs out the expression(s) creating a/multiple column(s) - ## and returns the deparsed expression(s) to be used as names - ## of columns the same way that models such as lm() display - ## the names of expressions used within formula - - ## some exceptions - if (is.data.frame(arg)) return(names(arg)) - if (is.character(arg)) return(arg) - - type <- match.arg(type[1L], c("NULL", "character", "list", "expression", "formula")) - - lang <- NULL - lang <- try(is.language(arg) || inherits(arg, "formula"), silent = TRUE) - - - if (inherits(lang, "try-error") || !lang) stop("arg must be a quoted or substituted expression or a formula. Error message: ", lang, ". type of arg: ", typeof(arg), ". Class: ", class(arg), ". Mode: ", mode(arg), ".") - - d <- oneWhitespace(paste0(deparse(arg))) - - if (type == "expression") return(d) else - if (type == "NULL") return(NULL) else - if (type == "character") return(eval(arg)) else - if (type == "list") { - d <- substr(d, 6, nchar(d)-1L) ## removes "list(" and ")" - d <- strsplit(d, ", ") - return(unlist(d)) - } else if (type == "formula") { - arg <- eval(arg) - d <- names(RHS2list(arg)) - if (length(d) == 0L) return(NULL) ## e.g. y ~ 1 - return(d) - } - stop("could not determine deparsed-expression-names") -} - - - - - -uses_dollar <- function(q, data.names) { - ## INTENTION: determine whether q is an expressions that is evaluated - ## outside a data.frame, i.e. one that uses the dollar operator. - ## e.g. TF$V1 should not be evaluated in a data.frame even if it has - ## the variables TF and V1 since it wont work and was not intended. - if (!is.language(q) || inherits(q, "formula")) { - return(FALSE) - } - - d <- deparse(q) - ## sometimes d is of length > 1 for some reason... - d <- paste0(d, collapse = "") - d <- oneWhitespace(d) - - if (substr(d, 1, 4) == "list") { - ## lists are not allowed to work in this manner for now. - return(FALSE) - } - - if (!grepl(x = d, pattern = "\\$")) { - ## does not use dollar operator. - return(FALSE) - } - - ## detect if word$word is used in d - t <- regexec(pattern = "\\w+\\$\\w+", text = d) - if (t != -1) { - ## ok, used word$word - ## is there are variable with that name in data.names? - m <- unlist(regmatches(d, t)) - if (m %in% data.names) { - return(FALSE) - } - ## if not, it should be evaluated outside the data. - return(TRUE) - } - - return(FALSE) -} - - - - -`%.:%` <- function(x, y) { - ## INTENTION: hacking formula calls using `:` - ## which is apparently normally evaluated in C... (in model.matrix.default) - ## USAGE: e.g. c(1,2) %.:% c(3,4) = c(3, 8) - ## (instead of getting warning) - if (length(x) > 1L && length(y) > 1L && is.numeric(x) && is.numeric(y)) { - return(x*y) - } else if (length(x) == 1L && length(y) == 1L && is.numeric(x) && is.numeric(y)) { - return(x:y) - } - as.factor(x):as.factor(y) - -} - - - -RHS2list <- function(formula, handle.adjust=TRUE) { - ## INTENTION: turns the right-hand side of a formula - ## into a list of substituted expressions; - ## each element in list is an expressions separated - ## by a '+' in the formula. needs to be eval()'d, - ## preferably using the appropriate data set. - if (!inherits(formula, "formula")) stop("not a formula") - - ## no response - formula <- formula[c(1, length(formula))] - - te <- terms(formula) - tl <- attr(te, "term.labels") - - ## handle adjusting variables (e.g. adjust(V1, V2) -> c("V1", "V2")) - adj <- tl[substr(tl, 1, 7) == "adjust("] - if (length(adj) == 0L) adj <- NULL - if (handle.adjust && !is.null(adj)) { - tl <- setdiff(tl, adj) - adj <- unlist(lapply(adj, function(stri) { - e <- parse(text = stri)[[1]] - e <- as.list(e)[-1] - unlist(lapply(e, deparse)) - })) - - tl <- c(tl, adj) - } - - ## to avoid e.g. c(1,2):c(3,4) NOT evaluating as c(3, 8) - l <- lapply(tl, function(x) gsub(pattern = ":", x = x, replacement = "%.:%")) - l <- lapply(l, function(x) parse(text = x)[[1L]]) - - names(l) <- tl - - setattr(l, "adjust", adj) - - l -} - -uses_colon <- function(e) { - if (is.character(e)) e <- parse(text = e)[[1]] - stopifnot(is.call(e) || is.name(e)) - stopifnot(!inherits(e, "formula")) - - l <- as.list(e) - if (deparse(l[[1]]) %in% c(":", "%.:%") && length(l) == 3) { - return(TRUE) - } - FALSE -} - -replace_colon <- function(e) { - stopifnot(is.call(e) || is.name(e)) - stopifnot(!inherits(e, "formula")) - - if (!uses_colon(e)) return(e) - - l <- as.list(e) - l[[1]] <- parse(text = "popEpi:::`%.:%`")[[1]] - as.call(l) -} - -RHS2DT <- function(formula, data = data.frame(), enclos = parent.frame(1L)) { - l <- RHS2list(formula) - if (length(l) == 0L) return(data.table()) - adj <- attr(l, "adjust") - - ## foolproofing in case data contains column named e.g. "a:b" and that is - ## intended to be used - dana <- names(data) - dana <- gsub(x=dana, pattern=" %.:% ", replacement = ":") - dana <- gsub(x=dana, pattern="%.:%", replacement = ":") - - - ld <- lapply(l, deparse) - ld <- lapply(ld, function(ch) { - ch <- gsub(x=ch, pattern=" %.:% ", replacement = ":") - ch <- gsub(x=ch, pattern="%.:%", replacement = ":") - int <- if (ch %in% dana) which(dana %in% ch) else ch - if (is.integer(int)) data[[names(data)[int]]] else NULL - }) - ld[which(sapply(ld, is.null))] <- NULL - l[names(ld)] <- ld - - ## foolproofs use of function %.:% by explicit referral, i.e. ::: - ## (this avoids scoping problem) - l <- lapply(l, function(elem) { - if (!is.call(elem)) return(elem) - if (!uses_colon(elem)) return(elem) - - replace_colon(elem) - - }) - - l <- lapply(l, function(elem) { - eval(expr = elem, envir = data, enclos = enclos) - }) - - l <- as.data.table(l) - setattr(l, "adjust", adj) - l -} - -Surv2DT <- function(Surv) { - sa <- attributes(Surv) - dt <- copy(Surv) - setattr(dt, "class", "array") - dt <- data.table(dt) - - type <- attr(Surv, "type") - statNA <- sum(is.na(dt$status)) - if (statNA) { - stop("Some status indicators (", statNA ," values in total) were NA as ", - "a result of using Surv(). Usual suspects: original status variable ", - "has NA values, or you have numeric status variable with more than ", - "two levels and you did not assign e.g. type = 'mstate' (e.g. ", - "Surv(time = c(1,1,1), event = c(0,1,2), type = 'mstate') works).") - - } - - - setattr(dt, "type", type) - testClass <- sa$inputAttributes$time2$class - if (!is.null(testClass) && testClass == "factor") - dt[, status := factor(status, labels = sa$inputAttributes$time2$levels)] - testClass <- sa$inputAttributes$event$class - if (!is.null(testClass) && testClass == "factor") - dt[, status := factor(status, labels = sa$inputAttributes$event$levels)] - - dt[] -} - - - -evalPopFormula <- function(formula, data = data.frame(), enclos = parent.frame(2L), subset = NULL, Surv.response = TRUE) { - - ## INTENTION: given a formula object, returns a DT where each column - ## is an evaluated expression from formula (separated by + ) - - fe <- environment(formula) - - either <- FALSE - if (is.character(Surv.response)) { - Surv.response <- match.arg(Surv.response, "either") - Surv.response <- TRUE - either <- TRUE - } else if (!is.logical(Surv.response)) { - stop("Surv.response must be either logical or 'either'") - } - - ## subset if needed ---------------------------------------------------------- - if (!is.null(subset) && !is.logical(subset)) { - stop("subset must be NULL or a logical vector and not an expression at ", - "this point. If you see this, complain to the package maintainer.") - } - if (!is.null(subset)) { - keepVars <- c(all.vars(formula), "lex.Xst") - data <- subsetDTorDF(data, subset = subset, select = keepVars) - } - - - ## formula ------------------------------------------------------------------- - if (!inherits(formula, "formula")) { - stop("formula is not of class 'formula'; supply it as e.g. y ~ x") - } - if (length(formula) < 3L) { - stop("formula appears to be one-sided, which is not supported; ", - "supply it as e.g. y ~ x") - } - - ## response - y <- eval(formula[[2L]], envir = data, enclos = enclos) - if (inherits(y, "Surv") && !either && !Surv.response) { - stop("Response is a result of using Surv(), which is not allowed in ", - "this context.") - } - - if (!inherits(y, "Surv") && !either && Surv.response) { - stop("The response of the formula must be a Surv object; ", - "see ?Surv (in package survival).") - } - - if (inherits(y, "Surv")) { - y <- Surv2DT(y) - setcolsnull(y, keep = c("time", "start", "status"), colorder = TRUE) - if (!any(c("time", "start") %in% names(y))) { - stop("You must supply function Surv a value to the 'time' ", - "argument. See ?Surv") - } - setnames(y, names(y), c("time", "status")[1:ncol(y)]) - } else { - y <- data.table(y) - setnames(y, 1, deparse(formula[[2L]])) - if (either && inherits(data, "Lexis")) { - ## we assume the unmodified lex.Xst to be a useful status variable. - if (!"lex.Xst" %in% names(data)) { - stop("Supplied a formula without using Surv(), and data was a Lexis ", - "object, so assumed you intended to use 'lex.Xst' in data as the ", - "status variable in this context, but that column was missing ", - "from data.") - } - setnames(y, 1, "time") - y[, "status"] <- data$lex.Xst - - } - } - - - ## RHS - l <- RHS2DT(formula, data = data, enclos = enclos) - adj <- attr(l, "adjust") - - ## combine - l <- if (length(l) > 0L) cbind(y, l) else y - - setattr(l, "adjust.names", adj) - setattr(l, "print.names", setdiff(names(l), c(adj, names(y)))) - setattr(l, "Surv.names", names(y)) - setattr(l, "formula", formula) - - l -} - - -evalRecursive <- function(arg, env, enc, max.n = 100L) { - ## INTENTION: digs out actual evaluatable value and expression - - if (missing(env)) env <- environment() - if (missing(enc)) enc <- parent.frame(1L) - - if (is.data.frame(env)) { - na <- names(env) - env <- env[1:(min(10L, nrow(env))), ] - - env <- data.frame(env) - setattr(env, "names", na) - - } - argSub <- arg - - tick <- 1L - while (!inherits(arg, "try-error") && is.language(arg) && - !inherits(arg, "formula") && tick < max.n) { - - argSub <- arg - arg <- try(eval(argSub, envir = env, enclos = enc), silent = TRUE) - - tick <- tick + 1L - } - - if (tick == max.n) { - stop("evaluated expression ", max.n, - " times and still could not find underlying expression") - } - if (!is.language(argSub)) argSub <- substitute(arg) - list(arg = arg, argSub = argSub, all.vars = all.vars(argSub)) -} - - -usePopFormula <- function(form = NULL, adjust = NULL, data = data.frame(), - enclos, Surv.response = TRUE) { - ## INTENTION: evaluates form and combines with adjust appropriately - ## returns a list of the elements dug out from the formula and adjust - ## arguments. - # formSub <- substitute(form) - al <- evalRecursive(arg = form, env = data, enc = enclos) - - if (!inherits(al$arg, "formula")) stop("'form' is not a formula object") - - dt <- evalPopFormula(formula = al$arg, data = data, enclos = enclos, - Surv.response = Surv.response) - adNames <- attr(dt, "adjust.names") - prNames <- attr(dt, "print.names") - suNames <- attr(dt, "Surv.names") - - adjust <- evalPopArg(data, adjust, DT = TRUE, recursive = TRUE, - enclos = new.env(), naming = "model", - types = c("NULL", "character", "list", "expression")) - - if (is.data.frame(adjust) && (nrow(adjust) == 0L || ncol(adjust) == 0L)) { - stop("adjust evaluated to an empty data.frame") - } - if (!is.null(adjust) && ncol(adjust) > 0L && length(adNames) > 0L) { - stop("Cannot both use argument 'adjust' AND use an adjust() term within ", - "the formula argument. Please only use one.") - } - if (is.null(adjust) && length(adNames) > 0L) { - adjust <- dt[, .SD, .SDcols = c(adNames)] - } - - - print <- NULL - if (length(prNames > 0L)) print <- dt[, .SD, .SDcols = eval(prNames)] - - list(y = dt[, .SD, .SDcols = c(suNames)], - print = print, - adjust = adjust, formula = al$arg) -} - - - - - -#' @title Adjust Estimates by Categorical Variables -#' @description This function is only intended to be used within a formula -#' when supplied to e.g. \code{\link{survtab_ag}} and should not be -#' used elsewhere. -#' @param ... variables to adjust by, e.g. \code{adjust(factor(v1), v2, v3)} -#' @return Returns a list of promises of the variables supplied which can be -#' evaluated. -#' @examples -#' -#' y ~ x + adjust(z) -#' @export -adjust <- function(...) { - - call <- sys.call(1L) - call <- as.list(call)[1L] - - if (deparse(call) %in% c("adjust", "list(adjust)")) { - stop("Function adjust() only intended to be used within the formulas of ", - "certain functions of package popEpi. See e.g. ?survtab_ag for usage.") - } - - mc <- as.list(match.call())[-1L] - if (is.list(mc) && length(mc) == 1) mc <- mc[[1]] - mc -} - - - - - -parse_adjust_formula <- function(f) { - - env <- environment(f) - t <- attr(terms(f), "term.labels") - - l <- sapply(t, grepl, pattern = "adjust(", fixed = TRUE) - - a <- t[l] - - if (!any(l)) return(f) - - f <- deparse(f) - f <- paste0(f, collapse = "") - f <- oneWhitespace(f) - - for (k in seq_along(a)) { - f <- sub(x = f, pattern = paste0(" + ", a[k]), replacement = "", fixed = TRUE) - } - - a <- lapply(a, function(stri) { - e <- parse(text = stri)[[1]] - as.list(e)[-1] - }) - - a <- unlist(a, recursive = FALSE) - - a <- sapply(a, function(e) { - - e <- substitute(adjust(e), list(e = e)) - deparse(e) - - }) - - f <- paste0(f, " + ", paste0(a, collapse = " + ")) - f <- eval(parse(text = f)[[1]]) - environment(f) <- env - f -} - - - - - -model_frame_robust <- function(formula, data, enc) { - - stopifnot(inherits(formula, "formula")) - stopifnot(is.environment(enc)) - - pe <- function(x, ...) { - eval(parse(text = x), ...) - } - - fo <- formula - - te <- terms(fo) - tl <- attr(te, "term.labels") - - av <- all.vars(fo) - av <- intersect(av, names(data)) - - ## non-interactions as they are - l <- lapply(tl, function(stri) { - if (stri %in% names(data)) return(data[[stri]]) - e <- try(pe(stri, envir = data, enclos = enc), silent = TRUE) - if (inherits(e, "try-error")) e <- stri - e - }) - - fa <- attr(te, "factors") - - whInter <- colnames(fa)[colSums(fa) > 1] - - whInter <- which(sapply(l, function(elem) { - is.character(elem) && length(elem) == 1L && elem %in% whInter - })) - whVar <- setdiff(seq_along(l), whInter) - - if (sum(whInter)) { - - fa <- fa[, whInter, drop = FALSE] - interList <- lapply(colnames(fa), function(stri) { - - e <- parse(text = stri)[[1]] - e <- as.list(e)[-1] - e <- sapply(e, deparse) - - }) - - names(interList) <- colnames(fa) - - interVars <- unique(unlist(interList)) - - - interData <- lapply(interVars, function(stri) { - if (stri %in% names(data)) return(data[[stri]]) - eval(parse(text = stri), envir = data, enclos = enc) - }) - - interData <- setDT(lapply(interData, as.factor)) - names(interData) <- interVars - - on <- copy(names(interData)) - tn <- paste0("V", seq_along(interData)) - names(tn) <- on - names(interData) <- tn - - til <- lapply(interList, function(stri) { - tn[stri] - }) - names(til) <- sapply(til, paste0, collapse = ":") - - - - l[whInter] <- lapply(seq_along(til), function(i) { - - tmpExpr <- til[[i]] - realExpr <- l[whInter][[i]] - e <- try(pe(tmpExpr, envir = interData), silent = TRUE) - - if (inherits(e, "try-error")) { - e <- pe(realExpr, envir = data, enclos = enc) - } - e - }) - - } - - names(l) <- tl - l <- as.data.table(l) - l -} - - - - - - + + +is_expression <- function(e) { + + ifelse(any(c("call", "name") %in% class(e)), TRUE, FALSE) + +} + +is_list_expression <- function(e) { + + ifelse(is_expression(e) && deparse(as.list(e)[[1]]) == "list", TRUE, FALSE) + +} + +is_dollar_expression <- function(e) { + + ifelse(is_expression(e) && deparse(as.list(e)[[1]]) == "$", TRUE, FALSE) + +} + + +is_variable <- function(x) { + + varModes <- c("numeric", "complex", "logical", "character", "raw") + + ifelse(mode(x) %in% varModes, TRUE, FALSE) + +} + + +evalArg <- function(arg, env, enc, ...) { + UseMethod("evalArg") +} + + +evalArg.default <- function(arg, env, enc) { + if (is.list(arg)) { + arg <- as.list(arg) + } else if (is_variable(arg)) { + l <- list(arg) + } + return(arg) +} + +evalArg.name <- function(arg, env, enc) { + d <- deparse(arg) + out <- try(get(d, envir = env, inherits = FALSE), silent = TRUE) + if (inherits(out, "try-error")) { + out <- try(get(d, envir = enc, inherits = FALSE), silent = TRUE) + } + if (inherits(out, "try-error")) { + stop("Could not find object ", d, ".") + } + out <- list(out) + names(out) <- d + out +} + +evalArg.call <- function(arg, env, enc) { + + out <- eval(arg, envir = env, enclos = enc) + + if (is.list(out)) { + out <- as.list(out) + } else if (is_variable(out)) { + out <- list(out) + names(out) <- paste0(deparse(arg), collapse = "") + } + out +} + +evalArg.character <- function(arg, env, enc) { + ## NOTE: enc unused + se <- substitute(env) + out <- lapply(arg, function(stri) { + try({ + get(stri, envir = as.environment(env), inherits = FALSE) + }, silent = TRUE) + }) + + notFound <- arg[sapply(out, inherits, "try-error")] + + if (length(notFound)) { + if (length(notFound) > 5) notFound <- notFound[1:5] + stop("Could not find object(s): ", + paste0(notFound, collapse = ", "), ".") + } + names(out) <- arg + out +} + +evalArg.formula <- function(arg, env, enc) { + + rhsl <- as.list(RHS2DT(arg, data = env, enclos = enc)) + + rhsl + +} + + +method_classes <- function(f) { + + stopifnot(is.character(f)) + e <- utils::methods(f) + e <- unlist(lapply(e, as.character)) + e <- unlist(lapply(e, sub, pattern = paste0(f, "."), replacement = "")) + setdiff(e, "default") + +} + + +do_evalPopArg <- function(arg, env, enc) { + eam <- method_classes("evalArg") + + ne <- list(env = env, enc = enc) + de <- list(env = enc, enc = baseenv()) + + r <- arg + tick <- 1L + while (any(class(r) %in% eam)) { + + envs <- if (is_dollar_expression(r)) de else ne + r <- evalArg(arg = r, env = envs$env, enc = envs$enc) + + tick <- tick + 1L + if (tick == 100L) stop("No result after 100 evaluations") + } + + r +} + + + +argType <- function(arg) { + + tl <- list("NULL" = "NULL", character = "character", + list = "call", formula = "formula", + expression = c("call", "name")) + + tl <- sapply(tl, function(ch) { + t <- tryCatch(inherits(arg, ch), + error = function(e) e, + warning = function(w) w) + isTRUE(t) + }) + if (!any(tl)) tl[names(tl) == "expression"] <- TRUE + if (tl["list"]) tl["list"] <- substr(deparse(arg), 1, 5) == "list(" + names(tl)[tl & !duplicated(tl)] + +} + + + + + +evalPopArg2 <- function(data, arg, enclos, DT = TRUE, + types = c("NULL","character", "list", "expression")) { + + allowed_types <- c("NULL", "character", "list", "expression", "formula") + types <- match.arg(types, allowed_types, + several.ok = TRUE) + if (!argType(arg) %in% types) { + stop("Supplied argument not allowed type. Current type: ", + argType(arg), ". Allowed types: ", + paste0(types, collapse = ", ")) + } + + l <- do_evalPopArg(arg = arg, env = data, enc = enclos) + l +} + + + + +evalPopArg <- function(data, arg, n = 1L, DT = TRUE, enclos = NULL, recursive = TRUE, types = c("NULL","character", "list", "expression"), naming = c("DT", "model")) { + ## arg: an unevaluated AND substitute()'d argument within a function, which may be + ## * an expression + ## * a list of expressions + ## * a character vector of variable names (in a given data set) + ## n: steps upstream as in parent.frame(n); 0L refers to calling environment + ## of evalPopArg, 1L to calling environment of e.g. sir which uses evalPopArg, etc. + ## hence n = 1L should be almost always the right way to go. + ## ALTERNATIVELY supply an environment by hand via enclos. + ## enclos will override n. + ## recursive: if TRUE, evals arg as many times as it is of type language. + ## output: + ## * vector as a result of an expression + ## * list as a result of a list + ## * character vector of names + ## OR with DT = TRUE, a data.table based on aforementioned results. + ## intention: output to be used in by argument of data.table. + ## a data.table output is directly usable in by. + ## if column names cannot be easily found, BV1, BV2, ... are imputed + ## for missing names (unrobustly: such names may already exist, resulting in duplicates) + + ## naming: DT style uses first element of all.names() where + ## a name has to be created; model style keeps the whole deparsed + ## expression. Only applied when DT = TRUE + naming <- match.arg(naming[1L], c("DT", "model")) + + ## types: allowed popArg types of arguments. + types <- match.arg(types, c("NULL","character", "list", "expression", "formula"), several.ok = TRUE) + + if (!is.null(enclos) && !is.environment(enclos)) { + stop("enclos must be NULL or an environment") + } + if (!is.environment(enclos)) enclos <- parent.frame(n + 1L) + + ## used data may change if expression uses dollar operator, hence + ## arg should not be evaluated within data but only its surroundings. + use_data <- data + use_enc <- enclos + dataNames <- names(data) + + if (uses_dollar(arg, data.names = dataNames)) { + use_data <- enclos + use_enc <- baseenv() + } + e <- eval(arg, envir = use_data, enclos = use_enc) + if (is.language(e) && !inherits(e, "formula")) { + if (!recursive) stop("arg is of type language after evaluating, and recursive = FALSE") + + tick <- 1L + while (is.language(e) && !inherits(e, "formula") && tick < 100L) { + arg <- e + use_data <- data + use_enc <- enclos + if (uses_dollar(arg, data.names = dataNames)) { + use_data <- enclos + use_enc <- baseenv() + } + e <- eval(arg, envir = use_data, enclos = use_enc) + tick <- tick + 1L + } + if (tick == 100L) stop("arg was of type language even after 100 evaluations. Something went wrong here...") + + + + } + argType <- "NULL" + if (is.list(e)) argType <- "list" else + if (is.character(e)) argType <- "character" else + if (mode(e) == "numeric" || is.vector(e) || is.factor(e)) argType <- "expression" else + if (inherits(e, "formula")) argType <- "formula" + + if (!argType %in% types) stop("popArg type of evaluated arg not one of the allowed types (set via argument types). Detected type: '", argType, "'. Allowed types: ", paste0("'", types, "'", collapse = ", ")) + + if (argType == "NULL") return(NULL) + + av <- all.vars(arg) + if (argType == "character") av <- e + + ## byNames: names of columns resulting from aggre argument, by which + ## pyrs and such are aggregated. same functionality + ## as in results seen in e.g.DT[, .N, by = list(factor(x), y, z = w)] ## factor, y, z + ## note: first object in ags with list or expression aggre is "list" + byNames <- NULL + + if (is.character(e)) byNames <- e + else if (argType == "list" && substr(paste0(deparse(arg)), 1, 5) == "list(") byNames <- sapply(arg[-1], function(x) all.names(x)[1]) + else if (argType == "expression") byNames <- all.names(arg)[1] + + badNames <- c("$", ":") + + byNames[byNames %in% badNames] <- paste0("BV", 1:length(byNames))[byNames %in% badNames] + + if (argType == "formula") { + arg <- e + use_data <- data + use_enc <- enclos + e <- RHS2DT(formula = e, data = use_data, enclos = use_enc) + if (ncol(e) == 0L || nrow(e) == 0L) e <- data.table() ## e.g. y ~ 1 + + } else if (is.character(e)) { + all_names_present(data, e) + if (DT) { + ## note: e contains variable names in character strings, + ## ergo fully named list & DT created + l <- lapply(e, function(x) data[[x]]) + setattr(l, "names", e) + setDT(l) + e <- l; rm(l) + } + } else if (is.list(e)) { + ## note: fully unnamed list has NULL names() + ## partially named list has some "" names + ne <- names(e) + + if (DT && any(sapply(e, is.null))) stop("at least one object in list arg is NULL; cannot form data.table with such list") + + if (is.null(ne)) ne <- rep("", length(e)) + + + wh_bad <- which(ne == "") + if (length(wh_bad) > 0) { + if (is.null(byNames)) { + byNames <- paste0("BV", 1:length(e)) + } + + ne[wh_bad] <- byNames[wh_bad] + setattr(e, "names", ne) + } + + if (DT) { + ## NOTE: used to be setDT, but length of different elements + ## in list may differ, which as.data.table handles correctly + e <- as.data.table(e) + } + } else if (mode(e) == "numeric" || is.vector(e) || is.factor(e)) { + ## is e.g. a numeric vector or a factor + if (DT) { + e <- data.table(V1 = e) + setnames(e, 1, byNames) + } + } + + ## NOTE: e may be of type language at this point if arg was double-quoted + ## and recursive = FALSE + + if (DT) { + setDT(e) + setattr(e, "all.vars", av) + setattr(e, "quoted.arg", arg) + setattr(e, "arg.type", argType) + if (naming == "model" && ncol(e) > 0L) setnames(e, 1:ncol(e), popArg2ModelNames(arg, type = argType)) + } + e +} + + +popArgType <- function(arg, data = NULL, n = 1L, enclos = NULL, recursive = TRUE) { + ## input: a substitute()'d expression / argument + ## NOTE: recursive useful when arg might be quoted twice and want the eventual + ## result; need to supply data for it though + ## output: type of thingie that was substitute()'d + ## * list (of expressions) + ## * character string vector + ## * an expression (includes symbol) + av <- all.vars(arg, unique = TRUE) ## all variables + av <- setdiff(av, c("$", "T", "F")) + an <- all.names(arg, unique = TRUE) ## all variables and functions + af <- setdiff(an, av) ## all functions used + + a <- deparse(arg) + a <- paste0(a, collapse = "") ## lists may somehow produce length > 1 here + if (substr(a, 1, 5) == "list(") return("list") + if (a == "NULL") return("NULL") + ## detection of character arguments is not easy and should not be considered + ## fool proof since user may pass e.g. a vector of character strings as a + ## symbol, which can only really be interpreted as an expression + if (sum(grep('\\"', a)) && length(setdiff(af, "c")) == 0) return("character") + + if (is.data.frame(data)) { + if (is.symbol(arg) && a %in% names(data)) return("expression") + if (length(av) == 1L && av %in% names(data)) return("expression") + e <- eval(arg, envir = data[1:min(nrow(data), 20L), ], + enclos = if (is.environment(enclos)) enclos else parent.frame(n + 1L)) + if (inherits(e, "formula")) return("formula") + if (is.null(e)) return("NULL") + if (is.list(e)) return("list") + if (is.character(e) && all(e %in% names(data))) return("character") + if (is.vector(e) || is.factor(e)) return("expression") + + if (recursive && is.language(e)) return(popArgType(e, data = data, n = n + 1L, enclos = enclos)) + } + + "expression" + +} +popArg2ModelNames <- function(arg, type) { + ## INTENTION: given a quoted/substituted expression, + ## digs out the expression(s) creating a/multiple column(s) + ## and returns the deparsed expression(s) to be used as names + ## of columns the same way that models such as lm() display + ## the names of expressions used within formula + + ## some exceptions + if (is.data.frame(arg)) return(names(arg)) + if (is.character(arg)) return(arg) + + type <- match.arg(type[1L], c("NULL", "character", "list", "expression", "formula")) + + lang <- NULL + lang <- try(is.language(arg) || inherits(arg, "formula"), silent = TRUE) + + + if (inherits(lang, "try-error") || !lang) stop("arg must be a quoted or substituted expression or a formula. Error message: ", lang, ". type of arg: ", typeof(arg), ". Class: ", class(arg), ". Mode: ", mode(arg), ".") + + d <- oneWhitespace(paste0(deparse(arg))) + + if (type == "expression") return(d) else + if (type == "NULL") return(NULL) else + if (type == "character") return(eval(arg)) else + if (type == "list") { + d <- substr(d, 6, nchar(d)-1L) ## removes "list(" and ")" + d <- strsplit(d, ", ") + return(unlist(d)) + } else if (type == "formula") { + arg <- eval(arg) + d <- names(RHS2list(arg)) + if (length(d) == 0L) return(NULL) ## e.g. y ~ 1 + return(d) + } + stop("could not determine deparsed-expression-names") +} + + + + + +uses_dollar <- function(q, data.names) { + ## INTENTION: determine whether q is an expressions that is evaluated + ## outside a data.frame, i.e. one that uses the dollar operator. + ## e.g. TF$V1 should not be evaluated in a data.frame even if it has + ## the variables TF and V1 since it wont work and was not intended. + if (!is.language(q) || inherits(q, "formula")) { + return(FALSE) + } + + d <- deparse(q) + ## sometimes d is of length > 1 for some reason... + d <- paste0(d, collapse = "") + d <- oneWhitespace(d) + + if (substr(d, 1, 4) == "list") { + ## lists are not allowed to work in this manner for now. + return(FALSE) + } + + if (!grepl(x = d, pattern = "\\$")) { + ## does not use dollar operator. + return(FALSE) + } + + ## detect if word$word is used in d + t <- regexec(pattern = "\\w+\\$\\w+", text = d) + if (t != -1) { + ## ok, used word$word + ## is there are variable with that name in data.names? + m <- unlist(regmatches(d, t)) + if (m %in% data.names) { + return(FALSE) + } + ## if not, it should be evaluated outside the data. + return(TRUE) + } + + return(FALSE) +} + + + + +`%.:%` <- function(x, y) { + ## INTENTION: hacking formula calls using `:` + ## which is apparently normally evaluated in C... (in model.matrix.default) + ## USAGE: e.g. c(1,2) %.:% c(3,4) = c(3, 8) + ## (instead of getting warning) + if (length(x) > 1L && length(y) > 1L && is.numeric(x) && is.numeric(y)) { + return(x*y) + } else if (length(x) == 1L && length(y) == 1L && is.numeric(x) && is.numeric(y)) { + return(x:y) + } + as.factor(x):as.factor(y) + +} + + + +RHS2list <- function(formula, handle.adjust=TRUE) { + ## INTENTION: turns the right-hand side of a formula + ## into a list of substituted expressions; + ## each element in list is an expressions separated + ## by a '+' in the formula. needs to be eval()'d, + ## preferably using the appropriate data set. + if (!inherits(formula, "formula")) stop("not a formula") + + ## no response + formula <- formula[c(1, length(formula))] + + te <- terms(formula) + tl <- attr(te, "term.labels") + + ## handle adjusting variables (e.g. adjust(V1, V2) -> c("V1", "V2")) + adj <- tl[substr(tl, 1, 7) == "adjust("] + if (length(adj) == 0L) adj <- NULL + if (handle.adjust && !is.null(adj)) { + tl <- setdiff(tl, adj) + adj <- unlist(lapply(adj, function(stri) { + e <- parse(text = stri)[[1]] + e <- as.list(e)[-1] + unlist(lapply(e, deparse)) + })) + + tl <- c(tl, adj) + } + + ## to avoid e.g. c(1,2):c(3,4) NOT evaluating as c(3, 8) + l <- lapply(tl, function(x) gsub(pattern = ":", x = x, replacement = "%.:%")) + l <- lapply(l, function(x) parse(text = x)[[1L]]) + + names(l) <- tl + + setattr(l, "adjust", adj) + + l +} + +uses_colon <- function(e) { + if (is.character(e)) e <- parse(text = e)[[1]] + stopifnot(is.call(e) || is.name(e)) + stopifnot(!inherits(e, "formula")) + + l <- as.list(e) + if (deparse(l[[1]]) %in% c(":", "%.:%") && length(l) == 3) { + return(TRUE) + } + FALSE +} + +replace_colon <- function(e) { + stopifnot(is.call(e) || is.name(e)) + stopifnot(!inherits(e, "formula")) + + if (!uses_colon(e)) return(e) + + l <- as.list(e) + l[[1]] <- parse(text = "popEpi:::`%.:%`")[[1]] + as.call(l) +} + +RHS2DT <- function(formula, data = data.frame(), enclos = parent.frame(1L)) { + l <- RHS2list(formula) + if (length(l) == 0L) return(data.table()) + adj <- attr(l, "adjust") + + ## foolproofing in case data contains column named e.g. "a:b" and that is + ## intended to be used + dana <- names(data) + dana <- gsub(x=dana, pattern=" %.:% ", replacement = ":") + dana <- gsub(x=dana, pattern="%.:%", replacement = ":") + + + ld <- lapply(l, deparse) + ld <- lapply(ld, function(ch) { + ch <- gsub(x=ch, pattern=" %.:% ", replacement = ":") + ch <- gsub(x=ch, pattern="%.:%", replacement = ":") + int <- if (ch %in% dana) which(dana %in% ch) else ch + if (is.integer(int)) data[[names(data)[int]]] else NULL + }) + ld[which(sapply(ld, is.null))] <- NULL + l[names(ld)] <- ld + + ## foolproofs use of function %.:% by explicit referral, i.e. ::: + ## (this avoids scoping problem) + l <- lapply(l, function(elem) { + if (!is.call(elem)) return(elem) + if (!uses_colon(elem)) return(elem) + + replace_colon(elem) + + }) + + l <- lapply(l, function(elem) { + eval(expr = elem, envir = data, enclos = enclos) + }) + + l <- as.data.table(l) + setattr(l, "adjust", adj) + l +} + + + + + +Surv2DT <- function(Surv) { + sa <- attributes(Surv) + type <- sa$type + + dt <- as.data.table(lapply(sa$dimnames[[2]], function(col_nm) { + Surv[, col_nm] + })) + setnames(dt, names(dt), sa$dimnames[[2]]) + + statNA <- sum(is.na(dt$status)) + if (statNA) { + stop("Some status indicators (", statNA ," values in total) were NA. ", + "Usual suspects: original status variable ", + "has NA values, or you have numeric status variable with more than ", + "two levels and you did not assign e.g. type = 'mstate' (e.g. ", + "Surv(time = c(1,1,1), event = c(0,1,2), type = 'mstate') works).") + + } + + + setattr(dt, "type", type) + + label_sources <- c("time2", "event") + lapply(label_sources, function(lbl_src) { + if (identical(sa$inputAttributes[[lbl_src]]$class, "factor")) { + set( + dt, j = "status", + value = factor(dt$status, labels = sa$inputAttributes[[lbl_src]]$levels) + ) + } + NULL + }) + + dt[] +} + + + + + +evalPopFormula <- function(formula, data = data.frame(), enclos = parent.frame(2L), subset = NULL, Surv.response = TRUE) { + + ## INTENTION: given a formula object, returns a DT where each column + ## is an evaluated expression from formula (separated by + ) + + fe <- environment(formula) + + either <- FALSE + if (is.character(Surv.response)) { + Surv.response <- match.arg(Surv.response, "either") + Surv.response <- TRUE + either <- TRUE + } else if (!is.logical(Surv.response)) { + stop("Surv.response must be either logical or 'either'") + } + + ## subset if needed ---------------------------------------------------------- + if (!is.null(subset) && !is.logical(subset)) { + stop("subset must be NULL or a logical vector and not an expression at ", + "this point. If you see this, complain to the package maintainer.") + } + if (!is.null(subset)) { + keepVars <- c(all.vars(formula), "lex.Xst") + data <- subsetDTorDF(data, subset = subset, select = keepVars) + } + + + ## formula ------------------------------------------------------------------- + if (!inherits(formula, "formula")) { + stop("formula is not of class 'formula'; supply it as e.g. y ~ x") + } + if (length(formula) < 3L) { + stop("formula appears to be one-sided, which is not supported; ", + "supply it as e.g. y ~ x") + } + + ## response + y <- eval(formula[[2L]], envir = data, enclos = enclos) + if (inherits(y, "Surv") && !either && !Surv.response) { + stop("Response is a result of using Surv(), which is not allowed in ", + "this context.") + } + + if (!inherits(y, "Surv") && !either && Surv.response) { + stop("The response of the formula must be a Surv object; ", + "see ?Surv (in package survival).") + } + + if (inherits(y, "Surv")) { + y <- Surv2DT(y) + setcolsnull(y, keep = c("time", "start", "status"), colorder = TRUE) + if (!any(c("time", "start") %in% names(y))) { + stop("You must supply function Surv a value to the 'time' ", + "argument. See ?Surv") + } + setnames(y, names(y), c("time", "status")[1:ncol(y)]) + } else { + y <- data.table(y) + setnames(y, 1, deparse(formula[[2L]])) + if (either && inherits(data, "Lexis")) { + ## we assume the unmodified lex.Xst to be a useful status variable. + if (!"lex.Xst" %in% names(data)) { + stop("Supplied a formula without using Surv(), and data was a Lexis ", + "object, so assumed you intended to use 'lex.Xst' in data as the ", + "status variable in this context, but that column was missing ", + "from data.") + } + setnames(y, 1, "time") + y[, "status"] <- data$lex.Xst + + } + } + + + ## RHS + l <- RHS2DT(formula, data = data, enclos = enclos) + adj <- attr(l, "adjust") + + ## combine + l <- if (length(l) > 0L) cbind(y, l) else y + + setattr(l, "adjust.names", adj) + setattr(l, "print.names", setdiff(names(l), c(adj, names(y)))) + setattr(l, "Surv.names", names(y)) + setattr(l, "formula", formula) + + l +} + + +evalRecursive <- function(arg, env, enc, max.n = 100L) { + ## INTENTION: digs out actual evaluatable value and expression + + if (missing(env)) env <- environment() + if (missing(enc)) enc <- parent.frame(1L) + + if (is.data.frame(env)) { + na <- names(env) + env <- env[1:(min(10L, nrow(env))), ] + + env <- data.frame(env) + setattr(env, "names", na) + + } + argSub <- arg + + tick <- 1L + while (!inherits(arg, "try-error") && is.language(arg) && + !inherits(arg, "formula") && tick < max.n) { + + argSub <- arg + arg <- try(eval(argSub, envir = env, enclos = enc), silent = TRUE) + + tick <- tick + 1L + } + + if (tick == max.n) { + stop("evaluated expression ", max.n, + " times and still could not find underlying expression") + } + if (!is.language(argSub)) argSub <- substitute(arg) + list(arg = arg, argSub = argSub, all.vars = all.vars(argSub)) +} + + +usePopFormula <- function(form = NULL, adjust = NULL, data = data.frame(), + enclos, Surv.response = TRUE) { + ## INTENTION: evaluates form and combines with adjust appropriately + ## returns a list of the elements dug out from the formula and adjust + ## arguments. + # formSub <- substitute(form) + al <- evalRecursive(arg = form, env = data, enc = enclos) + + if (!inherits(al$arg, "formula")) stop("'form' is not a formula object") + + dt <- evalPopFormula(formula = al$arg, data = data, enclos = enclos, + Surv.response = Surv.response) + adNames <- attr(dt, "adjust.names") + prNames <- attr(dt, "print.names") + suNames <- attr(dt, "Surv.names") + + adjust <- evalPopArg(data, adjust, DT = TRUE, recursive = TRUE, + enclos = new.env(), naming = "model", + types = c("NULL", "character", "list", "expression")) + + if (is.data.frame(adjust) && (nrow(adjust) == 0L || ncol(adjust) == 0L)) { + stop("adjust evaluated to an empty data.frame") + } + if (!is.null(adjust) && ncol(adjust) > 0L && length(adNames) > 0L) { + stop("Cannot both use argument 'adjust' AND use an adjust() term within ", + "the formula argument. Please only use one.") + } + if (is.null(adjust) && length(adNames) > 0L) { + adjust <- dt[, .SD, .SDcols = c(adNames)] + } + + + print <- NULL + if (length(prNames > 0L)) print <- dt[, .SD, .SDcols = eval(prNames)] + + list(y = dt[, .SD, .SDcols = c(suNames)], + print = print, + adjust = adjust, formula = al$arg) +} + + + + + +#' @title Adjust Estimates by Categorical Variables +#' @description This function is only intended to be used within a formula +#' when supplied to e.g. \code{\link{survtab_ag}} and should not be +#' used elsewhere. +#' @param ... variables to adjust by, e.g. \code{adjust(factor(v1), v2, v3)} +#' @return Returns a list of promises of the variables supplied which can be +#' evaluated. +#' @examples +#' +#' y ~ x + adjust(z) +#' @export +adjust <- function(...) { + + call <- sys.call(1L) + call <- as.list(call)[1L] + + if (deparse(call) %in% c("adjust", "list(adjust)")) { + stop("Function adjust() only intended to be used within the formulas of ", + "certain functions of package popEpi. See e.g. ?survtab_ag for usage.") + } + + mc <- as.list(match.call())[-1L] + if (is.list(mc) && length(mc) == 1) mc <- mc[[1]] + mc +} + + + + + +parse_adjust_formula <- function(f) { + + env <- environment(f) + t <- attr(terms(f), "term.labels") + + l <- sapply(t, grepl, pattern = "adjust(", fixed = TRUE) + + a <- t[l] + + if (!any(l)) return(f) + + f <- deparse(f) + f <- paste0(f, collapse = "") + f <- oneWhitespace(f) + + for (k in seq_along(a)) { + f <- sub(x = f, pattern = paste0(" + ", a[k]), replacement = "", fixed = TRUE) + } + + a <- lapply(a, function(stri) { + e <- parse(text = stri)[[1]] + as.list(e)[-1] + }) + + a <- unlist(a, recursive = FALSE) + + a <- sapply(a, function(e) { + + e <- substitute(adjust(e), list(e = e)) + deparse(e) + + }) + + f <- paste0(f, " + ", paste0(a, collapse = " + ")) + f <- eval(parse(text = f)[[1]]) + environment(f) <- env + f +} + + + + + +model_frame_robust <- function(formula, data, enc) { + + stopifnot(inherits(formula, "formula")) + stopifnot(is.environment(enc)) + + pe <- function(x, ...) { + eval(parse(text = x), ...) + } + + fo <- formula + + te <- terms(fo) + tl <- attr(te, "term.labels") + + av <- all.vars(fo) + av <- intersect(av, names(data)) + + ## non-interactions as they are + l <- lapply(tl, function(stri) { + if (stri %in% names(data)) return(data[[stri]]) + e <- try(pe(stri, envir = data, enclos = enc), silent = TRUE) + if (inherits(e, "try-error")) e <- stri + e + }) + + fa <- attr(te, "factors") + + whInter <- colnames(fa)[colSums(fa) > 1] + + whInter <- which(sapply(l, function(elem) { + is.character(elem) && length(elem) == 1L && elem %in% whInter + })) + whVar <- setdiff(seq_along(l), whInter) + + if (sum(whInter)) { + + fa <- fa[, whInter, drop = FALSE] + interList <- lapply(colnames(fa), function(stri) { + + e <- parse(text = stri)[[1]] + e <- as.list(e)[-1] + e <- sapply(e, deparse) + + }) + + names(interList) <- colnames(fa) + + interVars <- unique(unlist(interList)) + + + interData <- lapply(interVars, function(stri) { + if (stri %in% names(data)) return(data[[stri]]) + eval(parse(text = stri), envir = data, enclos = enc) + }) + + interData <- setDT(lapply(interData, as.factor)) + names(interData) <- interVars + + on <- copy(names(interData)) + tn <- paste0("V", seq_along(interData)) + names(tn) <- on + names(interData) <- tn + + til <- lapply(interList, function(stri) { + tn[stri] + }) + names(til) <- sapply(til, paste0, collapse = ":") + + + + l[whInter] <- lapply(seq_along(til), function(i) { + + tmpExpr <- til[[i]] + realExpr <- l[whInter][[i]] + e <- try(pe(tmpExpr, envir = interData), silent = TRUE) + + if (inherits(e, "try-error")) { + e <- pe(realExpr, envir = data, enclos = enc) + } + e + }) + + } + + names(l) <- tl + l <- as.data.table(l) + l +} + + + + + +dt_robust_by <- function(e, by.var.nms) { + stopifnot( + length(e) == 1, + is.character(e), + grepl(x = e, pattern = "by\\s{0,}=\\s{0,}%%BY_VAR_NMS%%"), + length(by.var.nms) == 0 || is.character(by.var.nms) + ) + + le <- paste0("list(", paste0("`", by.var.nms, "`", collapse = ", "), ")") + if (!length(by.var.nms)) le <- "NULL" + + e <- gsub(x = e, pattern = "%%BY_VAR_NMS%%", fixed = TRUE, replacement = le) + eval(parse(text = e), envir = parent.frame(1L)) + NULL +} + + + + + + + + + diff --git a/R/flexyargs.R b/R/flexyargs.R index bdd65b7..3ef315a 100644 --- a/R/flexyargs.R +++ b/R/flexyargs.R @@ -1,166 +1,166 @@ - - - -#' @title Flexible Variable Usage in \pkg{popEpi} Functions -#' @author Joonas Miettinen -#' @name flexible_argument -#' @description Certain arguments in \pkg{popEpi} can be passed in multiple -#' ways. This document shows the usage and a pitfall in the -#' usage of such flexible arguments. -#' -#' @details -#' -#' Flexible arguments in \pkg{popEpi} are used to pass variables existing -#' in your data or in the environment where the function is used -#' (for everyday users this is the global environment - in simple terms, -#' where your data is / your work space). The flexible arguments -#' are modelled after the \code{by} argument in \code{data.tables} - -#' see \code{?data.table}. There are many ways to supply the same information -#' to certain functions in \pkg{popEpi}, but the possible ways listed below -#' may be limited in some of them to only allow for using only a part of them. -#' -#' @section Everyday usage: -#' -#' Most commonly you may pass -#' variable names as character strings, e.g. -#' -#' \code{FUN(arg = c("V1", "V2"), data = x)} -#' -#' which may be stored in advance: -#' -#' \code{vars <- c("V1", "V2")} -#' -#' \code{FUN(arg = vars, data = x)} -#' -#' where \code{x} contains those variables. You may also supply variable -#' names as symbols: -#' -#' \code{FUN(arg = V1, data = x)} -#' -#' Or as a list of symbols (similarly to as in \code{\link{aggregate}}): -#' -#' \code{FUN(arg = list(V1, V2), data = x)} -#' -#' Or as a list of expressions: -#' -#' \code{FUN(arg = list(V1 + 1, factor(V2)), data = x)} -#' -#' A formula without a left-hand-side specified is sometimes allowed as well: -#' -#' \code{FUN(arg = ~ I(V1 + 1) + factor(V2), data = x)} -#' -#' Using a symbol or a list of symbols/expressions typically -#' causes the function to look for the variable(s) -#' first in the supplied data (if any) and then where the function was called. -#' For everyday users this means you might define e.g. -#' -#' \code{V3 <- factor(letters)} -#' -#' and do e.g. -#' -#' \code{FUN(arg = list(V1 + 1, factor(V2), V3), data = x)} -#' -#' provided \code{V1} and \code{V2} exist in \code{x} or in the function calling -#' environment. -#' -#' @section A pitfall: -#' -#' There is one way to use flexible arguments incorrectly: By supplying -#' the name of a variable which exists both in the supplied data -#' and the calling environment, and intending the latter to be used. E.g. -#' -#' \code{vars <- c("V2")} -#' -#' \code{FUN(arg = V3, data = x)} -#' -#' where \code{x} has a column named \code{vars}. This causes the function to -#' use \code{x$vars} and NOT \code{x$V2}. -#' -#' @section Advanced: -#' -#' Function programmers are advised to pass character strings -#' whenever possible. To fool-proof against conflicts as described in the -#' section above, refer to the calling environment explicitly when -#' passing the variable containing the character strings: -#' -#' \code{TF <- environment() ## current env to refer to} -#' -#' \code{vars <- c("V1", "V2")} -#' -#' \code{FUN(arg = TF$vars, data = x)} -#' -#' Even if \code{x} has columns named \code{vars} and \code{TF}, -#' using \code{TF$vars} does not use those columns but only evaluates -#' \code{TF$vars} -#' in the calling environment. This is made possible by the fact -#' that data is always passed as a \code{data.frame}, within which evaluation -#' of expressions using the dollar operator is not possible. Therefore -#' it is safe to assume the data should not be used. However, lists of -#' expressions will not be checked for dollar use and will fail in conflict -#' situations: -#' -#' \code{TF <- environment() ## current env to refer to} -#' -#' \code{vars <- letters[1:5]} -#' -#' \code{x <- data.frame(vars = 1:5, TF = 5:1, V1 = 10:6)} -#' -#' \code{FUN(arg = list(TF$vars, V1), data = x)} -#' -#' On the other hand you may typically also pass quoted (\code{\link{quote}}) -#' or substituted \code{\link{substitute}} expressions etc., where -#' the \code{env$object} trick will work as well: -#' -#' \code{q <- quote(list(vars, V1))} -#' -#' \code{FUN(arg = TF$q, data = x)} -#' -#' This works even with -#' -#' \code{a <- 1:5} -#' -#' \code{V1 <- quote(TF$a)} -#' -#' \code{FUN(arg = TF$V1, data = x)} -#' -#' So no conflicts should occur. -#' @family popEpi_argument -#' @examples -#' -#' data(sire) -#' ## prepare data for e.g. 5-year "period analysis" for 2008-2012 -#' ## note: sire is a simulated cohort integrated into popEpi. -#' BL <- list(fot=seq(0, 5, by = 1/12)) -#' x <- lexpand(sire, birth = bi_date, entry = dg_date, exit = ex_date, -#' status = status %in% 1:2, -#' breaks = BL) -#' -#' x <- aggre(x, by = fot) -#' -#' ## silly example of referring to pyrs data by fixed character string; -#' ## its possible that the real name wont be fixed in a real-life application. -#' pyrs <- "actual_pyrs" -#' TF <- environment() -#' x$actual_pyrs <- as.numeric(x$pyrs) -#' x$pyrs <- 1 -#' -#' ## this works (uses actual_pyrs eventually) -#' st <- survtab_ag(fot ~ 1, data = x, surv.type = "surv.obs", -#' pyrs = TF$pyrs, d = from0to1, -#' surv.method = "hazard") -#' ## this would be wrong (sees expression 'pyrs' and uses that column, -#' ## which is not what is intended here) -#' st <- survtab_ag(fot ~ 1, data = x, surv.type = "surv.obs", -#' pyrs = pyrs, d = from0to1, -#' surv.method = "hazard") - -NULL - - - - - - - - - + + + +#' @title Flexible Variable Usage in \pkg{popEpi} Functions +#' @author Joonas Miettinen +#' @name flexible_argument +#' @description Certain arguments in \pkg{popEpi} can be passed in multiple +#' ways. This document shows the usage and a pitfall in the +#' usage of such flexible arguments. +#' +#' @details +#' +#' Flexible arguments in \pkg{popEpi} are used to pass variables existing +#' in your data or in the environment where the function is used +#' (for everyday users this is the global environment - in simple terms, +#' where your data is / your work space). The flexible arguments +#' are modelled after the \code{by} argument in \code{data.tables} - +#' see \code{?data.table}. There are many ways to supply the same information +#' to certain functions in \pkg{popEpi}, but the possible ways listed below +#' may be limited in some of them to only allow for using only a part of them. +#' +#' @section Everyday usage: +#' +#' Most commonly you may pass +#' variable names as character strings, e.g. +#' +#' \code{FUN(arg = c("V1", "V2"), data = x)} +#' +#' which may be stored in advance: +#' +#' \code{vars <- c("V1", "V2")} +#' +#' \code{FUN(arg = vars, data = x)} +#' +#' where \code{x} contains those variables. You may also supply variable +#' names as symbols: +#' +#' \code{FUN(arg = V1, data = x)} +#' +#' Or as a list of symbols (similarly to as in \code{\link{aggregate}}): +#' +#' \code{FUN(arg = list(V1, V2), data = x)} +#' +#' Or as a list of expressions: +#' +#' \code{FUN(arg = list(V1 + 1, factor(V2)), data = x)} +#' +#' A formula without a left-hand-side specified is sometimes allowed as well: +#' +#' \code{FUN(arg = ~ I(V1 + 1) + factor(V2), data = x)} +#' +#' Using a symbol or a list of symbols/expressions typically +#' causes the function to look for the variable(s) +#' first in the supplied data (if any) and then where the function was called. +#' For everyday users this means you might define e.g. +#' +#' \code{V3 <- factor(letters)} +#' +#' and do e.g. +#' +#' \code{FUN(arg = list(V1 + 1, factor(V2), V3), data = x)} +#' +#' provided \code{V1} and \code{V2} exist in \code{x} or in the function calling +#' environment. +#' +#' @section A pitfall: +#' +#' There is one way to use flexible arguments incorrectly: By supplying +#' the name of a variable which exists both in the supplied data +#' and the calling environment, and intending the latter to be used. E.g. +#' +#' \code{vars <- c("V2")} +#' +#' \code{FUN(arg = V3, data = x)} +#' +#' where \code{x} has a column named \code{vars}. This causes the function to +#' use \code{x$vars} and NOT \code{x$V2}. +#' +#' @section Advanced: +#' +#' Function programmers are advised to pass character strings +#' whenever possible. To fool-proof against conflicts as described in the +#' section above, refer to the calling environment explicitly when +#' passing the variable containing the character strings: +#' +#' \code{TF <- environment() ## current env to refer to} +#' +#' \code{vars <- c("V1", "V2")} +#' +#' \code{FUN(arg = TF$vars, data = x)} +#' +#' Even if \code{x} has columns named \code{vars} and \code{TF}, +#' using \code{TF$vars} does not use those columns but only evaluates +#' \code{TF$vars} +#' in the calling environment. This is made possible by the fact +#' that data is always passed as a \code{data.frame}, within which evaluation +#' of expressions using the dollar operator is not possible. Therefore +#' it is safe to assume the data should not be used. However, lists of +#' expressions will not be checked for dollar use and will fail in conflict +#' situations: +#' +#' \code{TF <- environment() ## current env to refer to} +#' +#' \code{vars <- letters[1:5]} +#' +#' \code{x <- data.frame(vars = 1:5, TF = 5:1, V1 = 10:6)} +#' +#' \code{FUN(arg = list(TF$vars, V1), data = x)} +#' +#' On the other hand you may typically also pass quoted (\code{\link{quote}}) +#' or substituted \code{\link{substitute}} expressions etc., where +#' the \code{env$object} trick will work as well: +#' +#' \code{q <- quote(list(vars, V1))} +#' +#' \code{FUN(arg = TF$q, data = x)} +#' +#' This works even with +#' +#' \code{a <- 1:5} +#' +#' \code{V1 <- quote(TF$a)} +#' +#' \code{FUN(arg = TF$V1, data = x)} +#' +#' So no conflicts should occur. +#' @family popEpi argument evaluation docs +#' @examples +#' +#' data(sire) +#' ## prepare data for e.g. 5-year "period analysis" for 2008-2012 +#' ## note: sire is a simulated cohort integrated into popEpi. +#' BL <- list(fot=seq(0, 5, by = 1/12)) +#' x <- lexpand(sire, birth = bi_date, entry = dg_date, exit = ex_date, +#' status = status %in% 1:2, +#' breaks = BL) +#' +#' x <- aggre(x, by = fot) +#' +#' ## silly example of referring to pyrs data by fixed character string; +#' ## its possible that the real name wont be fixed in a real-life application. +#' pyrs <- "actual_pyrs" +#' TF <- environment() +#' x$actual_pyrs <- as.numeric(x$pyrs) +#' x$pyrs <- 1 +#' +#' ## this works (uses actual_pyrs eventually) +#' st <- survtab_ag(fot ~ 1, data = x, surv.type = "surv.obs", +#' pyrs = TF$pyrs, d = from0to1, +#' surv.method = "hazard") +#' ## this would be wrong (sees expression 'pyrs' and uses that column, +#' ## which is not what is intended here) +#' st <- survtab_ag(fot ~ 1, data = x, surv.type = "surv.obs", +#' pyrs = pyrs, d = from0to1, +#' surv.method = "hazard") + +NULL + + + + + + + + + diff --git a/R/fractional_years.R b/R/fractional_years.R index e7f3bd3..de69209 100644 --- a/R/fractional_years.R +++ b/R/fractional_years.R @@ -1,150 +1,150 @@ - - -#' @title Convert date objects to fractional years -#' @author Joonas Miettinen -#' @description Using Date objects, calculates given -#' dates as fractional years. -#' @param x a \code{Date} object, or anything that \code{link{as.Date}} -#' accepts -#' @param year.length character string, either \code{'actual'} or -#' \code{'approx'}; can be abbreviated; see Details -#' @param ... additional arguments passed on to \code{\link{as.Date}}; -#' typically \code{format} when \code{x} is a character string variable, -#' and \code{origin} when \code{x} is numeric -#' @import data.table -#' @export -#' @details -#' -#' \code{x} should preferably be a \code{date}, \code{Date} or \code{IDate} -#' object, although it can also be a character string variable -#' which is coerced internally to \code{Date} format -#' using \code{\link{as.Date.character}}. -#' -#' When \code{ year.length = 'actual' }, fractional years are calculated as -#' \code{ year + (day_in_year-1)/365 } for non-leap-years -#' and as \code{ year + (day_in_year-1)/366 } for leap years. -#' If \code{ year.length = 'approx' }, fractional years are always -#' calculated as in \code{ year + (day_in_year-1)/365.242199 }. -#' -#' There is a slight difference, then, between the two methods -#' when calculating durations between fractional years. For -#' meticulous accuracy one might instead want to calculate durations using -#' dates (days) and convert the results to fractional years. -#' -#' Note that dates are effectively converted to fractional years at -#' \code{ 00:00:01 } o'clock: -#' -#' -#' \code{ get.yrs("2000-01-01") = 2000 }, and -#' \code{ get.yrs("2000-01-02") = 2000 + 1/365.242199 }. -#' -#' -#' @seealso -#' \code{\link[Epi]{cal.yr}}, \code{\link{as.Date.yrs}}, \code{\link{as.Date}} -#' -#' @examples -#' -#' data("sire") -#' sire$dg_yrs <- get.yrs(sire$dg_date) -#' summary(sire$dg_yrs) -#' -#' ## see: ?as.Date.yrs -#' dg_date2 <- as.Date(sire$dg_yrs) -#' summary(as.numeric(dg_date2 - sire$dg_date)) -#' -#' ## Epi's cal.yr versus get.yrs -#' d <- as.Date("2000-01-01") -#' Epi::cal.yr(d) ## 1999.999 -#' get.yrs(d) ## 2000 -#' -#' ## "..." passed on to as.Date, so character / numeric also accepted as input -#' ## (and whatever else as.Date accepts) -#' get.yrs("2000-06-01") -#' get.yrs("20000601", format = "%Y%m%d") -#' get.yrs("1/6/00", format = "%d/%m/%y") -#' -#' get.yrs(100, origin = "1970-01-01") -#' -#' -get.yrs <- function(x, year.length = "approx", ...) { - as.yrs(x, year.length = year.length, ...) -} - - -as.yrs <- function(x, year.length, ...) { - UseMethod("as.yrs") -} - -as.yrs.Date <- function(x, year.length = "approx", ...) { - year.length <- match.arg(year.length, c("actual", "approx")) - - yl <- 365.242199 - y <- year(x) - if (year.length == "actual") { - yl <- ifelse(is_leap_year(y), 366L, 365L) - } - d <- yday(x) - - yrs <- y + (d - 1L)/yl - setattr(yrs, "year.length", year.length) - setattr(yrs, "class", c("yrs", "numeric")) - yrs -} - -as.yrs.default <- function(x, year.length = "approx", ...) { - - x <- as.Date(x, ...) - as.yrs(x, year.length = year.length) - -} - - - -#' @title Coerce Fractional Year Values to Date Values -#' @author Joonas Miettinen -#' @param x an \code{yrs} object created by \code{get.yrs} -#' @param ... unused, included for compatibility with other \code{as.Date} -#' methods -#' @description Coerces an \code{yrs} object to a \code{Date} object. -#' Some loss of information comes if \code{year.length = "approx"} -#' was set when using \code{\link{get.yrs}}, so the transformation back -#' to \code{Date} will not be perfect there. With \code{year.length = "actual"} -#' the original values are perfectly retrieved. -#' @examples -#' data("sire", package = "popEpi") -#' -#' ## approximate year lengths: here 20 % have an extra day added -#' sire$dg_yrs <- get.yrs(sire$dg_date) -#' summary(sire$dg_yrs) -#' dg_date2 <- as.Date(sire$dg_yrs) -#' summary(as.numeric(dg_date2 - sire$dg_date)) -#' -#' ## using actual year lengths -#' sire$dg_yrs <- get.yrs(sire$dg_date, year.length = "actual") -#' summary(sire$dg_yrs) -#' dg_date2 <- as.Date(sire$dg_yrs) -#' summary(as.numeric(dg_date2 - sire$dg_date)) -#' @seealso \code{\link{get.yrs}} -#' @export -as.Date.yrs <- function(x, ...) { - - yl <- attr(x, "year.length") - if (is.null(yl)) { - warning("x did not contain meta information about year length used ", - "when forming the yrs object. Assuming 'approx'.") - yl <- "approx" - } - - y <- as.integer(x) - - mu <- 365.242199 - if (yl == "actual") { - mu <- ifelse(is_leap_year(y), rep(365L, length(x)), rep(364L, length(x))) - } - x <- x + 1L/mu - yd <- as.integer((x-y)*mu) - d <- as.Date(paste0(y, "-01-01")) + yd - d -} - - + + +#' @title Convert date objects to fractional years +#' @author Joonas Miettinen +#' @description Using Date objects, calculates given +#' dates as fractional years. +#' @param x a \code{Date} object, or anything that \code{link{as.Date}} +#' accepts +#' @param year.length character string, either \code{'actual'} or +#' \code{'approx'}; can be abbreviated; see Details +#' @param ... additional arguments passed on to \code{\link{as.Date}}; +#' typically \code{format} when \code{x} is a character string variable, +#' and \code{origin} when \code{x} is numeric +#' @import data.table +#' @export +#' @details +#' +#' \code{x} should preferably be a \code{date}, \code{Date} or \code{IDate} +#' object, although it can also be a character string variable +#' which is coerced internally to \code{Date} format +#' using \code{\link{as.Date.character}}. +#' +#' When \code{ year.length = 'actual' }, fractional years are calculated as +#' \code{ year + (day_in_year-1)/365 } for non-leap-years +#' and as \code{ year + (day_in_year-1)/366 } for leap years. +#' If \code{ year.length = 'approx' }, fractional years are always +#' calculated as in \code{ year + (day_in_year-1)/365.242199 }. +#' +#' There is a slight difference, then, between the two methods +#' when calculating durations between fractional years. For +#' meticulous accuracy one might instead want to calculate durations using +#' dates (days) and convert the results to fractional years. +#' +#' Note that dates are effectively converted to fractional years at +#' \code{ 00:00:01 } o'clock: +#' +#' +#' \code{ get.yrs("2000-01-01") = 2000 }, and +#' \code{ get.yrs("2000-01-02") = 2000 + 1/365.242199 }. +#' +#' +#' @seealso +#' \code{\link[Epi]{cal.yr}}, \code{\link{as.Date.yrs}}, \code{\link{as.Date}} +#' +#' @examples +#' +#' data("sire") +#' sire$dg_yrs <- get.yrs(sire$dg_date) +#' summary(sire$dg_yrs) +#' +#' ## see: ?as.Date.yrs +#' dg_date2 <- as.Date(sire$dg_yrs) +#' summary(as.numeric(dg_date2 - sire$dg_date)) +#' +#' ## Epi's cal.yr versus get.yrs +#' d <- as.Date("2000-01-01") +#' Epi::cal.yr(d) ## 1999.999 +#' get.yrs(d) ## 2000 +#' +#' ## "..." passed on to as.Date, so character / numeric also accepted as input +#' ## (and whatever else as.Date accepts) +#' get.yrs("2000-06-01") +#' get.yrs("20000601", format = "%Y%m%d") +#' get.yrs("1/6/00", format = "%d/%m/%y") +#' +#' get.yrs(100, origin = "1970-01-01") +#' +#' +get.yrs <- function(x, year.length = "approx", ...) { + as.yrs(x, year.length = year.length, ...) +} + + +as.yrs <- function(x, year.length, ...) { + UseMethod("as.yrs") +} + +as.yrs.Date <- function(x, year.length = "approx", ...) { + year.length <- match.arg(year.length, c("actual", "approx")) + + yl <- 365.242199 + y <- year(x) + if (year.length == "actual") { + yl <- ifelse(is_leap_year(y), 366L, 365L) + } + d <- yday(x) + + yrs <- y + (d - 1L)/yl + setattr(yrs, "year.length", year.length) + setattr(yrs, "class", c("yrs", "numeric")) + yrs +} + +as.yrs.default <- function(x, year.length = "approx", ...) { + + x <- as.Date(x, ...) + as.yrs(x, year.length = year.length) + +} + + + +#' @title Coerce Fractional Year Values to Date Values +#' @author Joonas Miettinen +#' @param x an \code{yrs} object created by \code{get.yrs} +#' @param ... unused, included for compatibility with other \code{as.Date} +#' methods +#' @description Coerces an \code{yrs} object to a \code{Date} object. +#' Some loss of information comes if \code{year.length = "approx"} +#' was set when using \code{\link{get.yrs}}, so the transformation back +#' to \code{Date} will not be perfect there. With \code{year.length = "actual"} +#' the original values are perfectly retrieved. +#' @examples +#' data("sire", package = "popEpi") +#' +#' ## approximate year lengths: here 20 % have an extra day added +#' sire$dg_yrs <- get.yrs(sire$dg_date) +#' summary(sire$dg_yrs) +#' dg_date2 <- as.Date(sire$dg_yrs) +#' summary(as.numeric(dg_date2 - sire$dg_date)) +#' +#' ## using actual year lengths +#' sire$dg_yrs <- get.yrs(sire$dg_date, year.length = "actual") +#' summary(sire$dg_yrs) +#' dg_date2 <- as.Date(sire$dg_yrs) +#' summary(as.numeric(dg_date2 - sire$dg_date)) +#' @seealso \code{\link{get.yrs}} +#' @export +as.Date.yrs <- function(x, ...) { + + yl <- attr(x, "year.length") + if (is.null(yl)) { + warning("x did not contain meta information about year length used ", + "when forming the yrs object. Assuming 'approx'.") + yl <- "approx" + } + + y <- as.integer(x) + + mu <- 365.242199 + if (yl == "actual") { + mu <- ifelse(is_leap_year(y), rep(365L, length(x)), rep(364L, length(x))) + } + x <- x + 1L/mu + yd <- as.integer((x-y)*mu) + d <- as.Date(paste0(y, "-01-01")) + yd + d +} + + diff --git a/R/incidence_rates.R b/R/incidence_rates.R index 5e1f6ff..82c50e2 100644 --- a/R/incidence_rates.R +++ b/R/incidence_rates.R @@ -1,343 +1,346 @@ -#' @title Direct-Standardised Incidence/Mortality Rates -#' @author Matti Rantanen, Joonas Miettinen -#' -#' @description \code{rate} calculates adjusted rates using -#' preloaded weights data or user specified weights. -#' -#' @param data aggregated data (see e.g. \code{\link{lexpand}}, -#' \code{\link{aggre}} if you have subject-level data) -#' @param pyrs person-years variable name in data. -#' \link[=flexible_argument]{Flexible input}, typically e.g. -#' \code{pyrs = pyrs}. -#' @param obs observations variable name in data. -#' \link[=flexible_argument]{Flexible input}, typically e.g. -#' \code{obs = obs}. -#' @param adjust variable for adjusting the rates. -#' \link[=flexible_argument]{Flexible input}, typically e.g. -#' \code{adjust = agegroup}. -#' @param print variable name to stratify the rates. -#' \link[=flexible_argument]{Flexible input}, typically e.g. -#' \code{print = sex} or \code{print = list(sex, area)}. -#' @param weights typically a list of weights or a \code{character} string -#' specifying an age group standardization scheme; see -#' the \link[=direct_standardization]{dedicated help page} -#' and examples. -#' -#' @param subset a logical expression to subset data. -#' -#' @details Input data needs to be in aggregated format with observations -#' and person-years. For individual data use \code{\link{lexpand}}, or -#' \code{\link{ltable}} and merge person-years manually. -#' -#' -#' @return Returns a \code{data.table} with observations, person-years, rates and -#' adjusted rates, if available. Results are stratified by \code{print}. -#' Adjusted rates are identified with suffix \code{.adj} and -#' \code{.lo} and \code{.hi} are for confidence intervals lower and upper -#' 95\% bounds, respectively. -#' The prefix \code{SE.} stands for standard error. -#' -#' @seealso \code{\link{lexpand}}, \code{\link{ltable}} -#' -#' @examples -#' ## Prepare data with lexpand and then reformat agegroup. -#' data(sibr) -#' x <- lexpand(sibr, birth = bi_date, entry = dg_date, exit = ex_date, -#' breaks = list(per = c(1990,2000,2010,2020), age = c(0:17*5,Inf)), -#' aggre = list(agegroup = age, year.cat = per), -#' status = status != 0) -#' -#' x$agegroup <- cut(x$agegroup, c(0:17*5,Inf), right = FALSE) -#' -#' ## calculate rates for selected periods with Nordic 2000 weights: -#' r1 <- rate( data = x, obs = from0to1, pyrs = pyrs, print = year.cat, -#' adjust = agegroup, weights = 'nordic') -#' r1 -#' -#' ## use total person-years by stratum as weights (some have zero) -#' w <- ltable(x, by.vars = "agegroup", expr = sum(pyrs)) -#' w[is.na(w$V1),]$V1 <- 0 -#' -#' r2 <- rate( data = x, obs = from0to1, pyrs = pyrs, print = year.cat, -#' adjust = agegroup, -#' weights = w$V1) -#' r2 -#' -#' ## use data.frame of weights: -#' names(w) <- c("agegroup", "weights") -#' r2 <- rate( data = x, obs = from0to1, pyrs = pyrs, print = year.cat, -#' adjust = agegroup, -#' weights = w) -#' r2 -#' -#' ## internal weights (same result as above) -#' r3 <- rate( data = x, obs = from0to1, pyrs = pyrs, print = year.cat, -#' adjust = agegroup, -#' weights = "internal") -#' r3 -#' -#' @import data.table -#' @export -#' @family main functions -#' @family rate functions - -rate <- function( data, - obs = NULL, - pyrs = NULL, - print = NULL, - adjust = NULL, - weights = NULL, - subset = NULL -) { - - PF <- parent.frame(1L) - TF <- environment() - - ## subsetting ----------------------------------------------------------- - subset <- substitute(subset) - subset <- evalLogicalSubset(data = data, substiset = subset, enclos = PF) - data <- data[subset,] - setDT(data) - - # evalPopArg - obs <- substitute(obs) - inc.obs <- evalPopArg(data = data, arg = obs, enclos = PF) - if (!length(inc.obs)) { - stop("No observations given.") - } - obsNames <- copy(names(inc.obs)) - tmpObsNames <- makeTempVarName(data = data, pre = "obs") - setnames(inc.obs, obsNames, tmpObsNames) - - pyrs <- substitute(pyrs) - inc.pyr <- evalPopArg(data = data, arg = pyrs, enclos = PF) - if (!length(inc.pyr)) { - stop("No pyrs given.") - } - pyrNames <- copy(names(inc.pyr)) - tmpPyrNames <- makeTempVarName(data = data, pre = "pyr") - setnames(inc.pyr, pyrNames, tmpPyrNames) - - print <- substitute(print) - inc.pri <- evalPopArg(data = data, arg = print, enclos = PF) - prNames <- tmpPrNames <- NULL - if (length(inc.pri)) { - prNames <- copy(names(inc.pri)) - tmpPrNames <- makeTempVarName(data = data, - pre = paste0("print", seq_along(prNames))) - setnames(inc.pri, prNames, tmpPrNames) - } - - adjust <- substitute(adjust) - inc.adj <- evalPopArg(data = data, arg = adjust, enclos = PF) - adNames <- tmpAdNames <- NULL - if (length(inc.adj)) { - adNames <- copy(names(inc.adj)) - tmpAdNames <- makeTempVarName(data = data, - pre = paste0("adjust", seq_along(adNames))) - setnames(inc.adj, adNames, tmpAdNames) - } - - ## collect data -------------------------------------------------------------- - data <- cbind(inc.obs, inc.pyr) - if (!is.null(prNames)) data <- cbind(data, inc.pri) - if (!is.null(adNames)) data <- cbind(data, inc.adj) - - - ## handle weights ------------------------------------------------------------ - weights <- substitute(weights) - weights <- eval(weights, envir = PF) - weights <- copy(weights) - if (length(inc.adj)) { - ## rename adjust variables in inc.adj back to original names - ## for more human-readable errors in checkWeights if any occur - setnames(inc.adj, tmpAdNames, adNames) - } - - checkWeights(weights, inc.adj) - if (is.list(weights) && !is.data.frame(weights)) { - ## ensure weights list / DF names match to temp adjust var names - weights <- weights[adNames] - names(weights) <- tmpAdNames - } else if (is.data.frame(weights)) { - setnames(weights, adNames, tmpAdNames) - } - - ## form table with weights --------------------------------------------------- - NA.msg <- "Data contains %%NA_COUNT%% NA values." - data <- makeWeightsDT(data, - values = list(tmpObsNames, tmpPyrNames), - print = tmpPrNames, - adjust = tmpAdNames, - weights = weights, - internal.weights.values = tmpPyrNames, - NA.text = NA.msg) - - ## estimate standardized rates ----------------------------------------------- - data <- rate_est(data = data, - obs = tmpObsNames, - pyrs = tmpPyrNames, - print = tmpPrNames, - weights = "weights") - - ## final touch --------------------------------------------------------------- - setDT(data) - setattr(data, "class", c("rate", "data.table", "data.frame")) - setattr(data, name = 'rate.meta', value = list(obs = obsNames, - pyrs = pyrNames, - weights = weights, - adjust = adNames, - print = prNames, - call = match.call(), - NAs = NA)) - setnames(data, c(tmpObsNames, tmpPyrNames, tmpPrNames), - c(obsNames, pyrNames, prNames)) - - # data.frame output option - if (!return_DT()) { - setDFpe(data) - } - - return(data[]) -} - -#' @export -getCall.rate <- function (x, ...) { - attributes(x)$rate.meta$call -} - -stdr.weights <- function(wp = 'world00_1') { - - ## This one returns the standard population - ## output: data.table with colnames: agegroup, reference - ## standard populations are from datasets: stdpop18 and stdpop101 - allow.pop <- c("world_1966_18of5", - "europe_1976_18of5", - "nordic_2000_18of5", - "world_2000_18of5", - "world_2000_20of5", - "world_2000_101of1") - wp <- match.arg(wp, allow.pop) - - if (length(wp) > 1) { - stop('Standard population name is not a scalar (length != 1).') - - } else if (wp %in% allow.pop[1:3]) { - - # get standard pop - sr <- data.table(popEpi::stdpop18) - setnames(sr, 1:4, c("agegroup",allow.pop[1:3])) - sr[, agegroup := 1:18] - sr[, setdiff(allow.pop[1:3], wp) := NULL] - - setnames(sr, wp, 'reference') - - } else if (wp %in% allow.pop[4:6]) { - - sr <- data.table(popEpi::stdpop101) - if (wp == "world_2000_18of5") { - sr[,agegroup := cut(agegroup, breaks=c(0:17*5,Inf), right=FALSE, labels=FALSE)] - sr <- sr[,list(world_std = sum(world_std)), by="agegroup"] - } - if (wp == 'world_2000_20of5') { - sr[,agegroup := cut(agegroup, breaks=c(0:19*5,Inf), right=FALSE, labels=FALSE)] - sr <- sr[,list(world_std = sum(world_std)), by="agegroup"] - } - else { - sr <- sr[,list(world_std = sum(world_std)), by="agegroup"] - } - setnames(sr, "world_std", "reference") - } - else { - stop("Invalid standard population name.") - } - sr[] -} -globalVariables(c('stdpop18','stdpop101','agegroup','world_std')) - - -rate_est <- function(data = data, - obs = 'obs', - pyrs = 'pyrs', - print = NULL, - weights = NULL -) { - ## This one estimates the rates and calculates CI's and SE's. - - badVars <- paste0("Internal error: missing following variable names in ", - "working data: %%VARS%%. Complain to the pkg maintainer ", - "if you see this.") - all_names_present(data, c(obs, pyrs, print), msg = badVars) - - data <- data.table(data) - if ( is.null(weights) | !weights %in% colnames(data)) { - weights <- NULL - } - - if (all(!is.null(weights), !is.null(obs), !is.null(pyrs))) { - # rate.adj - - f2 <- function(list) list[[1]]/list[[2]]*list[[3]] - funx <- function(n,d,w,fun) eval(parse(text=fun)) - - - # variance rate.adj for each strata A - fun1 <- '(._d_/._n_^2) * ._w_^2' - fun2 <- '._d_ / ._n_ * ._w_' - - make_fun <- function(n = NA, d = NA, w = NA, fun) { - fun <- gsub(pattern = "._n_", replacement = n, x = fun) - fun <- gsub(pattern = "._d_", replacement = d, x = fun) - fun <- gsub(pattern = "._w_", replacement = w, x = fun) - parse(text = fun) - } - eval.me1 <- make_fun(d = obs, n = pyrs, w=weights, fun = fun1) - eval.me2 <- make_fun(d = obs, n = pyrs, w=weights, fun = fun2) - data[, var.temp := eval(eval.me1)] - data[, lam.temp := eval(eval.me2)] - # add std weighted rates and variances - #data[, ':='(var.temp = funx(d=get(obs), n=get(pyrs), w=get(weights), fun = fun1), - # lam.temp = funx(d=get(obs), n=get(pyrs), w=get(weights), fun = fun2)) ] - data[, rate.adj := f2(.SD), .SDcols= c(obs, pyrs, weights)] - - # aggregate data - ie <- paste0('list(', obs, '=sum(',obs,',na.rm=TRUE), ', pyrs, '=sum(',pyrs,',na.rm=TRUE),', - 'rate.adj=sum(rate.adj,na.rm=TRUE),' ,'lam.temp=sum(lam.temp,na.rm=TRUE), var.temp=sum(var.temp,na.rm=TRUE))') - l <- parse(text = ie) - - data <- data[, eval(l), by=print] - # rate.adj: S.E. - data[, SE.log.rate.adj := sqrt((1/lam.temp)^2 * var.temp) ] # tämä on log-rate - data[, SE.rate.adj := sqrt(var.temp)] - # rate.adj: CI - data[, ':='(rate.adj.lo = exp( log(rate.adj) - SE.log.rate.adj*1.96 ), - rate.adj.hi = exp( log(rate.adj) + SE.log.rate.adj*1.96 )) ] - data[,c('lam.temp','var.temp','SE.log.rate.adj') := NULL] - } - - else { - ie <- paste0('list(', obs, '=sum(',obs,'), ', pyrs, '=sum(',pyrs,'))') - l <- parse(text = ie) - data <- data[, eval(l), by=print] - } - # rate - ia <- paste0('rate := ',obs,'/', pyrs) - k <- parse(text = ia) - data[, eval(k), by = print] - - # var(rate) - var_r <- paste0('SE.rate := sqrt(',obs,'/(',pyrs,'*',pyrs,'))') - k <- parse(text = var_r) - data[, eval(k), by = print] - - # var(log(rate)) and CI - eval.me3 <- paste('exp(sqrt(1/',obs,'))') - eval.me3 <- parse(text = eval.me3) - data[, SE.log.rate := eval(eval.me3)] - data[, ':='(rate.lo = exp(log(rate)-log(SE.log.rate)*1.96), - rate.hi = exp(log(rate)+log(SE.log.rate)*1.96)) ] - data[, SE.log.rate := NULL] - return(data[]) -} - -globalVariables(c('var.temp','lam.temp','rate.adj','SE.rate.adj','SE.rate','SE.log.rate','SE.log.rate.adj')) +#' @title Direct-Standardised Incidence/Mortality Rates +#' @author Matti Rantanen, Joonas Miettinen +#' +#' @description \code{rate} calculates adjusted rates using +#' preloaded weights data or user specified weights. +#' +#' @param data aggregated data (see e.g. \code{\link{lexpand}}, +#' \code{\link{aggre}} if you have subject-level data) +#' @param pyrs person-years variable name in data. +#' \link[=flexible_argument]{Flexible input}, typically e.g. +#' \code{pyrs = pyrs}. +#' @param obs observations variable name in data. +#' \link[=flexible_argument]{Flexible input}, typically e.g. +#' \code{obs = obs}. +#' @param adjust variable for adjusting the rates. +#' \link[=flexible_argument]{Flexible input}, typically e.g. +#' \code{adjust = agegroup}. +#' @param print variable name to stratify the rates. +#' \link[=flexible_argument]{Flexible input}, typically e.g. +#' \code{print = sex} or \code{print = list(sex, area)}. +#' @param weights typically a list of weights or a \code{character} string +#' specifying an age group standardization scheme; see +#' the \link[=direct_standardization]{dedicated help page} +#' and examples. +#' +#' @param subset a logical expression to subset data. +#' +#' @details Input data needs to be in aggregated format with observations +#' and person-years. For individual data use \code{\link{lexpand}}, or +#' \code{\link{ltable}} and merge person-years manually. +#' +#' The confidence intervals are based on the normal approximation of the logarithm of the rate. +#' The variance of the log rate that is used to derive the confidence intervals +#' is derived using the delta method. +#' +#' @return Returns a \code{data.table} with observations, person-years, rates and +#' adjusted rates, if available. Results are stratified by \code{print}. +#' Adjusted rates are identified with suffix \code{.adj} and +#' \code{.lo} and \code{.hi} are for confidence intervals lower and upper +#' 95\% bounds, respectively. +#' The prefix \code{SE.} stands for standard error. +#' +#' @seealso \code{\link{lexpand}}, \code{\link{ltable}} +#' +#' @examples +#' ## Prepare data with lexpand and then reformat agegroup. +#' data(sibr) +#' x <- lexpand(sibr, birth = bi_date, entry = dg_date, exit = ex_date, +#' breaks = list(per = c(1990,2000,2010,2020), age = c(0:17*5,Inf)), +#' aggre = list(agegroup = age, year.cat = per), +#' status = status != 0) +#' +#' x$agegroup <- cut(x$agegroup, c(0:17*5,Inf), right = FALSE) +#' +#' ## calculate rates for selected periods with Nordic 2000 weights: +#' r1 <- rate( data = x, obs = from0to1, pyrs = pyrs, print = year.cat, +#' adjust = agegroup, weights = 'nordic') +#' r1 +#' +#' ## use total person-years by stratum as weights (some have zero) +#' w <- ltable(x, by.vars = "agegroup", expr = sum(pyrs)) +#' w[is.na(w$V1),]$V1 <- 0 +#' +#' r2 <- rate( data = x, obs = from0to1, pyrs = pyrs, print = year.cat, +#' adjust = agegroup, +#' weights = w$V1) +#' r2 +#' +#' ## use data.frame of weights: +#' names(w) <- c("agegroup", "weights") +#' r2 <- rate( data = x, obs = from0to1, pyrs = pyrs, print = year.cat, +#' adjust = agegroup, +#' weights = w) +#' r2 +#' +#' ## internal weights (same result as above) +#' r3 <- rate( data = x, obs = from0to1, pyrs = pyrs, print = year.cat, +#' adjust = agegroup, +#' weights = "internal") +#' r3 +#' +#' @import data.table +#' @export +#' @family main functions +#' @family rate functions + +rate <- function( data, + obs = NULL, + pyrs = NULL, + print = NULL, + adjust = NULL, + weights = NULL, + subset = NULL +) { + + PF <- parent.frame(1L) + TF <- environment() + + ## subsetting ----------------------------------------------------------- + subset <- substitute(subset) + subset <- evalLogicalSubset(data = data, substiset = subset, enclos = PF) + data <- data[subset,] + setDT(data) + + # evalPopArg + obs <- substitute(obs) + inc.obs <- evalPopArg(data = data, arg = obs, enclos = PF) + if (!length(inc.obs)) { + stop("No observations given.") + } + obsNames <- copy(names(inc.obs)) + tmpObsNames <- makeTempVarName(data = data, pre = "obs") + setnames(inc.obs, obsNames, tmpObsNames) + + pyrs <- substitute(pyrs) + inc.pyr <- evalPopArg(data = data, arg = pyrs, enclos = PF) + if (!length(inc.pyr)) { + stop("No pyrs given.") + } + pyrNames <- copy(names(inc.pyr)) + tmpPyrNames <- makeTempVarName(data = data, pre = "pyr") + setnames(inc.pyr, pyrNames, tmpPyrNames) + + print <- substitute(print) + inc.pri <- evalPopArg(data = data, arg = print, enclos = PF) + prNames <- tmpPrNames <- NULL + if (length(inc.pri)) { + prNames <- copy(names(inc.pri)) + tmpPrNames <- makeTempVarName(data = data, + pre = paste0("print", seq_along(prNames))) + setnames(inc.pri, prNames, tmpPrNames) + } + + adjust <- substitute(adjust) + inc.adj <- evalPopArg(data = data, arg = adjust, enclos = PF) + adNames <- tmpAdNames <- NULL + if (length(inc.adj)) { + adNames <- copy(names(inc.adj)) + tmpAdNames <- makeTempVarName(data = data, + pre = paste0("adjust", seq_along(adNames))) + setnames(inc.adj, adNames, tmpAdNames) + } + + ## collect data -------------------------------------------------------------- + data <- cbind(inc.obs, inc.pyr) + if (!is.null(prNames)) data <- cbind(data, inc.pri) + if (!is.null(adNames)) data <- cbind(data, inc.adj) + + + ## handle weights ------------------------------------------------------------ + weights <- substitute(weights) + weights <- eval(weights, envir = PF) + weights <- copy(weights) + if (length(inc.adj)) { + ## rename adjust variables in inc.adj back to original names + ## for more human-readable errors in checkWeights if any occur + setnames(inc.adj, tmpAdNames, adNames) + } + + checkWeights(weights, inc.adj) + if (is.list(weights) && !is.data.frame(weights)) { + ## ensure weights list / DF names match to temp adjust var names + weights <- weights[adNames] + names(weights) <- tmpAdNames + } else if (is.data.frame(weights)) { + setnames(weights, adNames, tmpAdNames) + } + + ## form table with weights --------------------------------------------------- + NA.msg <- "Data contains %%NA_COUNT%% NA values." + data <- makeWeightsDT(data, + values = list(tmpObsNames, tmpPyrNames), + print = tmpPrNames, + adjust = tmpAdNames, + weights = weights, + internal.weights.values = tmpPyrNames, + NA.text = NA.msg) + + ## estimate standardized rates ----------------------------------------------- + data <- rate_est(data = data, + obs = tmpObsNames, + pyrs = tmpPyrNames, + print = tmpPrNames, + weights = "weights") + + ## final touch --------------------------------------------------------------- + setDT(data) + setattr(data, "class", c("rate", "data.table", "data.frame")) + setattr(data, name = 'rate.meta', value = list(obs = obsNames, + pyrs = pyrNames, + weights = weights, + adjust = adNames, + print = prNames, + call = match.call(), + NAs = NA)) + setnames(data, c(tmpObsNames, tmpPyrNames, tmpPrNames), + c(obsNames, pyrNames, prNames)) + + # data.frame output option + if (!return_DT()) { + setDFpe(data) + } + + return(data[]) +} + +#' @export +getCall.rate <- function (x, ...) { + attributes(x)$rate.meta$call +} + +stdr.weights <- function(wp = 'world00_1') { + + ## This one returns the standard population + ## output: data.table with colnames: agegroup, reference + ## standard populations are from datasets: stdpop18 and stdpop101 + allow.pop <- c("world_1966_18of5", + "europe_1976_18of5", + "nordic_2000_18of5", + "world_2000_18of5", + "world_2000_20of5", + "world_2000_101of1") + wp <- match.arg(wp, allow.pop) + + if (length(wp) > 1) { + stop('Standard population name is not a scalar (length != 1).') + + } else if (wp %in% allow.pop[1:3]) { + + # get standard pop + sr <- data.table(popEpi::stdpop18) + setnames(sr, 1:4, c("agegroup",allow.pop[1:3])) + sr[, agegroup := 1:18] + sr[, setdiff(allow.pop[1:3], wp) := NULL] + + setnames(sr, wp, 'reference') + + } else if (wp %in% allow.pop[4:6]) { + + sr <- data.table(popEpi::stdpop101) + if (wp == "world_2000_18of5") { + sr[,agegroup := cut(agegroup, breaks=c(0:17*5,Inf), right=FALSE, labels=FALSE)] + sr <- sr[,list(world_std = sum(world_std)), by="agegroup"] + } + if (wp == 'world_2000_20of5') { + sr[,agegroup := cut(agegroup, breaks=c(0:19*5,Inf), right=FALSE, labels=FALSE)] + sr <- sr[,list(world_std = sum(world_std)), by="agegroup"] + } + else { + sr <- sr[,list(world_std = sum(world_std)), by="agegroup"] + } + setnames(sr, "world_std", "reference") + } + else { + stop("Invalid standard population name.") + } + sr[] +} +globalVariables(c('stdpop18','stdpop101','agegroup','world_std')) + + +rate_est <- function(data = data, + obs = 'obs', + pyrs = 'pyrs', + print = NULL, + weights = NULL +) { + ## This one estimates the rates and calculates CI's and SE's. + + badVars <- paste0("Internal error: missing following variable names in ", + "working data: %%VARS%%. Complain to the pkg maintainer ", + "if you see this.") + all_names_present(data, c(obs, pyrs, print), msg = badVars) + + data <- data.table(data) + if ( is.null(weights) | !weights %in% colnames(data)) { + weights <- NULL + } + + if (all(!is.null(weights), !is.null(obs), !is.null(pyrs))) { + # rate.adj + + f2 <- function(list) list[[1]]/list[[2]]*list[[3]] + funx <- function(n,d,w,fun) eval(parse(text=fun)) + + + # variance rate.adj for each strata A + fun1 <- '(._d_/._n_^2) * ._w_^2' + fun2 <- '._d_ / ._n_ * ._w_' + + make_fun <- function(n = NA, d = NA, w = NA, fun) { + fun <- gsub(pattern = "._n_", replacement = n, x = fun) + fun <- gsub(pattern = "._d_", replacement = d, x = fun) + fun <- gsub(pattern = "._w_", replacement = w, x = fun) + parse(text = fun) + } + eval.me1 <- make_fun(d = obs, n = pyrs, w=weights, fun = fun1) + eval.me2 <- make_fun(d = obs, n = pyrs, w=weights, fun = fun2) + data[, var.temp := eval(eval.me1)] + data[, lam.temp := eval(eval.me2)] + # add std weighted rates and variances + #data[, ':='(var.temp = funx(d=get(obs), n=get(pyrs), w=get(weights), fun = fun1), + # lam.temp = funx(d=get(obs), n=get(pyrs), w=get(weights), fun = fun2)) ] + data[, rate.adj := f2(.SD), .SDcols= c(obs, pyrs, weights)] + + # aggregate data + ie <- paste0('list(', obs, '=sum(',obs,',na.rm=TRUE), ', pyrs, '=sum(',pyrs,',na.rm=TRUE),', + 'rate.adj=sum(rate.adj,na.rm=TRUE),' ,'lam.temp=sum(lam.temp,na.rm=TRUE), var.temp=sum(var.temp,na.rm=TRUE))') + l <- parse(text = ie) + + data <- data[, eval(l), by=print] + # rate.adj: S.E. + data[, SE.log.rate.adj := sqrt((1/lam.temp)^2 * var.temp) ] # tämä on log-rate + data[, SE.rate.adj := sqrt(var.temp)] + # rate.adj: CI + data[, ':='(rate.adj.lo = exp( log(rate.adj) - SE.log.rate.adj*1.96 ), + rate.adj.hi = exp( log(rate.adj) + SE.log.rate.adj*1.96 )) ] + data[,c('lam.temp','var.temp','SE.log.rate.adj') := NULL] + } + + else { + ie <- paste0('list(', obs, '=sum(',obs,'), ', pyrs, '=sum(',pyrs,'))') + l <- parse(text = ie) + data <- data[, eval(l), by=print] + } + # rate + ia <- paste0('rate := ',obs,'/', pyrs) + k <- parse(text = ia) + data[, eval(k), by = print] + + # var(rate) + var_r <- paste0('SE.rate := sqrt(',obs,'/(',pyrs,'*',pyrs,'))') + k <- parse(text = var_r) + data[, eval(k), by = print] + + # var(log(rate)) and CI + eval.me3 <- paste('exp(sqrt(1/',obs,'))') + eval.me3 <- parse(text = eval.me3) + data[, SE.log.rate := eval(eval.me3)] + data[, ':='(rate.lo = exp(log(rate)-log(SE.log.rate)*1.96), + rate.hi = exp(log(rate)+log(SE.log.rate)*1.96)) ] + data[, SE.log.rate := NULL] + return(data[]) +} + +globalVariables(c('var.temp','lam.temp','rate.adj','SE.rate.adj','SE.rate','SE.log.rate','SE.log.rate.adj')) diff --git a/R/incidence_rates_utils.R b/R/incidence_rates_utils.R index 491f9fe..92b4a11 100644 --- a/R/incidence_rates_utils.R +++ b/R/incidence_rates_utils.R @@ -1,134 +1,134 @@ - - -#' @title Confidence intervals for the rate ratios -#' @author Matti Rantanen -#' @description Calculate rate ratio with confidence intervals for rate objects or observations and person-years. -#' -#' @details Calculate rate ratio of two age standardized rate objects (see \code{\link{rate}}). -#' Multiple rates for each objects is supported if there are an equal number of rates. -#' Another option is to set \code{x} and \code{y} as a vector of two. -#' \enumerate{ -#' \item rate and its standard error, and set \code{SE.method = TRUE}. -#' \item observations and person-year, and set \code{SE.method = FALSE}. -#' } -#' See examples. -#' -#' -#' @param x a rate-object, vector of two; rate and standard error or observed and person-years. -#' @param y a rate-object, vector of two; rate and standard error or observed and person-years. -#' @param crude set TRUE to use crude rates; default is FALSE. -#' @param SE.method default TRUE; if \code{x} and \code{y} are vectors of observed and -#' person-years, this must be changed to FALSE. -#' -#' @examples -#' \dontrun{ -#' # two rate ratios; silly example with female rectal / breast cancer -#' ## mortality rates -#' data("sire", package = "popEpi") -#' data("sibr", package = "popEpi") -#' -#' BL <- list(per = 2000:2005) -#' -#' re <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", -#' status = status == 1, breaks = BL, aggre = list(per)) -#' br <- lexpand(sibr, birth = "bi_date", entry = "dg_date", exit = "ex_date", -#' status = status == 1, breaks = BL, aggre = list(per)) -#' -#' r_re <- rate(re, obs = "from0to1", pyrs = "pyrs") -#' r_br <- rate(br, obs = "from0to1", pyrs = "pyrs") -#' -#' rate_ratio(r_re, r_br, SE.method = TRUE) -#' } -#' -#' # manually set rates (0.003 and 0.005) and SEs (0.001 and 0.002) -#' # so that x = y = c('rate', 'SE') -#' rate_ratio(x= c(0.003, 0.001), y= c(0.005, 0.002), SE.method = TRUE) -#' -#' # observed numbers (10 and 20) and person-years (30000 and 40000): -#' rate_ratio(x = c(10, 30000), y = c(20, 40000), SE.method = FALSE) -#' -#' @seealso \code{\link{rate}} -#' -#' @family rate functions -#' -#' @return A vector length of three: rate_ratio, and lower and upper confidence intervals. -#' -#' @export rate_ratio -#' -#' @import data.table -#' @import stats -rate_ratio <- function(x, y, crude = FALSE, SE.method = TRUE) { - if( inherits(x, 'rate') | inherits(y, 'rate') ) { - if(!crude & (!'rate.adj' %in% names(x) | !'rate.adj' %in% names(y))) { - crude <- TRUE - message('Crude rates used') - } - } - - x <- prep.rate.input(x, crude = crude, SE = SE.method) - y <- prep.rate.input(y, crude = crude, SE = SE.method) - - if(SE.method) { - ratio <- x[[1]]/y[[1]] - - # delta method for variance - v0 <- (1/x[[1]])^2*x[[2]]^2 + (1/y[[1]])^2*y[[2]]^2 - - - lo <- ratio - v0*1.96 #exp(log(ratio)-log(v0)*1.96) - hi <- ratio + v0*1.96 #exp(log(ratio)+log(v0)*1.96) - out <- round(data.frame(rate_ratio = ratio, lower = lo, upper = hi), 3) - } - else { - # x and y vector of two:, pyrs - pt <- list() - out <- data.frame() - j <- 1 - for(j in 1:length(x[[1]])) { - pt[[j]] <- poisson.test(x = c(x[[1]][j], y[[1]][j]), T = c(x[[2]][j],y[[2]][j])) - out <- rbind(out, round(data.frame(rate_ratio = pt[[j]]$estimate, - lower = pt[[j]]$conf.int[1], - upper = pt[[j]]$conf.int[2]),3) ) - } - } - if(any(out<0)) { - warning('Negative estimate or confidence intervals. Tip: set SE.method to FALSE when using observations and person-years.') - } - return(out) -} - - - -prep.rate.input <- function(z, crude, SE) { - # this one modulates input to rate_ratio function - if(is.vector(z) && length(z) == 2) { - # z is obs and pyrs OR rate and SE - return(list(z[1], z[2])) - } - else if(inherits(z,'rate')){ - if(!SE) { # obs and pyrs - att <- attributes(z) - setDT(z) - a <- z[, get(att$rate.meta$obs)] - b <- z[, get(att$rate.meta$pyrs)] - } - else { - if(crude) { - a <- z[,rate] - b <- z[,SE.rate] - } - else { - # z is a rate object - a <- z[,rate.adj] - b <- z[,SE.rate.adj] - } - } - } - else{ - stop('Input is not correct: its neighter a vector of two nor a rate object') - } - return(list(a,b)) -} - - + + +#' @title Confidence intervals for the rate ratios +#' @author Matti Rantanen +#' @description Calculate rate ratio with confidence intervals for rate objects or observations and person-years. +#' +#' @details Calculate rate ratio of two age standardized rate objects (see \code{\link{rate}}). +#' Multiple rates for each objects is supported if there are an equal number of rates. +#' Another option is to set \code{x} and \code{y} as a vector of two. +#' \enumerate{ +#' \item rate and its standard error, and set \code{SE.method = TRUE}. +#' \item observations and person-year, and set \code{SE.method = FALSE}. +#' } +#' See examples. +#' +#' +#' @param x a rate-object, vector of two; rate and standard error or observed and person-years. +#' @param y a rate-object, vector of two; rate and standard error or observed and person-years. +#' @param crude set TRUE to use crude rates; default is FALSE. +#' @param SE.method default TRUE; if \code{x} and \code{y} are vectors of observed and +#' person-years, this must be changed to FALSE. +#' +#' @examples +#' \dontrun{ +#' # two rate ratios; silly example with female rectal / breast cancer +#' ## mortality rates +#' data("sire", package = "popEpi") +#' data("sibr", package = "popEpi") +#' +#' BL <- list(per = 2000:2005) +#' +#' re <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", +#' status = status == 1, breaks = BL, aggre = list(per)) +#' br <- lexpand(sibr, birth = "bi_date", entry = "dg_date", exit = "ex_date", +#' status = status == 1, breaks = BL, aggre = list(per)) +#' +#' r_re <- rate(re, obs = "from0to1", pyrs = "pyrs") +#' r_br <- rate(br, obs = "from0to1", pyrs = "pyrs") +#' +#' rate_ratio(r_re, r_br, SE.method = TRUE) +#' } +#' +#' # manually set rates (0.003 and 0.005) and SEs (0.001 and 0.002) +#' # so that x = y = c('rate', 'SE') +#' rate_ratio(x= c(0.003, 0.001), y= c(0.005, 0.002), SE.method = TRUE) +#' +#' # observed numbers (10 and 20) and person-years (30000 and 40000): +#' rate_ratio(x = c(10, 30000), y = c(20, 40000), SE.method = FALSE) +#' +#' @seealso \code{\link{rate}} +#' +#' @family rate functions +#' +#' @return A vector length of three: rate_ratio, and lower and upper confidence intervals. +#' +#' @export rate_ratio +#' +#' @import data.table +#' @import stats +rate_ratio <- function(x, y, crude = FALSE, SE.method = TRUE) { + if( inherits(x, 'rate') | inherits(y, 'rate') ) { + if(!crude & (!'rate.adj' %in% names(x) | !'rate.adj' %in% names(y))) { + crude <- TRUE + message('Crude rates used') + } + } + + x <- prep.rate.input(x, crude = crude, SE = SE.method) + y <- prep.rate.input(y, crude = crude, SE = SE.method) + + if(SE.method) { + ratio <- x[[1]]/y[[1]] + + # delta method for variance + v0 <- (1/x[[1]])^2*x[[2]]^2 + (1/y[[1]])^2*y[[2]]^2 + + + lo <- ratio - v0*1.96 #exp(log(ratio)-log(v0)*1.96) + hi <- ratio + v0*1.96 #exp(log(ratio)+log(v0)*1.96) + out <- round(data.frame(rate_ratio = ratio, lower = lo, upper = hi), 3) + } + else { + # x and y vector of two:, pyrs + pt <- list() + out <- data.frame() + j <- 1 + for(j in 1:length(x[[1]])) { + pt[[j]] <- poisson.test(x = c(x[[1]][j], y[[1]][j]), T = c(x[[2]][j],y[[2]][j])) + out <- rbind(out, round(data.frame(rate_ratio = pt[[j]]$estimate, + lower = pt[[j]]$conf.int[1], + upper = pt[[j]]$conf.int[2]),3) ) + } + } + if(any(out<0)) { + warning('Negative estimate or confidence intervals. Tip: set SE.method to FALSE when using observations and person-years.') + } + return(out) +} + + + +prep.rate.input <- function(z, crude, SE) { + # this one modulates input to rate_ratio function + if(is.vector(z) && length(z) == 2) { + # z is obs and pyrs OR rate and SE + return(list(z[1], z[2])) + } + else if(inherits(z,'rate')){ + if(!SE) { # obs and pyrs + att <- attributes(z) + setDT(z) + a <- z[, get(att$rate.meta$obs)] + b <- z[, get(att$rate.meta$pyrs)] + } + else { + if(crude) { + a <- z[,rate] + b <- z[,SE.rate] + } + else { + # z is a rate object + a <- z[,rate.adj] + b <- z[,SE.rate.adj] + } + } + } + else{ + stop('Input is not correct: its neighter a vector of two nor a rate object') + } + return(list(a,b)) +} + + \ No newline at end of file diff --git a/R/lexpand.R b/R/lexpand.R index 4786527..24e595c 100644 --- a/R/lexpand.R +++ b/R/lexpand.R @@ -1,940 +1,940 @@ - -#' @title Split case-level observations -#' @author Joonas Miettinen -#' @description Given subject-level data, data is split -#' by calendar time (\code{per}), \code{age}, and follow-up -#' time (\code{fot}, from 0 to the end of follow-up) -#' into subject-time-interval rows according to -#' given \code{breaks} and additionally processed if requested. -#' @param data dataset of e.g. cancer cases as rows -#' @param birth birth time in date format -#' or fractional years; string, symbol or expression -#' @param entry entry time in date format -#' or fractional years; string, symbol or expression -#' @param exit exit from follow-up time in date -#' format or fractional years; string, symbol or expression -#' @param event advanced: time of possible event differing from \code{exit}; -#' typically only used in certain SIR/SMR calculations - see Details; -#' string, symbol or expression -#' @param status variable indicating type of event at \code{exit} or \code{event}; -#' e.g. \code{status = status != 0}; expression or quoted variable name -#' @param entry.status input in the same way as \code{status}; -#' status at \code{entry}; see Details -#' @param id optional; an id variable; e.g. \code{id = my_id}; -#' string, symbol or expression -#' @param overlapping advanced, logical; if \code{FALSE} AND if \code{data} contains -#' multiple rows per subject, -#' ensures that the timelines of \code{id}-specific rows do not overlap; -#' this ensures e.g. that person-years are only computed once per subject -#' in a multi-state paradigm -#' @param aggre e.g. \code{aggre = list(sex, fot)}; -#' a list of unquoted variables and/or expressions thereof, -#' which are interpreted as factors; data events and person-years will -#' be aggregated by the unique combinations of these; see Details -#' @param aggre.type one of \code{c("unique","cartesian")}; -#' can be abbreviated; see Details -#' @param breaks a named list of vectors of time breaks; -#' e.g. \code{breaks = list(fot=0:5, age=c(0,45,65,Inf))}; see Details -#' @param drop logical; if \code{TRUE}, drops all resulting rows -#' after splitting that reside outside -#' the time window as defined by the given breaks (all time scales) -#' @param pophaz a dataset of population hazards to merge -#' with split data; see Details -#' @param pp logical; if \code{TRUE}, computes Pohar-Perme weights using -#' \code{pophaz}; adds variable with reserved name \code{pp}; -#' see Details for computing method -#' @param subset a logical vector or any logical condition; data is subsetted -#' before splitting accordingly -#' @param merge logical; if \code{TRUE}, retains all -#' original variables from the data -#' @param verbose logical; if \code{TRUE}, the function is chatty and -#' returns some messages along the way -#' @param ... e.g. \code{fot = 0:5}; instead of specifying a \code{breaks} list, -#' correctly named breaks vectors can be given -#' for \code{fot}, \code{age}, and \code{per}; these override any breaks in the -#' \code{breaks} list; see Examples -#' -#' -#' -#' @details -#' \strong{Basics} -#' -#' \code{\link{lexpand}} splits a given data set (with e.g. cancer diagnoses -#' as rows) to subintervals of time over -#' calendar time, age, and follow-up time with given time breaks -#' using \code{\link{splitMulti}}. -#' -#' The dataset must contain appropriate -#' \code{Date} / \code{IDate} / \code{date} format or -#' other numeric variables that can be used -#' as the time variables. -#' -#' You may take a look at a simulated cohort -#' \code{\link{sire}} as an example of the -#' minimum required information for processing data with \code{lexpand}. -#' -#' Many arguments can be supplied as a character string naming the appropriate -#' variable (e.g. \code{"sex"}), as a symbol (e.g. \code{sex}) or as an expression -#' (e.g. \code{factor(sex, 0:1, c("m", "f"))}) for flexibility. -#' -#' \strong{Breaks} -#' -#' You should define all breaks as left inclusive and right exclusive -#' time points (e.g.\code{[a,b)} ) -#' for 1-3 time dimensions so that the last member of a breaks vector -#' is a meaningful "final upper limit", -#' e.g. \code{per = c(2002,2007,2012)} -#' to create a last subinterval of the form \code{[2007,2012)}. -#' -#' All breaks are explicit, i.e. if \code{drop = TRUE}, -#' any data beyond the outermost breaks points are dropped. -#' If one wants to have unspecified upper / lower limits on one time scale, -#' use \code{Inf}: e.g. \code{breaks = list(fot = 0:5, age = c(0,45,Inf))}. -#' Breaks for \code{per} can also be given in -#' \code{Date}/\code{IDate}/\code{date} format, whereupon -#' they are converted to fractional years before used in splitting. -#' -#' The \code{age} time scale can additionally -#' be automatically split into common age grouping schemes -#' by naming the scheme with an appropriate character string: -#' -#' \itemize{ -#' \item \code{"18of5"}: age groups 0-4, 5-9, 10-14, ..., 75-79, 80-84, 85+ -#' \item \code{"20of5"}: age groups 0-4, 5-9, 10-14, ..., 85-89, 90-94, 95+ -#' \item \code{"101of1"}: age groups 0, 1, 2, ..., 98, 99, 100+ -#' } -#' -#' \strong{Time variables} -#' -#' If any of the given time variables -#' (\code{birth}, \code{entry}, \code{exit}, \code{event}) -#' is in any kind of date format, they are first coerced to -#' fractional years before splitting -#' using \code{\link{get.yrs}} (with \code{year.length = "actual"}). -#' -#' Sometimes in e.g. SIR/SMR calculation one may want the event time to differ -#' from the time of exit from follow-up, if the subject is still considered -#' to be at risk of the event. If \code{event} is specified, the transition to -#' \code{status} is moved to \code{event} from \code{exit} -#' using \code{\link[Epi]{cutLexis}}. See Examples. -#' -#' \strong{The status variable} -#' -#' The statuses in the expanded output (\code{lex.Cst} and \code{lex.Xst}) -#' are determined by using either only \code{status} or both \code{status} -#' and \code{entry.status}. If \code{entry.status = NULL}, the status at entry -#' is guessed according to the type of variable supplied via \code{status}: -#' For numeric variables it will be zero, for factors the first level -#' (\code{levels(status)[1]}) and otherwise the first unique value in alphabetical -#' order (\code{sort(unique(status))[1]}). -#' -#' Using numeric or factor status -#' variables is strongly recommended. Logical expressions are also allowed -#' (e.g. \code{status = my_status != 0L}) and are converted to integer internally. -#' -#' \strong{Merging population hazard information} -#' -#' To enable computing relative/net survivals with \code{\link{survtab}} -#' and \code{\link{relpois}}, \code{lexpand} merges an appropriate -#' population hazard data (\code{pophaz}) to the expanded data -#' before dropping rows outside the specified -#' time window (if \code{drop = TRUE}). \code{pophaz} must, for this reason, -#' contain at a minimum the variables named -#' \code{agegroup}, \code{year}, and \code{haz}. \code{pophaz} may contain additional variables to specify -#' different population hazard levels in different strata; e.g. \code{popmort} includes \code{sex}. -#' All the strata-defining variables must be present in the supplied \code{data}. \code{lexpand} will -#' automatically detect variables with common names in the two datasets and merge using them. -#' -#' Currently \code{year} must be an integer variable specifying the appropriate year. \code{agegroup} -#' must currently also specify one-year age groups, e.g. \code{popmort} specifies 101 age groups -#' of length 1 year. In both -#' \code{year} and \code{agegroup} variables the values are interpreted as the lower bounds of intervals -#' (and passed on to a \code{cut} call). The mandatory variable \code{haz} -#' must specify the appropriate average rate at the person-year level; -#' e.g. \code{haz = -log(survProb)} where \code{survProb} is a one-year conditional -#' survival probability will be the correct hazard specification. -#' -#' The corresponding \code{pophaz} population hazard value is merged by using the mid points -#' of the records after splitting as reference values. E.g. if \code{age=89.9} at the start -#' of a 1-year interval, then the reference age value is \code{90.4} for merging. -#' This way we get a "typical" population hazard level for each record. -#' -#' \strong{Computing Pohar-Perme weights} -#' -#' If \code{pp = TRUE}, Pohar-Perme weights -#' (the inverse of cumulative population survival) are computed. This will -#' create the new \code{pp} variable in the expanded data. \code{pp} is a -#' reserved name and \code{lexpand} throws exception if a variable with that name -#' exists in \code{data}. -#' -#' When a survival interval contains one or several rows per subject -#' (e.g. due to splitting by the \code{per} scale), -#' \code{pp} is cumulated from the beginning of the first record in a survival -#' interval for each subject to the mid-point of the remaining time within that -#' survival interval, and that value is given for every other record -#' that a given person has within the same survival interval. -#' -#' E.g. with 5 rows of duration \code{1/5} within a survival interval -#' \code{[0,1)]}, \code{pp} is determined for all records by a cumulative -#' population survival from \code{0} to \code{0.5}. Th existing accuracy is used, -#' so that the weight is cumulated first up to the end of the second row -#' and then over the remaining distance to the mid-point (first to 0.4, then to -#' 0.5). This ensures that more accurately merged population hazards are fully -#' used. -#' -#' \strong{Event not at end of follow-up & overlapping time lines} -#' -#' \code{event} may be used if the event indicated by \code{status} should -#' occur at a time differing from \code{exit}. If \code{event} is defined, -#' \code{cutLexis} is used on the data set after coercing it to the \code{Lexis} -#' format and before splitting. Note that some values of \code{event} are allowed -#' to be \code{NA} as with \code{cutLexis} to accommodate observations -#' without an event occurring. -#' -#' Additionally, setting \code{overlapping = FALSE} ensures that (irrespective -#' of using \code{event}) the each subject defined by \code{id} only has one -#' continuous time line instead of possibly overlapping time lines if -#' there are multiple rows in \code{data} by \code{id}. -#' -#' -#' \strong{Aggregating} -#' -#' Certain analyses such as SIR/SMR calculations require tables of events and -#' person-years by the unique combinations (interactions) of several variables. -#' For this, \code{aggre} can be specified as a list of such variables -#' (preferably \code{factor} variables but not mandatory) -#' and any arbitrary functions of the -#' variables at one's disposal. E.g. -#' -#' \code{aggre = list(sex, agegr = cut(dg_age, 0:100))} -#' -#' would tabulate events and person-years by sex and an ad-hoc age group -#' variable. Every ad-hoc-created variable should be named. -#' -#' \code{fot}, \code{per}, and \code{age} are special reserved variables which, -#' when present in the \code{aggre} list, are output as categories of the -#' corresponding time scale variables by using -#' e.g. -#' -#' \code{cut(fot, breaks$fot, right=FALSE)}. -#' -#' This only works if -#' the corresponding breaks are defined in \code{breaks} or via "\code{...}". -#' E.g. -#' -#' \code{aggre = list(sex, fot.int = fot)} with -#' -#' \code{breaks = list(fot=0:5)}. -#' -#' The output variable \code{fot.int} in the above example will have -#' the lower limits of the appropriate intervals as values. -#' -#' \code{aggre} as a named list will output numbers of events and person-years -#' with the given new names as categorizing variable names, e.g. -#' \code{aggre = list(follow_up = fot, gender = sex, agegroup = age)}. -#' -#' The output table has person-years (\code{pyrs}) and event counts -#' (e.g. \code{from0to1}) as columns. Event counts are the numbers of transitions -#' (\code{lex.Cst != lex.Xst}) or the \code{lex.Xst} value at a subject's -#' last record (subject possibly defined by \code{id}). -#' -#' If \code{aggre.type = "unique"} (alias \code{"non-empty"}), -#' the above results are computed for existing -#' combinations of expressions given in \code{aggre}, but also for non-existing -#' combinations if \code{aggre.type = "cartesian"} (alias \code{"full"}). E.g. if a -#' factor variable has levels \code{"a", "b", "c"} but the data is limited -#' to only have levels \code{"a", "b"} present -#' (more than zero rows have these level values), the former setting only -#' computes results for \code{"a", "b"}, and the latter also for \code{"c"} -#' and any combination with other variables or expression given in \code{aggre}. -#' In essence, \code{"cartesian"} forces also combinations of variables used -#' in \code{aggre} that have no match in data to be shown in the result. -#' -#' If \code{aggre} is not \code{NULL} and \code{pophaz} has been supplied, -#' \code{lexpand} also aggregates the expected counts of events, which -#' appears in the output data by the reserved name \code{d.exp}. Additionally, -#' having \code{pp = TRUE} causes \code{lexpand} to also compute various -#' Pohar-Perme weighted figures necessary for computing Pohar-Perme net survivals -#' with \code{\link{survtab_ag}}. This can be slow, so consider what is really -#' needed. The Pohar-Perme weighted figures have the suffix \code{.pp}. -#' -#' @return -#' If \code{aggre = NULL}, returns -#' a \code{data.table} or \code{data.frame} -#' (depending on \code{options("popEpi.datatable")}; see \code{?popEpi}) -#' object expanded to accommodate split observations with time scales as -#' fractional years and \code{pophaz} merged in if given. Population -#' hazard levels in new variable \code{pop.haz}, and Pohar-Perme -#' weights as new variable \code{pp} if requested. -#' -#' If \code{aggre} is defined, returns a long-format -#' \code{data.table}/\code{data.frame} with the variable \code{pyrs} (person-years), -#' and variables for the counts of transitions in state or state at end of -#' follow-up formatted \code{fromXtoY}, where \code{X} and \code{Y} are -#' the states transitioned from and to, respectively. The data may also have -#' the columns \code{d.exp} for expected numbers of cases and various -#' Pohar-Perme weighted figures as identified by the suffix \code{.pp}; see -#' Details. -#' -#' -#' @examples -#' \dontrun{ -#' ## prepare data for e.g. 5-year cohort survival calculation -#' x <- lexpand(sire, breaks=list(fot=seq(0, 5, by = 1/12)), -#' birth = bi_date, entry = dg_date, exit = ex_date, -#' status = status != 0, pophaz=popmort) -#' -#' ## prepare data for e.g. 5-year "period analysis" for 2008-2012 -#' BL <- list(fot = seq(0, 5, by = 1/12), per = c("2008-01-01", "2013-01-01")) -#' x <- lexpand(sire, breaks = BL, -#' birth = bi_date, entry = dg_date, exit = ex_date, -#' pophaz=popmort, status = status != 0) -#' -#' ## aggregating -#' BL <- list(fot = 0:5, per = c("2003-01-01","2008-01-01", "2013-01-01")) -#' ag <- lexpand(sire, breaks = BL, status = status != 0, -#' birth = bi_date, entry = dg_date, exit = ex_date, -#' aggre=list(sex, period = per, surv.int = fot)) -#' -#' ## aggregating even more -#' ag <- lexpand(sire, breaks = BL, status = status != 0, -#' birth = bi_date, entry = dg_date, exit = ex_date, -#' aggre=list(sex, period = per, surv.int = fot), -#' pophaz = popmort, pp = TRUE) -#' -#' ## using "..." -#' x <- lexpand(sire, fot=0:5, status = status != 0, -#' birth = bi_date, entry = dg_date, exit = ex_date, -#' pophaz=popmort) -#' -#' x <- lexpand(sire, fot=0:5, status = status != 0, -#' birth = bi_date, entry = dg_date, exit = ex_date, -#' aggre=list(sex, surv.int = fot)) -#' -#' ## using the "event" argument: it just places the transition to given "status" -#' ## at the "event" time instead of at the end, if possible using cutLexis -#' x <- lexpand(sire, status = status, event = dg_date, -#' birth = bi_date, entry = dg_date, exit = ex_date,) -#' -#' ## aggregating with custom "event" time -#' ## (the transition to status is moved to the "event" time) -#' x <- lexpand(sire, status = status, event = dg_date, -#' birth = bi_date, entry = dg_date, exit = ex_date, -#' per = 1970:2014, age = c(0:100,Inf), -#' aggre = list(sex, year = per, agegroup = age)) -#' -#' } -#' -#' @import data.table -#' @import Epi -#' @family splitting functions -#' @family aggregation functions -#' @seealso -#' \code{\link[Epi]{Lexis}}, \code{\link{popmort}} -#' @export -lexpand <- function(data, - birth=NULL, entry=NULL, exit=NULL, event=NULL, - status = status != 0, - entry.status = NULL, - breaks = list(fot=c(0,Inf)), - id = NULL, - overlapping = TRUE, - aggre = NULL, - aggre.type = c("unique", "cartesian"), - drop=TRUE, - pophaz = NULL, pp = TRUE, - subset = NULL, - merge=TRUE, verbose = FALSE, - ...) { - start_time <- proc.time() - - TF <- environment() - PF <- parent.frame(1L) - - ## data checks - if ( missing(data) || nrow(data) == 0) stop("no data found") - - if (!is.data.frame(data)) stop("data must be a data.frame or data.table") - - ## to instate global variables to appease R CMD CHECK - .EACHI <- lex.status <- lexpand.id <- lex.exit <- lex.birth <- - lex.entry <- lex.event <- temp.id <- cd <- fot <- age <- per <- - lex.id <- lex.multi <- pop.haz <- lex.Cst <- lex.Xst <- lex.dur <- NULL - - - ## test conflicting variable names ------------------------------------------- - added_vars <- c("fot", "per", "age", "lex.id", "lex.dur", "lex.Xst", "lex.Cst") - if (!is.null(pophaz)) added_vars <- if (pp) c(added_vars, "pp", "pop.haz") else c(added_vars, "pop.haz") - conflicted_vars <- intersect(added_vars, names(data)) - - if (merge && length(conflicted_vars) > 0) { - conflicted_vars <- paste0("'", conflicted_vars, "'", collapse = ", ") - warning("'data' already had variable(s) named ", conflicted_vars, " which lexpand will create, and you have merge = TRUE; this may result in unexpected problems. Rename the variable(s)?") - } - rm(added_vars, conflicted_vars) - - ## test aggre type ----------------------------------------------------------- - aggre.type <- match.arg(aggre.type[1L], c("cartesian", "non-empty", "unique", "cross-product", "full")) - if (aggre.type == "cross-product") { - aggre.type <- "cartesian" - warning("aggre.type value 'cross-product' deprecated and renamed to 'cartesian'; please use that in the future") - } - - ## subsetting----------------------------------------------------------------- - ## no copy taken of data! - subset <- substitute(subset) - subset <- evalLogicalSubset(data, subset) - - ## prepping time variables --------------------------------------------------- - l <- substitute(list(birth, entry, exit, event, status, entry.status, id)) - rm(birth, entry, exit, event, status, entry.status, id) - - lc <- unlist(lapply(l, deparse)) - lc <- lc[-1] ## first always "list" - - wh <- which(lc != "NULL") - lex_vars <- c("lex.birth","lex.entry","lex.exit","lex.event", "lex.status", "lex.entry.status", "lexpand.id")[wh] - if (any(!c("lex.birth", "lex.entry", "lex.exit", "lex.status") %in% lex_vars)) stop("birth, entry, exit and status are mandatory") - - l <- eval(l, envir = data[subset, ], enclos = PF) - l[-wh] <- NULL - - - ## vars can be given as character strings of variable names - isChar <- sapply(l, is.character, simplify = TRUE) - if (any(isChar)) { - isShort <- sapply(l, function(x) {length(x) == 1L}, simplify = TRUE) - whOneChar <- which(isShort & isChar) - - whBadChar <- NULL - if (length(whOneChar) > 0) { - testBadChar <- unlist(l[whOneChar]) - whBadChar <- whOneChar[!testBadChar %in% names(data)] - } - - if (length(whBadChar) > 0) { - - badChar <- l[whBadChar] - badChar <- paste0(badChar[1:min(length(badChar), 5L)], collapse = ", ") - stop("Variables given as a character of length one are interpreted as variable names in data, - but some given characters were not found in data; - check names or input as factor/Date; - first five bad names: ", badChar) - } - - l[whOneChar] <- lapply(l[whOneChar], function(x) {data[subset, ][[x]]}) - } - - - l <- as.data.table(l) - setnames(l, names(l), lex_vars) - - - rm(lex_vars) - if (!all(c("lex.birth","lex.entry","lex.exit","lex.status") %in% names(l))) { - stop("birth, entry, exit and status are mandatory, but at least one was misspecified/NULL") - } - - if (is.logical(l$lex.status)) l[, lex.status := as.integer(lex.status)] - if (is.null(l$lexpand.id)) l[, lexpand.id := 1:.N] - - ## checks for merging style -------------------------------------------------- - if (!is.null(pophaz)) { - all_names_present(pophaz, c("agegroup","year","haz")) - othMergeVars <- setdiff(names(pophaz), c("agegroup","year","haz")) - badOthMergeVars <- setdiff(othMergeVars, names(data)) - if (length(badOthMergeVars) > 0) { - badOthMergeVars <- paste0("'", badOthMergeVars, "'", collapse = ", ") - stop("Following variables exist in pophaz but do not exist in data: ", badOthMergeVars, ". Make sure data and pophaz contain variables with the same names that you intend to merge by.") - } - } - - if (is.null(pophaz)) { - comp_pp <- FALSE - } else { - comp_pp <- TRUE - } - - ## internally we have "delta" and "actual" methods, - ## the latter being experimental and not visible in the documentation. - ## "delta" is always used if pp = TRUE. - ## it is possible to choose pp = "actual" as well, though, if you know - ## about it. - comp_pp <- FALSE - if (is.logical(pp) && pp) pp <- "delta" - - if (!is.null(pp) && is.character(pp)) { - pp <- match.arg(pp, c("delta", "actual")) - comp_pp <- TRUE - } - if (comp_pp && "pp" %in% names(data)) stop("variable named 'pp' in data; this is a reserved name for pohar-perme weights, please rename / remove the variable in data") - - ## ensure given breaks make any sense ---------------------------------------- - - bl <- list(...) - lna <- names(bl) - bad_lna <- setdiff(lna, c("fot","per","age")) - if (length(bad_lna) > 0) { - bad_lna <- paste0("'", bad_lna, "'", collapse = ", ") - stop("only arguments named 'fot', 'per' or 'age' currently allowed to be passed via '...'; did you mistype an argument? bad args: ", bad_lna) - } - lna <- intersect(names(bl), c("fot","per","age")) - if (length(lna) > 0) { - bl <- bl[lna] - if (!is.null(breaks)) breaks[lna] <- NULL - breaks <- c(breaks, bl) - } - rm(bl, lna) - - brna <- names(breaks) - if (length(brna) != length(breaks)) { - stop("all elements in breaks list must be named, e.g. list(fot = 0:5, age=c(0,45,65,Inf))") - } - - brna <- intersect(brna, c("fot","per","age")) - if (length(brna) == 0) { - breaks$fot <- c(0,Inf) - } - - if ("age" %in% brna && is.character(breaks$age)) { - schemeNames <- c("18of5", "20of5", "101of1") - if (!breaks$age %in% schemeNames) stop("You supplied '", breaks$age, "' as breaks for the age scale, but allowed character strings are: ", paste0("'", schemeNames, "'", collapse = ",")) - brSchemes <- list(c(seq(0, 85, 5)), c(seq(0, 95, 5), Inf), c(0:100, Inf)) - names(brSchemes) <- paste0("age_", schemeNames) - breaks$age <- brSchemes[paste0("age_",breaks$age)] - } - - - if (any(sapply(breaks, length) == 1L)) { - stop("any given non-null vector of breaks must have more than one break!") - } - - # convert to fractional years ------------------------------------------------ - - char2date <- function(obj) { - if (is.character(obj) || inherits(obj, "date")) { - return(as.IDate(obj)) - } else { - return(obj) - } - } - - date2yrs <- function(obj) { - if (is.Date(obj) || inherits(obj, "date")) { - get.yrs(obj, year.length = "actual") - } else { - obj - } - } - - breaks <- lapply(breaks, char2date) - breaks <- lapply(breaks, date2yrs) - - time_vars <- intersect(names(l), c("lex.birth", "lex.entry", "lex.exit","lex.event")) - l[, (time_vars) := lapply(.SD, date2yrs) , .SDcols = time_vars] - - if (verbose) cat("given birth, entry, exit, status etc. variables after coercion to numeric \n") - if (verbose) print(l) - - # check data consistency for overlapping = FALSE ----------------------------- - ## not allowed: for any one unique subject to be true for - ## multiple rows (if overlapping = TRUE): - ## * same event values - ## * same entry values - if (!overlapping) { - if ("lex.event" %in% names(l)) { - if (all(is.na(l$lex.event))) stop("ALL 'event' values are NA; if this is as intended, please use event = NULL instead") - - if (any(duplicated(l, by = c("lexpand.id", "lex.event")))) { - stop("subject(s) defined by lex.id had several rows where 'event' time had the same value, which is not supported with overlapping = FALSE; perhaps separate them by one day?") - } - if (any(l[!is.na(lex.event), lex.entry == lex.event])) { - stop("some rows have simultaneous 'entry' and 'event', which is not supported with overlapping = FALSE; perhaps separate them by one day?") - } - } else if (any(duplicated(l, by = c("lexpand.id", "lex.exit")))) { - stop("subject(s) defined by lex.id had several rows where 'exit' time had the same value, which is not supported without 'event' defined; use 'event' or perhaps separate them by one day?") - } - - - } - - - # dropping unuseful records -------------------------------------------------- - test_times <- function(condition, msg, old_subset=l_subset, DT=l) { - - condition <- substitute(condition) - condition <- eval(condition, envir = DT, enclos = parent.frame(1L)) - - new_subset <- old_subset & !(condition & !is.na(condition)) - old_n <- sum(old_subset) - new_n <- sum(new_subset) - - if (new_n == 0L) { - stop("dropping rows where ", msg, " resulted in zero rows left. likely problem: misdefined time variables") - } - - if (new_n < old_n) { - message(paste0("dropped ", old_n-new_n, " rows where ", msg)) - } - return(new_subset) - } - - l_subset <- rep(TRUE, nrow(l)) - - l_subset <- test_times(is.na(lex.birth), "birth values are missing") - l_subset <- test_times(is.na(lex.entry), "entry values are missing") - l_subset <- test_times(is.na(lex.exit), "exit values are missing") - - if (!is.null(breaks$per)) { - l_subset <- test_times(lex.exit < min(breaks$per), "subjects left follow-up before earliest per breaks value") - } - if (!is.null(breaks$age)) { - l_subset <- test_times(lex.exit - lex.birth < min(breaks$age), "subjects left follow-up before lowest age breaks value") - } - if (!is.null(breaks$fot)) { - l_subset <- test_times(lex.exit - lex.entry < min(breaks$fot), "subjects left follow-up before lowest fot breaks value") - } - l_subset <- test_times(lex.birth >= lex.exit, "birth >= exit") - l_subset <- test_times(lex.entry == lex.exit, "entry == exit") - l_subset <- test_times(lex.entry > lex.exit, "entry > exit") - l_subset <- test_times(lex.birth > lex.entry, "birth > entry") - if (!is.null(l$lex.event)) { - l_subset <- test_times(lex.event > lex.exit, "event > exit") - l_subset <- test_times(lex.event < lex.entry, "event < entry") - } - l <- l[l_subset] - - if (verbose) cat("Time taken by checks, prepping and test: ", timetaken(start_time), "\n") - - # Lexis coercion ------------------------------------------------------------- - - ## status definitions - setnames(l, "lex.status", "lex.Xst") - if ("lex.entry.status" %in% names(l)) { - setnames(l, "lex.entry.status", "lex.Cst") - } else { - if (is.factor(l$lex.Xst)) { - l[, lex.Cst := factor(levels(lex.Xst)[1L], levels=levels(lex.Xst))] - } else if (is.double(l$lex.Xst)) { - l[, lex.Cst := 0] - } else if (is.integer(l$lex.Xst)) { - l[, lex.Cst := 0L] - } else { - l[, lex.Cst := sort(unique(lex.Xst))[1L]] - } - - } - - # ensure common labels for factors etc. - harmonizeStatuses(x = l, C = "lex.Cst", X = "lex.Xst") - - ## time scales and duration - l[, lex.dur := lex.exit - lex.entry] - l[, fot := 0] - setnames(l, "lex.entry", "per") - l[, age := per-lex.birth] - setnames(l, "lexpand.id", "lex.id") - - ## for merging data with l later - if (merge) { - idt <- data.table(temp.id = 1:nrow(l)) - l[, temp.id := 1:.N] - } - - ## crop time scale values to obey breaks limits and drop if necessary - ## NOTE: goes wrong if need to compute pp weights! - # if (drop && !pp) { - # intelliCrop(x = l, breaks = breaks, allScales = c("fot", "per", "age"), cropStatuses = TRUE) - # l <- intelliDrop(x = l, breaks = breaks, dropNegDur = TRUE) - # } - - - setcolsnull(l, colorder=TRUE, soft=TRUE, - keep = c("lex.id","fot","per","age", - "lex.dur", "lex.Cst", "lex.Xst", "lex.event", "temp.id")) - setattr(l, "class", c("Lexis", "data.table", "data.frame")) - setattr(l, "time.scales", c("fot","per","age")) - setattr(l, "time.since", c("","","")) - - if (verbose) cat("data just after Lexis coercion: \n") - if (verbose) print(l) - - # event not at exit time ----------------------------------------------------- - - if ("lex.event" %in% names(l)) { - - if (!overlapping) { - - ## using lex.event time, ensure coherence of lex.Cst & lex.Xst - ## before cutLexis() - tmpFE <- makeTempVarName(l, pre = "fot_end_") - l[, (tmpFE) := fot + lex.dur] - setkeyv(l, c("lex.id", "lex.event", tmpFE)) - tmpLX <- makeTempVarName(l, pre = "lag_lex.Xst_") - l[, (tmpLX) := shift(lex.Xst, n = 1, type = "lag"), by = lex.id] - l[!is.na(get(tmpLX)), lex.Cst := get(tmpLX)] - l[, c(tmpFE, tmpLX) := NULL] - rm(tmpFE, tmpLX) - - } - - if (verbose) cutt <- proc.time() - setDF(l) - setattr(l, "class", c("Lexis", "data.frame")) - l <- Epi::cutLexis(l, cut = l$lex.event, timescale = "per", new.state = l$lex.Xst, precursor.states = unique(l$lex.Cst)) - setDT(l) - setattr(l, "class", c("Lexis", "data.table", "data.frame")) - if (verbose) cat("Time taken by cutLexis when defining event time points: ", timetaken(cutt), "\n") - - if (verbose) cat("Data just after using cutLexis: \n") - if (verbose) print(l[]) - - } - - - # overlapping timelines? ----------------------------------------------------- - - if (!overlapping && any(duplicated(l$lex.id))) { - tmpFE <- makeTempVarName(l, pre = "fot_end_") - l[, (tmpFE) := fot + lex.dur] - ## don't keep duplicated rows: - ## same end points imply fully overlapping time lines - ## e.g. - ## ---> - ## -> - ## --> - ## results in - ## -> - ## ---> - ## we only keep the longest time line with a unique end point. - - # setkeyv(l, c("lex.id", tmpFE, "fot")) - tmpLE <- intersect(names(l), "lex.event") - LEval <- if (length(tmpLE) == 0) NULL else -1 - - setorderv(l, c("lex.id", tmpFE, tmpLE, "fot"), c(1,1,LEval,1)) - l <- unique(l, by = c("lex.id", tmpFE)) - - ## end points are kept but starting points are "rolled" - ## from first to last row by lex.id to ensure non-overlappingness; e.g. - ## -> - ## ---> - ## results in - ## -> - ## -> - # setkeyv(l, c("lex.id", tmpFE)) - # setorderv(l, c("lex.id", tmpLE, tmpFE), c(1, LEval, 1)) - setkeyv(l, c("lex.id", tmpLE, tmpFE)) - - if (verbose) cat("data just before fixing overlapping time lines \n") - if (verbose) print(l) - l[, lex.dur := get(tmpFE) - c(min(fot), get(tmpFE)[-.N]), by = lex.id] - l[, fot := get(tmpFE) - lex.dur] - cumDur <- l[, list(age = min(age), per = min(per), cd = c(0, cumsum(lex.dur)[-.N])), by = lex.id] - cumDur[, age := age+cd] - cumDur[, per := per+cd] - l[, age := cumDur$age] - l[, per := cumDur$per] - l[, (tmpFE) := NULL]; rm(cumDur) - - - ## if event used, first row up to event, second row from first event to etc... - } - - setcolsnull(l, "lex.event", soft = TRUE) ## note: lex.event needed in overlapping procedures - - if (verbose) cat("time and status variables before splitting: \n") - if (verbose) print(l) - if ("id" %in% ls()) rm("id") - - - # splitting ------------------------------------------------------------------ - - ## determine whether to drop data only after splitting and merging - drop_after <- FALSE - if (drop == TRUE && comp_pp) { - drop <- FALSE - drop_after <- TRUE - } - - forceLexisDT(l, breaks = list(fot = NULL, per = NULL, age = NULL), - allScales = c("fot", "per", "age")) - if (verbose) splittime <- proc.time() - l <- splitMulti(l, breaks = breaks, - drop = drop, verbose=FALSE, merge = TRUE) - setDT(l) - setkey(l, lex.id, fot) - l[, lex.multi := 1:.N, by = lex.id] - if (verbose) cat("Time taken by splitting:", timetaken(splittime), "\n") - - # merging other variables from data ------------------------------------------ - - if (merge) { - setkey(l, temp.id) - - temp <- data.table(idt, data[subset & !is.na(subset), ][l_subset, ]) - setkey(temp, temp.id) - - l <- temp[l] - - rm(temp, idt) - setcolsnull(l, "temp.id") - - lex_vars <- c("lex.id","lex.multi","fot","per","age", "lex.dur", "lex.Cst", "lex.Xst") - setcolorder(l, c(lex_vars, setdiff(names(l), lex_vars))) - } - rm(data, subset, l_subset) - - ## aggregating checks -------------------------------------------------------- - ## NOTE: aggre evaled here using small data subset to check that all needed - ## variables are found, etc. - aggSub <- substitute(aggre) - agTest <- evalPopArg(arg = aggSub, data = l[1:min(10L, .N), ], - enclos = PF, recursive = TRUE, DT = TRUE) - agTy <- attr(agTest, "arg.type") - if (is.null(agTy)) agTy <- "NULL" - aggSub <- attr(agTest, "quoted.arg") - agVars <- attr(agTest, "all.vars") - rm(aggre) - - # merging pophaz and pp-weighting -------------------------------------------- - if (!is.null(pophaz)) { - - pophaztime <- proc.time() - - if (any(c("haz", "pop.haz") %in% names(l))) stop("case data had variable(s) named 'haz' / 'pop.haz', which are reserved for lexpand's internal use. rename/remove them please.") - # merge surv.int information ----------------------------------------------- - NULL_FOT <- FALSE - if (is.null(breaks$fot)) { - breaks$fot <- l[, c(0, max(fot+lex.dur))] - NULL_FOT <- TRUE - } - - breaks$fot <- sort(unique(breaks$fot)) - # handle pophaz data ------------------------------------------------------- - - if (!"haz" %in% names(pophaz)) stop("no 'haz' variable in pophaz; please rename you hazard variable to 'haz'") - yBy <- xBy <- setdiff(names(pophaz), c("haz")) - if (c("year") %in% yBy) xBy[yBy == "year"] <- "per" - if (c("agegroup") %in% yBy) xBy[yBy == "agegroup"] <- "age" - yByOth <- setdiff(yBy, c("year", "agegroup")) - - if (any(!yByOth %in% names(l))) - stop("Following variable names not common between pophaz and data: ", paste0("'", yByOth[!yByOth %in% names(l)], "'", collapse = ", ")) - - l <- cutLowMerge(x = l, y = pophaz, by.x = xBy, by.y = yBy, all.x = TRUE, - all.y = FALSE, mid.scales = c("per", "age"), old.nums = TRUE) - setnames(l, "haz", "pop.haz") - - ## check if l's merging time variables were within pophaz's limits --------- - nNA <- l[is.na(pop.haz), .N] - if (nNA > 0) message("WARNING: after merging pophaz, ", nNA, " rows in split data have NA hazard values!") - - names(yBy) <- xBy - names(xBy) <- yBy - for (k in intersect(c("per", "age"), xBy)) { - yVar <- yBy[k] - kLo <- min(pophaz[[yVar]]) - kHi <- max(pophaz[[yVar]]) - mid <- l[, get(k) + lex.dur] - nLo <- sum(mid < kLo - .Machine$double.eps^0.5) - nHi <- sum(mid > kHi - .Machine$double.eps^0.5) - if (nLo > 0) message("WARNING: ", nLo, " rows in split data have NA values due to their mid-points residing below the minimum value of '", yVar, "' in pophaz!") - if (nHi > 0) message("NOTE: ", nHi, " rows in split data had values of '", k, "' higher than max of pophaz's '", yVar, "'; the hazard values at '", yVar, "' == ", kHi, " were used for these") - } - rm(mid) - for (k in yByOth) { - levsNotOth <- setdiff(unique(l[[k]]), unique(pophaz[[k]])) - if (length(levsNotOth) > 0) message("WARNING: following levels (first five) of variable '", k, "' not in pophaz but exist in split data: ", paste0("'",levsNotOth[1:5],"'", collapse = ", ")) - } - - - # pohar-perme weighting ---------------------------------------------------- - if (comp_pp) { - setkeyv(l, c("lex.id", "fot")) - comp_pp_weights(l, surv.scale = "fot", breaks = breaks$fot, haz = "pop.haz", - style = "delta", verbose = verbose) - } - merge_msg <- "Time taken by merging pophaz" - if (comp_pp) merge_msg <- paste0(merge_msg, " and computing pp") - merge_msg <- paste0(merge_msg, ": ") - if (verbose) cat(paste0(merge_msg, timetaken(pophaztime), "\n")) - - - } - - # dropping after merging ----------------------------------------------------- - if (drop_after) { - l <- intelliDrop(x = l, breaks = breaks) - } - - if (verbose) cat("Number of rows after splitting: ", nrow(l),"\n") - - - # aggregating if appropriate ------------------------------------------------- - if (agTy != "NULL") { - - setcolsnull(l, keep = c("lex.id","lex.dur", "fot", "per", "age", "lex.Cst", "lex.Xst", agVars, "pop.haz", "pp")) - - sumVars <- NULL - if ("pop.haz" %in% names(l)) { - if ("d.exp" %in% names(l)) stop("data had variable named 'd.exp' by which to aggregate, which would be overwritten due to aggregating expected numbers of cases (you have supplied pophaz AND are aggregating); please rename / remove it first.") - l[, c("d.exp") := pop.haz*lex.dur ] - sumVars <- c(sumVars, "d.exp") - } - if ("pop.haz" %in% names(l) && comp_pp && "pp" %in% names(l)) { - forceLexisDT(l, breaks = breaks, allScales = c("fot", "per", "age")) - ppFigs <- comp_pp_weighted_figures(lex = l, haz = "pop.haz", pp = "pp", event.ind = NULL) - bad_pp_vars <- intersect(names(ppFigs), names(l)) - if (length(bad_pp_vars) > 0L) { - bad_pp_vars <- paste0("'",bad_pp_vars, "'", collapse = ", ") - stop("Data had variable(s) named ", bad_pp_vars, ", by which to aggregate, which would be overwritten due to aggregating expected numbers of cases (you have supplied pophaz AND are aggregating); please rename / remove them first") - } - l[, names(ppFigs) := ppFigs] - sumVars <- c(sumVars, names(ppFigs)) - rm(ppFigs) - - } - - if (verbose) cat("Starting aggregation of split data... \n") - setDT(l) - forceLexisDT(l, allScales = c("fot", "per", "age"), breaks = breaks) - l <- try(aggre(lex = l, by = aggSub, type = aggre.type, verbose = verbose, sum.values = sumVars)) - if (inherits(l, "try-error")) stop("Something went wrong when calling aggre() within lexpand(). Usual suspect: bad 'by' argument. Error message from aggre(): - ", paste0(l[[1]])) - if (verbose) cat("Aggregation done. \n") - - if (!return_DT() && is.data.table(l)) setDFpe(l) - - } else { - - - # last touch-up -------------------------------------------------------------- - ## sometimes problems with releasing memory - gc() - - breaks <- lapply(c("fot","per","age"), function(ts_nm) { - breaks[[ts_nm]] - }) - names(breaks) <- c("fot","per","age") - - ## handle attributes - setkeyv(l, c("lex.id", "lex.multi")) - set(l, j = "lex.multi", value = NULL) - setattr(l, "time.scales", c("fot","per","age")) - setattr(l, "time.since", c("","","")) - setattr(l, "breaks", breaks) - setattr(l, "class", c("Lexis","data.table","data.frame")) - if (!return_DT() && is.data.table(l)) setDFpe(l) - - - - } - - if (verbose) cat("Time taken by lexpand(): ", timetaken(start_time), "\n") - - return(l[]) -} - - -globalVariables(c('.EACHI', "dg_date", "ex_date", "bi_date")) + +#' @title Split case-level observations +#' @author Joonas Miettinen +#' @description Given subject-level data, data is split +#' by calendar time (\code{per}), \code{age}, and follow-up +#' time (\code{fot}, from 0 to the end of follow-up) +#' into subject-time-interval rows according to +#' given \code{breaks} and additionally processed if requested. +#' @param data dataset of e.g. cancer cases as rows +#' @param birth birth time in date format +#' or fractional years; string, symbol or expression +#' @param entry entry time in date format +#' or fractional years; string, symbol or expression +#' @param exit exit from follow-up time in date +#' format or fractional years; string, symbol or expression +#' @param event advanced: time of possible event differing from \code{exit}; +#' typically only used in certain SIR/SMR calculations - see Details; +#' string, symbol or expression +#' @param status variable indicating type of event at \code{exit} or \code{event}; +#' e.g. \code{status = status != 0}; expression or quoted variable name +#' @param entry.status input in the same way as \code{status}; +#' status at \code{entry}; see Details +#' @param id optional; an id variable; e.g. \code{id = my_id}; +#' string, symbol or expression +#' @param overlapping advanced, logical; if \code{FALSE} AND if \code{data} contains +#' multiple rows per subject, +#' ensures that the timelines of \code{id}-specific rows do not overlap; +#' this ensures e.g. that person-years are only computed once per subject +#' in a multi-state paradigm +#' @param aggre e.g. \code{aggre = list(sex, fot)}; +#' a list of unquoted variables and/or expressions thereof, +#' which are interpreted as factors; data events and person-years will +#' be aggregated by the unique combinations of these; see Details +#' @param aggre.type one of \code{c("unique","cartesian")}; +#' can be abbreviated; see Details +#' @param breaks a named list of vectors of time breaks; +#' e.g. \code{breaks = list(fot=0:5, age=c(0,45,65,Inf))}; see Details +#' @param drop logical; if \code{TRUE}, drops all resulting rows +#' after splitting that reside outside +#' the time window as defined by the given breaks (all time scales) +#' @param pophaz a dataset of population hazards to merge +#' with split data; see Details +#' @param pp logical; if \code{TRUE}, computes Pohar-Perme weights using +#' \code{pophaz}; adds variable with reserved name \code{pp}; +#' see Details for computing method +#' @param subset a logical vector or any logical condition; data is subsetted +#' before splitting accordingly +#' @param merge logical; if \code{TRUE}, retains all +#' original variables from the data +#' @param verbose logical; if \code{TRUE}, the function is chatty and +#' returns some messages along the way +#' @param ... e.g. \code{fot = 0:5}; instead of specifying a \code{breaks} list, +#' correctly named breaks vectors can be given +#' for \code{fot}, \code{age}, and \code{per}; these override any breaks in the +#' \code{breaks} list; see Examples +#' +#' +#' +#' @details +#' \strong{Basics} +#' +#' \code{\link{lexpand}} splits a given data set (with e.g. cancer diagnoses +#' as rows) to subintervals of time over +#' calendar time, age, and follow-up time with given time breaks +#' using \code{\link{splitMulti}}. +#' +#' The dataset must contain appropriate +#' \code{Date} / \code{IDate} / \code{date} format or +#' other numeric variables that can be used +#' as the time variables. +#' +#' You may take a look at a simulated cohort +#' \code{\link{sire}} as an example of the +#' minimum required information for processing data with \code{lexpand}. +#' +#' Many arguments can be supplied as a character string naming the appropriate +#' variable (e.g. \code{"sex"}), as a symbol (e.g. \code{sex}) or as an expression +#' (e.g. \code{factor(sex, 0:1, c("m", "f"))}) for flexibility. +#' +#' \strong{Breaks} +#' +#' You should define all breaks as left inclusive and right exclusive +#' time points (e.g.\code{[a,b)} ) +#' for 1-3 time dimensions so that the last member of a breaks vector +#' is a meaningful "final upper limit", +#' e.g. \code{per = c(2002,2007,2012)} +#' to create a last subinterval of the form \code{[2007,2012)}. +#' +#' All breaks are explicit, i.e. if \code{drop = TRUE}, +#' any data beyond the outermost breaks points are dropped. +#' If one wants to have unspecified upper / lower limits on one time scale, +#' use \code{Inf}: e.g. \code{breaks = list(fot = 0:5, age = c(0,45,Inf))}. +#' Breaks for \code{per} can also be given in +#' \code{Date}/\code{IDate}/\code{date} format, whereupon +#' they are converted to fractional years before used in splitting. +#' +#' The \code{age} time scale can additionally +#' be automatically split into common age grouping schemes +#' by naming the scheme with an appropriate character string: +#' +#' \itemize{ +#' \item \code{"18of5"}: age groups 0-4, 5-9, 10-14, ..., 75-79, 80-84, 85+ +#' \item \code{"20of5"}: age groups 0-4, 5-9, 10-14, ..., 85-89, 90-94, 95+ +#' \item \code{"101of1"}: age groups 0, 1, 2, ..., 98, 99, 100+ +#' } +#' +#' \strong{Time variables} +#' +#' If any of the given time variables +#' (\code{birth}, \code{entry}, \code{exit}, \code{event}) +#' is in any kind of date format, they are first coerced to +#' fractional years before splitting +#' using \code{\link{get.yrs}} (with \code{year.length = "actual"}). +#' +#' Sometimes in e.g. SIR/SMR calculation one may want the event time to differ +#' from the time of exit from follow-up, if the subject is still considered +#' to be at risk of the event. If \code{event} is specified, the transition to +#' \code{status} is moved to \code{event} from \code{exit} +#' using \code{\link[Epi]{cutLexis}}. See Examples. +#' +#' \strong{The status variable} +#' +#' The statuses in the expanded output (\code{lex.Cst} and \code{lex.Xst}) +#' are determined by using either only \code{status} or both \code{status} +#' and \code{entry.status}. If \code{entry.status = NULL}, the status at entry +#' is guessed according to the type of variable supplied via \code{status}: +#' For numeric variables it will be zero, for factors the first level +#' (\code{levels(status)[1]}) and otherwise the first unique value in alphabetical +#' order (\code{sort(unique(status))[1]}). +#' +#' Using numeric or factor status +#' variables is strongly recommended. Logical expressions are also allowed +#' (e.g. \code{status = my_status != 0L}) and are converted to integer internally. +#' +#' \strong{Merging population hazard information} +#' +#' To enable computing relative/net survivals with \code{\link{survtab}} +#' and \code{\link{relpois}}, \code{lexpand} merges an appropriate +#' population hazard data (\code{pophaz}) to the expanded data +#' before dropping rows outside the specified +#' time window (if \code{drop = TRUE}). \code{pophaz} must, for this reason, +#' contain at a minimum the variables named +#' \code{agegroup}, \code{year}, and \code{haz}. \code{pophaz} may contain additional variables to specify +#' different population hazard levels in different strata; e.g. \code{popmort} includes \code{sex}. +#' All the strata-defining variables must be present in the supplied \code{data}. \code{lexpand} will +#' automatically detect variables with common names in the two datasets and merge using them. +#' +#' Currently \code{year} must be an integer variable specifying the appropriate year. \code{agegroup} +#' must currently also specify one-year age groups, e.g. \code{popmort} specifies 101 age groups +#' of length 1 year. In both +#' \code{year} and \code{agegroup} variables the values are interpreted as the lower bounds of intervals +#' (and passed on to a \code{cut} call). The mandatory variable \code{haz} +#' must specify the appropriate average rate at the person-year level; +#' e.g. \code{haz = -log(survProb)} where \code{survProb} is a one-year conditional +#' survival probability will be the correct hazard specification. +#' +#' The corresponding \code{pophaz} population hazard value is merged by using the mid points +#' of the records after splitting as reference values. E.g. if \code{age=89.9} at the start +#' of a 1-year interval, then the reference age value is \code{90.4} for merging. +#' This way we get a "typical" population hazard level for each record. +#' +#' \strong{Computing Pohar-Perme weights} +#' +#' If \code{pp = TRUE}, Pohar-Perme weights +#' (the inverse of cumulative population survival) are computed. This will +#' create the new \code{pp} variable in the expanded data. \code{pp} is a +#' reserved name and \code{lexpand} throws exception if a variable with that name +#' exists in \code{data}. +#' +#' When a survival interval contains one or several rows per subject +#' (e.g. due to splitting by the \code{per} scale), +#' \code{pp} is cumulated from the beginning of the first record in a survival +#' interval for each subject to the mid-point of the remaining time within that +#' survival interval, and that value is given for every other record +#' that a given person has within the same survival interval. +#' +#' E.g. with 5 rows of duration \code{1/5} within a survival interval +#' \code{[0,1)]}, \code{pp} is determined for all records by a cumulative +#' population survival from \code{0} to \code{0.5}. Th existing accuracy is used, +#' so that the weight is cumulated first up to the end of the second row +#' and then over the remaining distance to the mid-point (first to 0.4, then to +#' 0.5). This ensures that more accurately merged population hazards are fully +#' used. +#' +#' \strong{Event not at end of follow-up & overlapping time lines} +#' +#' \code{event} may be used if the event indicated by \code{status} should +#' occur at a time differing from \code{exit}. If \code{event} is defined, +#' \code{cutLexis} is used on the data set after coercing it to the \code{Lexis} +#' format and before splitting. Note that some values of \code{event} are allowed +#' to be \code{NA} as with \code{cutLexis} to accommodate observations +#' without an event occurring. +#' +#' Additionally, setting \code{overlapping = FALSE} ensures that (irrespective +#' of using \code{event}) the each subject defined by \code{id} only has one +#' continuous time line instead of possibly overlapping time lines if +#' there are multiple rows in \code{data} by \code{id}. +#' +#' +#' \strong{Aggregating} +#' +#' Certain analyses such as SIR/SMR calculations require tables of events and +#' person-years by the unique combinations (interactions) of several variables. +#' For this, \code{aggre} can be specified as a list of such variables +#' (preferably \code{factor} variables but not mandatory) +#' and any arbitrary functions of the +#' variables at one's disposal. E.g. +#' +#' \code{aggre = list(sex, agegr = cut(dg_age, 0:100))} +#' +#' would tabulate events and person-years by sex and an ad-hoc age group +#' variable. Every ad-hoc-created variable should be named. +#' +#' \code{fot}, \code{per}, and \code{age} are special reserved variables which, +#' when present in the \code{aggre} list, are output as categories of the +#' corresponding time scale variables by using +#' e.g. +#' +#' \code{cut(fot, breaks$fot, right=FALSE)}. +#' +#' This only works if +#' the corresponding breaks are defined in \code{breaks} or via "\code{...}". +#' E.g. +#' +#' \code{aggre = list(sex, fot.int = fot)} with +#' +#' \code{breaks = list(fot=0:5)}. +#' +#' The output variable \code{fot.int} in the above example will have +#' the lower limits of the appropriate intervals as values. +#' +#' \code{aggre} as a named list will output numbers of events and person-years +#' with the given new names as categorizing variable names, e.g. +#' \code{aggre = list(follow_up = fot, gender = sex, agegroup = age)}. +#' +#' The output table has person-years (\code{pyrs}) and event counts +#' (e.g. \code{from0to1}) as columns. Event counts are the numbers of transitions +#' (\code{lex.Cst != lex.Xst}) or the \code{lex.Xst} value at a subject's +#' last record (subject possibly defined by \code{id}). +#' +#' If \code{aggre.type = "unique"} (alias \code{"non-empty"}), +#' the above results are computed for existing +#' combinations of expressions given in \code{aggre}, but also for non-existing +#' combinations if \code{aggre.type = "cartesian"} (alias \code{"full"}). E.g. if a +#' factor variable has levels \code{"a", "b", "c"} but the data is limited +#' to only have levels \code{"a", "b"} present +#' (more than zero rows have these level values), the former setting only +#' computes results for \code{"a", "b"}, and the latter also for \code{"c"} +#' and any combination with other variables or expression given in \code{aggre}. +#' In essence, \code{"cartesian"} forces also combinations of variables used +#' in \code{aggre} that have no match in data to be shown in the result. +#' +#' If \code{aggre} is not \code{NULL} and \code{pophaz} has been supplied, +#' \code{lexpand} also aggregates the expected counts of events, which +#' appears in the output data by the reserved name \code{d.exp}. Additionally, +#' having \code{pp = TRUE} causes \code{lexpand} to also compute various +#' Pohar-Perme weighted figures necessary for computing Pohar-Perme net survivals +#' with \code{\link{survtab_ag}}. This can be slow, so consider what is really +#' needed. The Pohar-Perme weighted figures have the suffix \code{.pp}. +#' +#' @return +#' If \code{aggre = NULL}, returns +#' a \code{data.table} or \code{data.frame} +#' (depending on \code{options("popEpi.datatable")}; see \code{?popEpi}) +#' object expanded to accommodate split observations with time scales as +#' fractional years and \code{pophaz} merged in if given. Population +#' hazard levels in new variable \code{pop.haz}, and Pohar-Perme +#' weights as new variable \code{pp} if requested. +#' +#' If \code{aggre} is defined, returns a long-format +#' \code{data.table}/\code{data.frame} with the variable \code{pyrs} (person-years), +#' and variables for the counts of transitions in state or state at end of +#' follow-up formatted \code{fromXtoY}, where \code{X} and \code{Y} are +#' the states transitioned from and to, respectively. The data may also have +#' the columns \code{d.exp} for expected numbers of cases and various +#' Pohar-Perme weighted figures as identified by the suffix \code{.pp}; see +#' Details. +#' +#' +#' @examples +#' \dontrun{ +#' ## prepare data for e.g. 5-year cohort survival calculation +#' x <- lexpand(sire, breaks=list(fot=seq(0, 5, by = 1/12)), +#' birth = bi_date, entry = dg_date, exit = ex_date, +#' status = status != 0, pophaz=popmort) +#' +#' ## prepare data for e.g. 5-year "period analysis" for 2008-2012 +#' BL <- list(fot = seq(0, 5, by = 1/12), per = c("2008-01-01", "2013-01-01")) +#' x <- lexpand(sire, breaks = BL, +#' birth = bi_date, entry = dg_date, exit = ex_date, +#' pophaz=popmort, status = status != 0) +#' +#' ## aggregating +#' BL <- list(fot = 0:5, per = c("2003-01-01","2008-01-01", "2013-01-01")) +#' ag <- lexpand(sire, breaks = BL, status = status != 0, +#' birth = bi_date, entry = dg_date, exit = ex_date, +#' aggre=list(sex, period = per, surv.int = fot)) +#' +#' ## aggregating even more +#' ag <- lexpand(sire, breaks = BL, status = status != 0, +#' birth = bi_date, entry = dg_date, exit = ex_date, +#' aggre=list(sex, period = per, surv.int = fot), +#' pophaz = popmort, pp = TRUE) +#' +#' ## using "..." +#' x <- lexpand(sire, fot=0:5, status = status != 0, +#' birth = bi_date, entry = dg_date, exit = ex_date, +#' pophaz=popmort) +#' +#' x <- lexpand(sire, fot=0:5, status = status != 0, +#' birth = bi_date, entry = dg_date, exit = ex_date, +#' aggre=list(sex, surv.int = fot)) +#' +#' ## using the "event" argument: it just places the transition to given "status" +#' ## at the "event" time instead of at the end, if possible using cutLexis +#' x <- lexpand(sire, status = status, event = dg_date, +#' birth = bi_date, entry = dg_date, exit = ex_date,) +#' +#' ## aggregating with custom "event" time +#' ## (the transition to status is moved to the "event" time) +#' x <- lexpand(sire, status = status, event = dg_date, +#' birth = bi_date, entry = dg_date, exit = ex_date, +#' per = 1970:2014, age = c(0:100,Inf), +#' aggre = list(sex, year = per, agegroup = age)) +#' +#' } +#' +#' @import data.table +#' @import Epi +#' @family splitting functions +#' @family aggregation functions +#' @seealso +#' \code{\link[Epi]{Lexis}}, \code{\link{popmort}} +#' @export +lexpand <- function(data, + birth=NULL, entry=NULL, exit=NULL, event=NULL, + status = status != 0, + entry.status = NULL, + breaks = list(fot=c(0,Inf)), + id = NULL, + overlapping = TRUE, + aggre = NULL, + aggre.type = c("unique", "cartesian"), + drop=TRUE, + pophaz = NULL, pp = TRUE, + subset = NULL, + merge=TRUE, verbose = FALSE, + ...) { + start_time <- proc.time() + + TF <- environment() + PF <- parent.frame(1L) + + ## data checks + if ( missing(data) || nrow(data) == 0) stop("no data found") + + if (!is.data.frame(data)) stop("data must be a data.frame or data.table") + + ## to instate global variables to appease R CMD CHECK + .EACHI <- lex.status <- lexpand.id <- lex.exit <- lex.birth <- + lex.entry <- lex.event <- temp.id <- cd <- fot <- age <- per <- + lex.id <- lex.multi <- pop.haz <- lex.Cst <- lex.Xst <- lex.dur <- NULL + + + ## test conflicting variable names ------------------------------------------- + added_vars <- c("fot", "per", "age", "lex.id", "lex.dur", "lex.Xst", "lex.Cst") + if (!is.null(pophaz)) added_vars <- if (pp) c(added_vars, "pp", "pop.haz") else c(added_vars, "pop.haz") + conflicted_vars <- intersect(added_vars, names(data)) + + if (merge && length(conflicted_vars) > 0) { + conflicted_vars <- paste0("'", conflicted_vars, "'", collapse = ", ") + warning("'data' already had variable(s) named ", conflicted_vars, " which lexpand will create, and you have merge = TRUE; this may result in unexpected problems. Rename the variable(s)?") + } + rm(added_vars, conflicted_vars) + + ## test aggre type ----------------------------------------------------------- + aggre.type <- match.arg(aggre.type[1L], c("cartesian", "non-empty", "unique", "cross-product", "full")) + if (aggre.type == "cross-product") { + aggre.type <- "cartesian" + warning("aggre.type value 'cross-product' deprecated and renamed to 'cartesian'; please use that in the future") + } + + ## subsetting----------------------------------------------------------------- + ## no copy taken of data! + subset <- substitute(subset) + subset <- evalLogicalSubset(data, subset) + + ## prepping time variables --------------------------------------------------- + l <- substitute(list(birth, entry, exit, event, status, entry.status, id)) + rm(birth, entry, exit, event, status, entry.status, id) + + lc <- unlist(lapply(l, deparse)) + lc <- lc[-1] ## first always "list" + + wh <- which(lc != "NULL") + lex_vars <- c("lex.birth","lex.entry","lex.exit","lex.event", "lex.status", "lex.entry.status", "lexpand.id")[wh] + if (any(!c("lex.birth", "lex.entry", "lex.exit", "lex.status") %in% lex_vars)) stop("birth, entry, exit and status are mandatory") + + l <- eval(l, envir = data[subset, ], enclos = PF) + l[-wh] <- NULL + + + ## vars can be given as character strings of variable names + isChar <- sapply(l, is.character, simplify = TRUE) + if (any(isChar)) { + isShort <- sapply(l, function(x) {length(x) == 1L}, simplify = TRUE) + whOneChar <- which(isShort & isChar) + + whBadChar <- NULL + if (length(whOneChar) > 0) { + testBadChar <- unlist(l[whOneChar]) + whBadChar <- whOneChar[!testBadChar %in% names(data)] + } + + if (length(whBadChar) > 0) { + + badChar <- l[whBadChar] + badChar <- paste0(badChar[1:min(length(badChar), 5L)], collapse = ", ") + stop("Variables given as a character of length one are interpreted as variable names in data, + but some given characters were not found in data; + check names or input as factor/Date; + first five bad names: ", badChar) + } + + l[whOneChar] <- lapply(l[whOneChar], function(x) {data[subset, ][[x]]}) + } + + + l <- as.data.table(l) + setnames(l, names(l), lex_vars) + + + rm(lex_vars) + if (!all(c("lex.birth","lex.entry","lex.exit","lex.status") %in% names(l))) { + stop("birth, entry, exit and status are mandatory, but at least one was misspecified/NULL") + } + + if (is.logical(l$lex.status)) l[, lex.status := as.integer(lex.status)] + if (is.null(l$lexpand.id)) l[, lexpand.id := 1:.N] + + ## checks for merging style -------------------------------------------------- + if (!is.null(pophaz)) { + all_names_present(pophaz, c("agegroup","year","haz")) + othMergeVars <- setdiff(names(pophaz), c("agegroup","year","haz")) + badOthMergeVars <- setdiff(othMergeVars, names(data)) + if (length(badOthMergeVars) > 0) { + badOthMergeVars <- paste0("'", badOthMergeVars, "'", collapse = ", ") + stop("Following variables exist in pophaz but do not exist in data: ", badOthMergeVars, ". Make sure data and pophaz contain variables with the same names that you intend to merge by.") + } + } + + if (is.null(pophaz)) { + comp_pp <- FALSE + } else { + comp_pp <- TRUE + } + + ## internally we have "delta" and "actual" methods, + ## the latter being experimental and not visible in the documentation. + ## "delta" is always used if pp = TRUE. + ## it is possible to choose pp = "actual" as well, though, if you know + ## about it. + comp_pp <- FALSE + if (is.logical(pp) && pp) pp <- "delta" + + if (!is.null(pp) && is.character(pp)) { + pp <- match.arg(pp, c("delta", "actual")) + comp_pp <- TRUE + } + if (comp_pp && "pp" %in% names(data)) stop("variable named 'pp' in data; this is a reserved name for pohar-perme weights, please rename / remove the variable in data") + + ## ensure given breaks make any sense ---------------------------------------- + + bl <- list(...) + lna <- names(bl) + bad_lna <- setdiff(lna, c("fot","per","age")) + if (length(bad_lna) > 0) { + bad_lna <- paste0("'", bad_lna, "'", collapse = ", ") + stop("only arguments named 'fot', 'per' or 'age' currently allowed to be passed via '...'; did you mistype an argument? bad args: ", bad_lna) + } + lna <- intersect(names(bl), c("fot","per","age")) + if (length(lna) > 0) { + bl <- bl[lna] + if (!is.null(breaks)) breaks[lna] <- NULL + breaks <- c(breaks, bl) + } + rm(bl, lna) + + brna <- names(breaks) + if (length(brna) != length(breaks)) { + stop("all elements in breaks list must be named, e.g. list(fot = 0:5, age=c(0,45,65,Inf))") + } + + brna <- intersect(brna, c("fot","per","age")) + if (length(brna) == 0) { + breaks$fot <- c(0,Inf) + } + + if ("age" %in% brna && is.character(breaks$age)) { + schemeNames <- c("18of5", "20of5", "101of1") + if (!breaks$age %in% schemeNames) stop("You supplied '", breaks$age, "' as breaks for the age scale, but allowed character strings are: ", paste0("'", schemeNames, "'", collapse = ",")) + brSchemes <- list(c(seq(0, 85, 5)), c(seq(0, 95, 5), Inf), c(0:100, Inf)) + names(brSchemes) <- paste0("age_", schemeNames) + breaks$age <- brSchemes[paste0("age_",breaks$age)] + } + + + if (any(sapply(breaks, length) == 1L)) { + stop("any given non-null vector of breaks must have more than one break!") + } + + # convert to fractional years ------------------------------------------------ + + char2date <- function(obj) { + if (is.character(obj) || inherits(obj, "date")) { + return(as.IDate(obj)) + } else { + return(obj) + } + } + + date2yrs <- function(obj) { + if (is.Date(obj) || inherits(obj, "date")) { + get.yrs(obj, year.length = "actual") + } else { + obj + } + } + + breaks <- lapply(breaks, char2date) + breaks <- lapply(breaks, date2yrs) + + time_vars <- intersect(names(l), c("lex.birth", "lex.entry", "lex.exit","lex.event")) + l[, (time_vars) := lapply(.SD, date2yrs) , .SDcols = time_vars] + + if (verbose) cat("given birth, entry, exit, status etc. variables after coercion to numeric \n") + if (verbose) print(l) + + # check data consistency for overlapping = FALSE ----------------------------- + ## not allowed: for any one unique subject to be true for + ## multiple rows (if overlapping = TRUE): + ## * same event values + ## * same entry values + if (!overlapping) { + if ("lex.event" %in% names(l)) { + if (all(is.na(l$lex.event))) stop("ALL 'event' values are NA; if this is as intended, please use event = NULL instead") + + if (any(duplicated(l, by = c("lexpand.id", "lex.event")))) { + stop("subject(s) defined by lex.id had several rows where 'event' time had the same value, which is not supported with overlapping = FALSE; perhaps separate them by one day?") + } + if (any(l[!is.na(lex.event), lex.entry == lex.event])) { + stop("some rows have simultaneous 'entry' and 'event', which is not supported with overlapping = FALSE; perhaps separate them by one day?") + } + } else if (any(duplicated(l, by = c("lexpand.id", "lex.exit")))) { + stop("subject(s) defined by lex.id had several rows where 'exit' time had the same value, which is not supported without 'event' defined; use 'event' or perhaps separate them by one day?") + } + + + } + + + # dropping unuseful records -------------------------------------------------- + test_times <- function(condition, msg, old_subset=l_subset, DT=l) { + + condition <- substitute(condition) + condition <- eval(condition, envir = DT, enclos = parent.frame(1L)) + + new_subset <- old_subset & !(condition & !is.na(condition)) + old_n <- sum(old_subset) + new_n <- sum(new_subset) + + if (new_n == 0L) { + stop("dropping rows where ", msg, " resulted in zero rows left. likely problem: misdefined time variables") + } + + if (new_n < old_n) { + message(paste0("dropped ", old_n-new_n, " rows where ", msg)) + } + return(new_subset) + } + + l_subset <- rep(TRUE, nrow(l)) + + l_subset <- test_times(is.na(lex.birth), "birth values are missing") + l_subset <- test_times(is.na(lex.entry), "entry values are missing") + l_subset <- test_times(is.na(lex.exit), "exit values are missing") + + if (!is.null(breaks$per)) { + l_subset <- test_times(lex.exit < min(breaks$per), "subjects left follow-up before earliest per breaks value") + } + if (!is.null(breaks$age)) { + l_subset <- test_times(lex.exit - lex.birth < min(breaks$age), "subjects left follow-up before lowest age breaks value") + } + if (!is.null(breaks$fot)) { + l_subset <- test_times(lex.exit - lex.entry < min(breaks$fot), "subjects left follow-up before lowest fot breaks value") + } + l_subset <- test_times(lex.birth >= lex.exit, "birth >= exit") + l_subset <- test_times(lex.entry == lex.exit, "entry == exit") + l_subset <- test_times(lex.entry > lex.exit, "entry > exit") + l_subset <- test_times(lex.birth > lex.entry, "birth > entry") + if (!is.null(l$lex.event)) { + l_subset <- test_times(lex.event > lex.exit, "event > exit") + l_subset <- test_times(lex.event < lex.entry, "event < entry") + } + l <- l[l_subset] + + if (verbose) cat("Time taken by checks, prepping and test: ", timetaken(start_time), "\n") + + # Lexis coercion ------------------------------------------------------------- + + ## status definitions + setnames(l, "lex.status", "lex.Xst") + if ("lex.entry.status" %in% names(l)) { + setnames(l, "lex.entry.status", "lex.Cst") + } else { + if (is.factor(l$lex.Xst)) { + l[, lex.Cst := factor(levels(lex.Xst)[1L], levels=levels(lex.Xst))] + } else if (is.double(l$lex.Xst)) { + l[, lex.Cst := 0] + } else if (is.integer(l$lex.Xst)) { + l[, lex.Cst := 0L] + } else { + l[, lex.Cst := sort(unique(lex.Xst))[1L]] + } + + } + + # ensure common labels for factors etc. + harmonizeStatuses(x = l, C = "lex.Cst", X = "lex.Xst") + + ## time scales and duration + l[, lex.dur := lex.exit - lex.entry] + l[, fot := 0] + setnames(l, "lex.entry", "per") + l[, age := per-lex.birth] + setnames(l, "lexpand.id", "lex.id") + + ## for merging data with l later + if (merge) { + idt <- data.table(temp.id = 1:nrow(l)) + l[, temp.id := 1:.N] + } + + ## crop time scale values to obey breaks limits and drop if necessary + ## NOTE: goes wrong if need to compute pp weights! + # if (drop && !pp) { + # intelliCrop(x = l, breaks = breaks, allScales = c("fot", "per", "age"), cropStatuses = TRUE) + # l <- intelliDrop(x = l, breaks = breaks, dropNegDur = TRUE) + # } + + + setcolsnull(l, colorder=TRUE, soft=TRUE, + keep = c("lex.id","fot","per","age", + "lex.dur", "lex.Cst", "lex.Xst", "lex.event", "temp.id")) + setattr(l, "class", c("Lexis", "data.table", "data.frame")) + setattr(l, "time.scales", c("fot","per","age")) + setattr(l, "time.since", c("","","")) + + if (verbose) cat("data just after Lexis coercion: \n") + if (verbose) print(l) + + # event not at exit time ----------------------------------------------------- + + if ("lex.event" %in% names(l)) { + + if (!overlapping) { + + ## using lex.event time, ensure coherence of lex.Cst & lex.Xst + ## before cutLexis() + tmpFE <- makeTempVarName(l, pre = "fot_end_") + l[, (tmpFE) := fot + lex.dur] + setkeyv(l, c("lex.id", "lex.event", tmpFE)) + tmpLX <- makeTempVarName(l, pre = "lag_lex.Xst_") + l[, (tmpLX) := shift(lex.Xst, n = 1, type = "lag"), by = lex.id] + l[!is.na(get(tmpLX)), lex.Cst := get(tmpLX)] + l[, c(tmpFE, tmpLX) := NULL] + rm(tmpFE, tmpLX) + + } + + if (verbose) cutt <- proc.time() + setDF(l) + setattr(l, "class", c("Lexis", "data.frame")) + l <- Epi::cutLexis(l, cut = l$lex.event, timescale = "per", new.state = l$lex.Xst, precursor.states = unique(l$lex.Cst)) + setDT(l) + setattr(l, "class", c("Lexis", "data.table", "data.frame")) + if (verbose) cat("Time taken by cutLexis when defining event time points: ", timetaken(cutt), "\n") + + if (verbose) cat("Data just after using cutLexis: \n") + if (verbose) print(l[]) + + } + + + # overlapping timelines? ----------------------------------------------------- + + if (!overlapping && any(duplicated(l$lex.id))) { + tmpFE <- makeTempVarName(l, pre = "fot_end_") + l[, (tmpFE) := fot + lex.dur] + ## don't keep duplicated rows: + ## same end points imply fully overlapping time lines + ## e.g. + ## ---> + ## -> + ## --> + ## results in + ## -> + ## ---> + ## we only keep the longest time line with a unique end point. + + # setkeyv(l, c("lex.id", tmpFE, "fot")) + tmpLE <- intersect(names(l), "lex.event") + LEval <- if (length(tmpLE) == 0) NULL else -1 + + setorderv(l, c("lex.id", tmpFE, tmpLE, "fot"), c(1,1,LEval,1)) + l <- unique(l, by = c("lex.id", tmpFE)) + + ## end points are kept but starting points are "rolled" + ## from first to last row by lex.id to ensure non-overlappingness; e.g. + ## -> + ## ---> + ## results in + ## -> + ## -> + # setkeyv(l, c("lex.id", tmpFE)) + # setorderv(l, c("lex.id", tmpLE, tmpFE), c(1, LEval, 1)) + setkeyv(l, c("lex.id", tmpLE, tmpFE)) + + if (verbose) cat("data just before fixing overlapping time lines \n") + if (verbose) print(l) + l[, lex.dur := get(tmpFE) - c(min(fot), get(tmpFE)[-.N]), by = lex.id] + l[, fot := get(tmpFE) - lex.dur] + cumDur <- l[, list(age = min(age), per = min(per), cd = c(0, cumsum(lex.dur)[-.N])), by = lex.id] + cumDur[, age := age+cd] + cumDur[, per := per+cd] + l[, age := cumDur$age] + l[, per := cumDur$per] + l[, (tmpFE) := NULL]; rm(cumDur) + + + ## if event used, first row up to event, second row from first event to etc... + } + + setcolsnull(l, "lex.event", soft = TRUE) ## note: lex.event needed in overlapping procedures + + if (verbose) cat("time and status variables before splitting: \n") + if (verbose) print(l) + if ("id" %in% ls()) rm("id") + + + # splitting ------------------------------------------------------------------ + + ## determine whether to drop data only after splitting and merging + drop_after <- FALSE + if (drop == TRUE && comp_pp) { + drop <- FALSE + drop_after <- TRUE + } + + forceLexisDT(l, breaks = list(fot = NULL, per = NULL, age = NULL), + allScales = c("fot", "per", "age")) + if (verbose) splittime <- proc.time() + l <- splitMulti(l, breaks = breaks, + drop = drop, verbose=FALSE, merge = TRUE) + setDT(l) + setkey(l, lex.id, fot) + l[, lex.multi := 1:.N, by = lex.id] + if (verbose) cat("Time taken by splitting:", timetaken(splittime), "\n") + + # merging other variables from data ------------------------------------------ + + if (merge) { + setkey(l, temp.id) + + temp <- data.table(idt, data[subset & !is.na(subset), ][l_subset, ]) + setkey(temp, temp.id) + + l <- temp[l] + + rm(temp, idt) + setcolsnull(l, "temp.id") + + lex_vars <- c("lex.id","lex.multi","fot","per","age", "lex.dur", "lex.Cst", "lex.Xst") + setcolorder(l, c(lex_vars, setdiff(names(l), lex_vars))) + } + rm(data, subset, l_subset) + + ## aggregating checks -------------------------------------------------------- + ## NOTE: aggre evaled here using small data subset to check that all needed + ## variables are found, etc. + aggSub <- substitute(aggre) + agTest <- evalPopArg(arg = aggSub, data = l[1:min(10L, .N), ], + enclos = PF, recursive = TRUE, DT = TRUE) + agTy <- attr(agTest, "arg.type") + if (is.null(agTy)) agTy <- "NULL" + aggSub <- attr(agTest, "quoted.arg") + agVars <- attr(agTest, "all.vars") + rm(aggre) + + # merging pophaz and pp-weighting -------------------------------------------- + if (!is.null(pophaz)) { + + pophaztime <- proc.time() + + if (any(c("haz", "pop.haz") %in% names(l))) stop("case data had variable(s) named 'haz' / 'pop.haz', which are reserved for lexpand's internal use. rename/remove them please.") + # merge surv.int information ----------------------------------------------- + NULL_FOT <- FALSE + if (is.null(breaks$fot)) { + breaks$fot <- l[, c(0, max(fot+lex.dur))] + NULL_FOT <- TRUE + } + + breaks$fot <- sort(unique(breaks$fot)) + # handle pophaz data ------------------------------------------------------- + + if (!"haz" %in% names(pophaz)) stop("no 'haz' variable in pophaz; please rename you hazard variable to 'haz'") + yBy <- xBy <- setdiff(names(pophaz), c("haz")) + if (c("year") %in% yBy) xBy[yBy == "year"] <- "per" + if (c("agegroup") %in% yBy) xBy[yBy == "agegroup"] <- "age" + yByOth <- setdiff(yBy, c("year", "agegroup")) + + if (any(!yByOth %in% names(l))) + stop("Following variable names not common between pophaz and data: ", paste0("'", yByOth[!yByOth %in% names(l)], "'", collapse = ", ")) + + l <- cutLowMerge(x = l, y = pophaz, by.x = xBy, by.y = yBy, all.x = TRUE, + all.y = FALSE, mid.scales = c("per", "age"), old.nums = TRUE) + setnames(l, "haz", "pop.haz") + + ## check if l's merging time variables were within pophaz's limits --------- + nNA <- l[is.na(pop.haz), .N] + if (nNA > 0) message("WARNING: after merging pophaz, ", nNA, " rows in split data have NA hazard values!") + + names(yBy) <- xBy + names(xBy) <- yBy + for (k in intersect(c("per", "age"), xBy)) { + yVar <- yBy[k] + kLo <- min(pophaz[[yVar]]) + kHi <- max(pophaz[[yVar]]) + mid <- l[, get(k) + lex.dur] + nLo <- sum(mid < kLo - .Machine$double.eps^0.5) + nHi <- sum(mid > kHi - .Machine$double.eps^0.5) + if (nLo > 0) message("WARNING: ", nLo, " rows in split data have NA values due to their mid-points residing below the minimum value of '", yVar, "' in pophaz!") + if (nHi > 0) message("NOTE: ", nHi, " rows in split data had values of '", k, "' higher than max of pophaz's '", yVar, "'; the hazard values at '", yVar, "' == ", kHi, " were used for these") + } + rm(mid) + for (k in yByOth) { + levsNotOth <- setdiff(unique(l[[k]]), unique(pophaz[[k]])) + if (length(levsNotOth) > 0) message("WARNING: following levels (first five) of variable '", k, "' not in pophaz but exist in split data: ", paste0("'",levsNotOth[1:5],"'", collapse = ", ")) + } + + + # pohar-perme weighting ---------------------------------------------------- + if (comp_pp) { + setkeyv(l, c("lex.id", "fot")) + comp_pp_weights(l, surv.scale = "fot", breaks = breaks$fot, haz = "pop.haz", + style = "delta", verbose = verbose) + } + merge_msg <- "Time taken by merging pophaz" + if (comp_pp) merge_msg <- paste0(merge_msg, " and computing pp") + merge_msg <- paste0(merge_msg, ": ") + if (verbose) cat(paste0(merge_msg, timetaken(pophaztime), "\n")) + + + } + + # dropping after merging ----------------------------------------------------- + if (drop_after) { + l <- intelliDrop(x = l, breaks = breaks) + } + + if (verbose) cat("Number of rows after splitting: ", nrow(l),"\n") + + + # aggregating if appropriate ------------------------------------------------- + if (agTy != "NULL") { + + setcolsnull(l, keep = c("lex.id","lex.dur", "fot", "per", "age", "lex.Cst", "lex.Xst", agVars, "pop.haz", "pp")) + + sumVars <- NULL + if ("pop.haz" %in% names(l)) { + if ("d.exp" %in% names(l)) stop("data had variable named 'd.exp' by which to aggregate, which would be overwritten due to aggregating expected numbers of cases (you have supplied pophaz AND are aggregating); please rename / remove it first.") + l[, c("d.exp") := pop.haz*lex.dur ] + sumVars <- c(sumVars, "d.exp") + } + if ("pop.haz" %in% names(l) && comp_pp && "pp" %in% names(l)) { + forceLexisDT(l, breaks = breaks, allScales = c("fot", "per", "age")) + ppFigs <- comp_pp_weighted_figures(lex = l, haz = "pop.haz", pp = "pp", event.ind = NULL) + bad_pp_vars <- intersect(names(ppFigs), names(l)) + if (length(bad_pp_vars) > 0L) { + bad_pp_vars <- paste0("'",bad_pp_vars, "'", collapse = ", ") + stop("Data had variable(s) named ", bad_pp_vars, ", by which to aggregate, which would be overwritten due to aggregating expected numbers of cases (you have supplied pophaz AND are aggregating); please rename / remove them first") + } + l[, names(ppFigs) := ppFigs] + sumVars <- c(sumVars, names(ppFigs)) + rm(ppFigs) + + } + + if (verbose) cat("Starting aggregation of split data... \n") + setDT(l) + forceLexisDT(l, allScales = c("fot", "per", "age"), breaks = breaks) + l <- try(aggre(lex = l, by = aggSub, type = aggre.type, verbose = verbose, sum.values = sumVars)) + if (inherits(l, "try-error")) stop("Something went wrong when calling aggre() within lexpand(). Usual suspect: bad 'by' argument. Error message from aggre(): + ", paste0(l[[1]])) + if (verbose) cat("Aggregation done. \n") + + if (!return_DT() && is.data.table(l)) setDFpe(l) + + } else { + + + # last touch-up -------------------------------------------------------------- + ## sometimes problems with releasing memory + gc() + + breaks <- lapply(c("fot","per","age"), function(ts_nm) { + breaks[[ts_nm]] + }) + names(breaks) <- c("fot","per","age") + + ## handle attributes + setkeyv(l, c("lex.id", "lex.multi")) + set(l, j = "lex.multi", value = NULL) + setattr(l, "time.scales", c("fot","per","age")) + setattr(l, "time.since", c("","","")) + setattr(l, "breaks", breaks) + setattr(l, "class", c("Lexis","data.table","data.frame")) + if (!return_DT() && is.data.table(l)) setDFpe(l) + + + + } + + if (verbose) cat("Time taken by lexpand(): ", timetaken(start_time), "\n") + + return(l[]) +} + + +globalVariables(c('.EACHI', "dg_date", "ex_date", "bi_date")) diff --git a/R/lexpand2.R b/R/lexpand2.R index e06813a..c0b0420 100644 --- a/R/lexpand2.R +++ b/R/lexpand2.R @@ -1,139 +1,139 @@ - -lexpand2 <- function(data, - birth = NULL, - entry = NULL, - exit = NULL, - status = NULL, - entry.status = NULL, - id = NULL, - breaks = NULL, - drop = TRUE, - pophaz = NULL, - pp = TRUE, - subset = NULL, - merge = TRUE, - aggre = NULL, - aggre.type = "cartesian", - verbose = FALSE, - ...) { - # @param data a data set - # @param merge passed on to \code{\link{Lexis_fpa}} - # @param aggre passed on to \code{\link{aggre}} - # @param aggre.type passed on to \code{\link{aggre}} - # @param drop passed on to \code{\link{splitMulti}} - # @param breaks passed on to \code{\link{splitMulti}} - # @examples - # - # lex <- lexpand2(sire, - # birth = "bi_date", - # entry = dg_date, - # exit = ex_date + 1L, - # status = "status") - # - - TF <- environment() - PF <- parent.frame(1L) - - ## various checks ------------------------------------------------------------ - - allScales <- c("fot", "per", "age") - reserved <- c(allScales, paste0("lex.", c("dur", "Xst", "Cst", "id"))) - if (!is.null(pophaz)) { - reserved <- c(reserved, "d.exp", "pop.haz") - if (pp) reserved <- c(reserved, "pp") - } - - - subs <- substitute(subset) - subset <- evalLogicalSubset(data, subs, enclos = PF) - - stopifnot(is.logical(verbose)) - stopifnot(is.logical(drop)) - stopifnot(is.logical(merge)) - stopifnot(is.logical(pp)) - - ## create Lexis object ------------------------------------------------------- - - lexCols <- c("birth", "entry", "exit", "status", "entry.status", "id") - al <- lapply(lexCols, function(stri) { - - e <- paste0("substitute(", stri, ", env = TF)") - e <- eval(parse(text = e), envir = TF) - - }) - names(al) <- lexCols - - - al <- lapply(al, function(expr) { - - evalPopArg(data = data, arg = expr, DT = TRUE, enclos = PF, - types = c("NULL", "character", "expression")) - - }) - - - al[sapply(names(al), function(stri) { - stri %in% c("id", "entry.status") && is.null(al[[stri]]) - })] <- NULL - - al <- lapply(al, function(elem) { - if (is.data.frame(elem)) return(elem[[1]]) - elem - }) - - al <- c(al, list(data = data, subset = subset)) - names(al)[names(al) == "status"] <- "exit.status" - - al$merge <- merge - - x <- do.call(Lexis_fpa, args = al) - - setDT(x) - setattr(x, "class", c("Lexis", "data.table", "data.frame")) - - - ## more checks --------------------------------------------------------------- - if (!is.null(breaks)) checkBreaksList(x, breaks = breaks) - pophaz <- data.table(pophaz) - alt_phna <- c("year" = "per", "agegroup" = "age") - lapply(names(alt_phna), function(stri) { - if (stri %in% names(pophaz)) setnames(pophaz, stri, alt_phna[stri]) - }) - if (!is.null(pophaz)) checkPophaz(x, pophaz) - - - ## splitting if needed ------------------------------------------------------- - if (!is.null(breaks)) { - x <- splitMulti(x, breaks = breaks, drop = drop) - breaks <- attr(x, "breaks") - } - - ## merge in pophaz if needed ------------------------------------------------- - if (!is.null(pophaz)) { - setnames(pophaz, "haz", "pop.haz") - - phScales <- intersect(allScales, names(pophaz)) - if (!length(phScales)) phScales <- NULL - x <- cutLowMerge(x, pophaz, by = intersect(names(x), names(pophaz)), - all.x = TRUE, all.y = FALSE, mid.scales = phScales, - old.nums = TRUE) - - - if (pp) { - x <- comp_pp_weights(x, surv.scale = "fot", - breaks = breaks, - haz = "pop.haz") - pp_cols <- comp_pp_weighted_figures(x, haz = "pop.haz", - pp = "pp", - by = "lex.id") - } - - } - - ## aggregate if needed ------------------------------------------------------- - ags <- substitute(aggre) - ags_test <- evalRecursive(ags, TF, baseenv()) - if (!is.null(ags_test$arg)) x <- aggre(x, by = ags) - - x -} + +lexpand2 <- function(data, + birth = NULL, + entry = NULL, + exit = NULL, + status = NULL, + entry.status = NULL, + id = NULL, + breaks = NULL, + drop = TRUE, + pophaz = NULL, + pp = TRUE, + subset = NULL, + merge = TRUE, + aggre = NULL, + aggre.type = "cartesian", + verbose = FALSE, + ...) { + # @param data a data set + # @param merge passed on to \code{\link{Lexis_fpa}} + # @param aggre passed on to \code{\link{aggre}} + # @param aggre.type passed on to \code{\link{aggre}} + # @param drop passed on to \code{\link{splitMulti}} + # @param breaks passed on to \code{\link{splitMulti}} + # @examples + # + # lex <- lexpand2(sire, + # birth = "bi_date", + # entry = dg_date, + # exit = ex_date + 1L, + # status = "status") + # + + TF <- environment() + PF <- parent.frame(1L) + + ## various checks ------------------------------------------------------------ + + allScales <- c("fot", "per", "age") + reserved <- c(allScales, paste0("lex.", c("dur", "Xst", "Cst", "id"))) + if (!is.null(pophaz)) { + reserved <- c(reserved, "d.exp", "pop.haz") + if (pp) reserved <- c(reserved, "pp") + } + + + subs <- substitute(subset) + subset <- evalLogicalSubset(data, subs, enclos = PF) + + stopifnot(is.logical(verbose)) + stopifnot(is.logical(drop)) + stopifnot(is.logical(merge)) + stopifnot(is.logical(pp)) + + ## create Lexis object ------------------------------------------------------- + + lexCols <- c("birth", "entry", "exit", "status", "entry.status", "id") + al <- lapply(lexCols, function(stri) { + + e <- paste0("substitute(", stri, ", env = TF)") + e <- eval(parse(text = e), envir = TF) + + }) + names(al) <- lexCols + + + al <- lapply(al, function(expr) { + + evalPopArg(data = data, arg = expr, DT = TRUE, enclos = PF, + types = c("NULL", "character", "expression")) + + }) + + + al[sapply(names(al), function(stri) { + stri %in% c("id", "entry.status") && is.null(al[[stri]]) + })] <- NULL + + al <- lapply(al, function(elem) { + if (is.data.frame(elem)) return(elem[[1]]) + elem + }) + + al <- c(al, list(data = data, subset = subset)) + names(al)[names(al) == "status"] <- "exit.status" + + al$merge <- merge + + x <- do.call(Lexis_fpa, args = al) + + setDT(x) + setattr(x, "class", c("Lexis", "data.table", "data.frame")) + + + ## more checks --------------------------------------------------------------- + if (!is.null(breaks)) checkBreaksList(x, breaks = breaks) + pophaz <- data.table(pophaz) + alt_phna <- c("year" = "per", "agegroup" = "age") + lapply(names(alt_phna), function(stri) { + if (stri %in% names(pophaz)) setnames(pophaz, stri, alt_phna[stri]) + }) + if (!is.null(pophaz)) checkPophaz(x, pophaz) + + + ## splitting if needed ------------------------------------------------------- + if (!is.null(breaks)) { + x <- splitMulti(x, breaks = breaks, drop = drop) + breaks <- attr(x, "breaks") + } + + ## merge in pophaz if needed ------------------------------------------------- + if (!is.null(pophaz)) { + setnames(pophaz, "haz", "pop.haz") + + phScales <- intersect(allScales, names(pophaz)) + if (!length(phScales)) phScales <- NULL + x <- cutLowMerge(x, pophaz, by = intersect(names(x), names(pophaz)), + all.x = TRUE, all.y = FALSE, mid.scales = phScales, + old.nums = TRUE) + + + if (pp) { + x <- comp_pp_weights(x, surv.scale = "fot", + breaks = breaks, + haz = "pop.haz") + pp_cols <- comp_pp_weighted_figures(x, haz = "pop.haz", + pp = "pp", + by = "lex.id") + } + + } + + ## aggregate if needed ------------------------------------------------------- + ags <- substitute(aggre) + ags_test <- evalRecursive(ags, TF, baseenv()) + if (!is.null(ags_test$arg)) x <- aggre(x, by = ags) + + x +} diff --git a/R/lifetime_function.R b/R/lifetime_function.R index 53e9dcf..95f5c58 100644 --- a/R/lifetime_function.R +++ b/R/lifetime_function.R @@ -1,282 +1,282 @@ -# ##################################################################################################################### -# #Seppa K and Hakulinen T (2009) Mean and median survival times of cancer patients should be corrected for -# #informative censoring. Journal of Clinical Epidemiology 62: 1095-1102. -# # -# #This R script calculates the crude and the bias-reduced estimates of the mean and the median survival times. -# #The script has been tested using R version 2.15.2 for Windows and the Epi package version 1.1.40. -# ##################################################################################################################### -# -# lifetime <- function(patients, age_classes,interval_length,cut_point,end,r, popmort) { # function lifetime -# -# ## INTENTION: only really intended to be tested against since Karri put a lot -# ## of effort into this. -# ## USAGE: E.g. -# # -# # patients <- read.table("some_file.csv") -# # attach(patients) -# # # Variables: -# # # diag_time = time of diagnosis in calender years -# # # diag_age = age at diagnosis in years -# # # d = censoring indicator, d=0 for a censored and d=1 for a deceased individual -# # # t = observed follow-up time in months -# # -# # # Annual survival probabilities of Finnish women stratified by age (1-year age classes: 0-99 years (rows)) and -# # # calender time (1-year caleder periods: 1951-2005 (columns)) -# # popmort <- scan("some_popmort_file") -# # popmort <- matrix(popmort,nrow=100) -# # -# # library(Epi) # Lexis() and splitLexis() functions (included in the Epi package, version 1.1.40) -# # -# # -# # lt <- lifetime( -# # patients = patients, -# # age_classes=c(0,25,45,65,75,100), # Age classes: 0-24, 25-44, 45-64, 65-74, 75-99. -# # interval_length=6, # Length of the interval in months: 6 months. -# # cut_point=c(9,9,9,9,9,9), # Extrapolation is started after 9 years of survival both in the crude analysis (the first coordinate) -# # # and also in the analysis of every age class (the rest of the coordinates). -# # end=100, # Extrapolation is continued for 100 years. -# # r=1, # Constant interval specific relative survival ratio assumed in the extrapolation -# # popmort = popmort -# # # (r<1, if patients have a persistent excess risk of death due to cancer) -# # ) -# # -# # rm(popmort) -# # detach(patients) -# -# diag_age <- evalq(diag_age, envir = patients) -# diag_time <- evalq(diag_time, envir = patients) -# diag_age <- evalq(diag_age, envir = patients) -# -# weights <- NULL -# Results <- matrix(rep(NA,length(age_classes)*4),ncol=length(age_classes)) -# S <- matrix(rep(NA,(min(cut_point)+end)*12/interval_length*length(age_classes)),ncol=length(age_classes)) -# S_star <- matrix(rep(NA,(min(cut_point)+end)*12/interval_length*length(age_classes)),ncol=length(age_classes)) -# var_S <- matrix(rep(NA,(min(cut_point)+end)*12/interval_length*length(age_classes)),ncol=length(age_classes)) -# var_E <- NULL -# -# ################################ -# #Crude and age specific results -# ################################ -# age_c <- NULL -# for(age_c in 1:length(age_classes)) { -# cat("Starting computation using age group ", age_c, -# "/",length(age_classes), "... \n", sep = "") -# if (age_c == 1) { #crude results -# group <- subset(patients, diag_age >= age_classes[age_c] & diag_age < age_classes[length(age_classes)]) -# x <- cut_point[age_c] -# } -# if (age_c > 1) { #age specific results -# group <- subset(patients, diag_age >= age_classes[age_c-1] & diag_age < age_classes[age_c]) -# x <- cut_point[age_c] -# } -# weights[age_c] <- nrow(group) -# -# ##################### -# #Observed life table -# ##################### -# attach(group,warn.conflicts=F) -# -# lexis <- NULL -# lex <- NULL -# lexis <- Lexis(entry=rep(0,weights[age_c]), duration=t, exit.status = d) -# cat("Splitting... \n") -# lex <- splitLexisDT(lexis, breaks = seq(0,240,interval_length), -# timeScale = timeScales(lexis)[1], drop = FALSE) -# # lex <- splitLexis(lexis, breaks = seq(0,240,interval_length)) -# setDF(lex) -# cat("done splitting. \n") -# -# intervals<-ceiling(max(t/interval_length)) -# l <- NULL -# w <- NULL -# l_eff <- NULL -# deaths <- NULL -# p <- NULL -# -# cat("looping over intervals... \n") -# setDT(lex) -# l <- lex[.(entry = (1:(intervals+1)-1)*interval_length), -# .(obs = .N), on = "entry", by = .EACHI]$obs -# deaths <- lex[.(lex.Xst = 1, entry = (1:(intervals)-1)*interval_length), -# .(obs = .N), on = c("lex.Xst","entry"), by = .EACHI]$obs -# # for(j in 1:(intervals+1)) l[j] <- sum(lex$entry == (j-1)*interval_length) -# # for(j in 1:(intervals)) deaths[j] <- sum(lex$entry == (j-1)*interval_length & lex$lex.Xst == 1 ) -# -# w <- -diff(l)-deaths -# l <- l[-(intervals+1)] -# l_eff <- l - w/2 -# p <- 1 - deaths/l_eff -# -# cumul <- cumprod(p) -# -# ################################################### -# #Interval specific expected survival probabilities -# ################################################### -# cat("Interval-specific exp survs... \n") -# p_star <- matrix(rep(NA,(end+x)*12/interval_length*nrow(group)),ncol=(end+x)*12/interval_length) -# for(i in 1:((end+x)*12/interval_length)) { -# rows <- ceiling(diag_age+(interval_length/12)*(i-1)) -# columns <- floor(diag_time+(interval_length/12)*(i-1)-1950) -# rows[rows>nrow(popmort)] <- nrow(popmort) -# columns[columns>ncol(popmort)] <- ncol(popmort) -# p_star[,i] <- popmort[cbind(rows,columns)]^(interval_length/12) -# } -# -# # Cumulative survival proportions of a comparable group in the general population -# s_e <- c(1,apply(apply(p_star,1,cumprod),1,mean)) -# -# ########################################################### -# #Mean and median estimates, if extrapolation is NOT needed -# ########################################################### -# if (l[intervals]==deaths[intervals] & x*12/interval_length>intervals) { -# cat("mean and median estimates (no extrapolation)... \n") -# -# s_p <- c(1,cumul,rep(0,x*12/interval_length-intervals)) -# -# # Mean and median estimates -# E <- (0.5+sum(s_p[-1]))*interval_length/12 -# E_star <- (0.5+sum(s_e[-1],na.rm = T))*interval_length/12 -# -# time <- seq(0,length(s_p)-1,1) -# md <- (max(time[s_p >= 0.5]) + (min(s_p[s_p >= 0.5]) - 0.5)/(min(s_p[s_p >= 0.5]) - max(s_p[s_p < 0.5])))*interval_length/12 -# md_star <- (max(time[s_e >= 0.5]) + (min(s_e[s_e >= 0.5]) - 0.5)/(min(s_e[s_e >= 0.5]) - max(s_e[s_e < 0.5])))*interval_length/12 -# -# S[,age_c] <- c(s_p,rep(0,nrow(S)-length(s_p))) -# S_star[,age_c] <- s_e[1:nrow(S_star)] -# -# Results[,age_c] <- c(E,E_star,md,md_star) -# -# # Variance estimates -# f <- NULL -# for(k in 1:(intervals-2)) { -# f[k] <- c(1,cumul)[k]*( 1 + sum(cumprod(p[(k+1):(intervals-1)])) ) -# } -# f[intervals-1] <- c(1,cumul)[intervals-1] -# f[intervals] <- c(1,cumul)[intervals] -# -# var_p <- p*(1-p)/(l-w/2) -# var_E[age_c] <- sum(f^2*var_p) -# -# var_S[1,age_c] <- 0 -# var_S[2:intervals,age_c] <- cumul[1:(intervals-1)]^2 * cumsum( deaths/(l_eff*(l_eff-deaths)) )[1:(intervals-1)] -# var_S[(intervals+1):nrow(var_S),age_c] <- 0 -# -# } #if extrapolation is NOT needed -# -# ####################################################### -# #Mean and median estimates, if extrapolation is needed -# ####################################################### -# if (x*12/interval_length<=intervals) { -# -# cat("mean and median estimates (using extrapolation)... \n") -# -# # Cumulative survival probabilities for a patient alive at the beginning of (x*12/interval_length+1)th interval -# s_x_star <- r^(seq(1,end*12/interval_length)*interval_length/12) * -# apply(matrix(p_star[lex$lex.id[lex$entry==x*12],(x*12/interval_length+1):((end+x)*12/interval_length)],ncol=end*12/interval_length),1,cumprod) -# -# # Expected survival proportions for the whole patient group alive at the beginning of the (x*12/interval_length+1)th interval -# s_x_star_group <- apply(s_x_star,1,mean) -# -# # Extrapolated cumulative survival proportions for a patient group (extrapolated from x to x + end years) -# s_p <- c(1,cumul[1:(x*12/interval_length)],cumul[(x*12/interval_length)]*s_x_star_group) -# -# # Mean and median estimates -# E <- (0.5+sum(s_p[-1]))*interval_length/12 -# E_star <- (0.5+sum(s_e[-1],na.rm = T))*interval_length/12 -# -# time <- seq(0,nrow(S)-1,1) -# md <- (max(time[s_p >= 0.5]) + (min(s_p[s_p >= 0.5]) - 0.5)/(min(s_p[s_p >= 0.5]) - max(s_p[s_p < 0.5])))*interval_length/12 -# md_star <- (max(time[s_e >= 0.5]) + (min(s_e[s_e >= 0.5]) - 0.5)/(min(s_e[s_e >= 0.5]) - max(s_e[s_e < 0.5])))*interval_length/12 -# -# S[,age_c] <- s_p[1:nrow(S)] -# S_star[,age_c] <- s_e[1:nrow(S_star)] -# -# Results[,age_c] <- c(E,E_star,md,md_star) -# -# # Variance estimates -# f <- NULL -# for(k in 1:(x*12/interval_length-2)) { -# f[k] <- c(1,cumul)[k]*( 1 + sum(cumprod(p[(k+1):(x*12/interval_length-1)])) + prod(p[(k+1):(x*12/interval_length)])*(1+sum(s_x_star_group)) ) -# } -# f[x*12/interval_length-1] <- c(1,cumul)[x*12/interval_length-1]*( 1 + p[x*12/interval_length]*(1+sum(s_x_star_group)) ) -# f[x*12/interval_length] <- c(1,cumul)[x*12/interval_length]*(1+sum(s_x_star_group)) -# -# var_p <- p*(1-p)/(l-w/2) -# var_E[age_c] <- sum(f^2*var_p[1:(x*12/interval_length)]) -# -# var_S[1,age_c] <- 0 -# var_S[2:(x*12/interval_length+1),age_c] <- (cumul^2 * cumsum( deaths/(l_eff*(l_eff-deaths)) ) )[1:(x*12/interval_length)] -# var_S[(x*12/interval_length+2):nrow(var_S),age_c] <- s_x_star_group[-length(s_x_star_group)]^2 * (cumul^2 * cumsum( deaths/(l_eff*(l_eff-deaths)) ) )[x*12/interval_length] -# -# } #if extrapolation is needed -# -# } #for age classes -# -# ######################## -# #Bias-reduced estimates -# ######################## -# -# cat("overall bias-reduced estimates... \n") -# -# # Bias-reduced mean -# E_weighted <- NULL -# for(i in 1:2) { -# E_weighted <- c( E_weighted,weighted.mean(Results[i,2:length(age_classes)],weights[2:length(age_classes)]) ) -# } -# -# # Bias-reduced cumulative survival proportions -# S_weighted <- NULL -# for(i in 1:nrow(S)) { -# S_weighted <- c( S_weighted,weighted.mean(S[i,2:length(age_classes)],weights[2:length(age_classes)]) ) -# } -# -# # Bias-reduced median -# time <- seq(0,nrow(S)-1,1) -# md_bias_reduced <- ( max(time[S_weighted >= 0.5]) + (min(S_weighted[S_weighted >= 0.5]) - 0.5)/ -# (min(S_weighted[S_weighted >= 0.5]) - max(S_weighted[S_weighted < 0.5])) )*interval_length/12 -# -# # Standard errors of the mean -# SE_E <- sqrt( (1/weights[1]^2)*sum( (weights[2:length(age_classes)]^2)*var_E[2:length(age_classes)] ) )*interval_length/12 -# SE_E_crude <- sqrt(var_E[1])*interval_length/12 -# -# # Standard errors and 95% CI's for the cumulative survival proportions -# SE_s_p_ages <- matrix(rep(NA,(min(cut_point)+end)*12/interval_length*(length(age_classes)-1)),ncol=(length(age_classes)-1)) -# for(i in 2:length(age_classes)) { -# SE_s_p_ages[,i-1] <- weights[i]^2*var_S[,i] -# } -# SE_s_p <- sqrt( (1/weights[1]^2)*apply( SE_s_p_ages, 1, sum ) ) -# SE_s_p_crude <- sqrt( var_S[,1] ) -# -# S_l <- S_weighted - qnorm(0.975)*SE_s_p -# S_u <- S_weighted + qnorm(0.975)*SE_s_p -# S_l_crude <- S[,1] - qnorm(0.975)*SE_s_p_crude -# S_u_crude <- S[,1] + qnorm(0.975)*SE_s_p_crude -# -# # 95% CI's for the crude and the bias-reduced mean -# CI_E_crude <- c(Results[1,1]-qnorm(0.975)*SE_E_crude,Results[1,1]+qnorm(0.975)*SE_E_crude) -# CI_E_weighted <- c(E_weighted[1]-qnorm(0.975)*SE_E,E_weighted[1]+qnorm(0.975)*SE_E) -# -# # The first and the last point where the 95% CI of the cumulative survival proportion covers the value of 50% -# CI_md_crude <- c(min(time[S_l_crude < 0.5])*interval_length/12,max(time[S_u_crude > 0.5])*interval_length/12) -# CI_md_bias_reduced <- c(min(time[S_l < 0.5])*interval_length/12,max(time[S_u > 0.5])*interval_length/12) -# -# ######################## -# #Printing the estimates -# ######################## -# -# table <- rbind(t(Results[,2:length(age_classes)]),Results[,1],c(E_weighted,md_bias_reduced,Results[4,1])) -# -# stratification <- NULL -# for(i in 1:(length(age_classes)-1)) { -# stratification <- c(stratification,paste(age_classes[i],"-",age_classes[i+1]-1,sep="")) -# } -# -# list( -# matrix(cbind(round(cbind(table,table[,2]-table[,1],table[,4]-table[,3]),1),round(cbind((table[,2]-table[,1])/table[,2],(table[,4]-table[,3])/table[,4]),2)),ncol=8, -# dimnames=list(c(stratification,"Crude","Bias-reduced"),c("E","E*","Md","Md*","E*-E","Md*-Md","(E*-E)/E*","(Md*-Md)/Md*")) ), -# matrix(round(rbind(CI_E_crude,CI_E_weighted,CI_md_crude,CI_md_bias_reduced),1),ncol=2, -# dimnames=list(c("Crude mean","Bias-reduced mean","Crude median","Bias-reduced median"),c("95% CI","")) ) -# ) -# -# } #function lifetime -# +# ##################################################################################################################### +# #Seppa K and Hakulinen T (2009) Mean and median survival times of cancer patients should be corrected for +# #informative censoring. Journal of Clinical Epidemiology 62: 1095-1102. +# # +# #This R script calculates the crude and the bias-reduced estimates of the mean and the median survival times. +# #The script has been tested using R version 2.15.2 for Windows and the Epi package version 1.1.40. +# ##################################################################################################################### +# +# lifetime <- function(patients, age_classes,interval_length,cut_point,end,r, popmort) { # function lifetime +# +# ## INTENTION: only really intended to be tested against since Karri put a lot +# ## of effort into this. +# ## USAGE: E.g. +# # +# # patients <- read.table("some_file.csv") +# # attach(patients) +# # # Variables: +# # # diag_time = time of diagnosis in calender years +# # # diag_age = age at diagnosis in years +# # # d = censoring indicator, d=0 for a censored and d=1 for a deceased individual +# # # t = observed follow-up time in months +# # +# # # Annual survival probabilities of Finnish women stratified by age (1-year age classes: 0-99 years (rows)) and +# # # calender time (1-year caleder periods: 1951-2005 (columns)) +# # popmort <- scan("some_popmort_file") +# # popmort <- matrix(popmort,nrow=100) +# # +# # library(Epi) # Lexis() and splitLexis() functions (included in the Epi package, version 1.1.40) +# # +# # +# # lt <- lifetime( +# # patients = patients, +# # age_classes=c(0,25,45,65,75,100), # Age classes: 0-24, 25-44, 45-64, 65-74, 75-99. +# # interval_length=6, # Length of the interval in months: 6 months. +# # cut_point=c(9,9,9,9,9,9), # Extrapolation is started after 9 years of survival both in the crude analysis (the first coordinate) +# # # and also in the analysis of every age class (the rest of the coordinates). +# # end=100, # Extrapolation is continued for 100 years. +# # r=1, # Constant interval specific relative survival ratio assumed in the extrapolation +# # popmort = popmort +# # # (r<1, if patients have a persistent excess risk of death due to cancer) +# # ) +# # +# # rm(popmort) +# # detach(patients) +# +# diag_age <- evalq(diag_age, envir = patients) +# diag_time <- evalq(diag_time, envir = patients) +# diag_age <- evalq(diag_age, envir = patients) +# +# weights <- NULL +# Results <- matrix(rep(NA,length(age_classes)*4),ncol=length(age_classes)) +# S <- matrix(rep(NA,(min(cut_point)+end)*12/interval_length*length(age_classes)),ncol=length(age_classes)) +# S_star <- matrix(rep(NA,(min(cut_point)+end)*12/interval_length*length(age_classes)),ncol=length(age_classes)) +# var_S <- matrix(rep(NA,(min(cut_point)+end)*12/interval_length*length(age_classes)),ncol=length(age_classes)) +# var_E <- NULL +# +# ################################ +# #Crude and age specific results +# ################################ +# age_c <- NULL +# for(age_c in 1:length(age_classes)) { +# cat("Starting computation using age group ", age_c, +# "/",length(age_classes), "... \n", sep = "") +# if (age_c == 1) { #crude results +# group <- subset(patients, diag_age >= age_classes[age_c] & diag_age < age_classes[length(age_classes)]) +# x <- cut_point[age_c] +# } +# if (age_c > 1) { #age specific results +# group <- subset(patients, diag_age >= age_classes[age_c-1] & diag_age < age_classes[age_c]) +# x <- cut_point[age_c] +# } +# weights[age_c] <- nrow(group) +# +# ##################### +# #Observed life table +# ##################### +# attach(group,warn.conflicts=F) +# +# lexis <- NULL +# lex <- NULL +# lexis <- Lexis(entry=rep(0,weights[age_c]), duration=t, exit.status = d) +# cat("Splitting... \n") +# lex <- splitLexisDT(lexis, breaks = seq(0,240,interval_length), +# timeScale = timeScales(lexis)[1], drop = FALSE) +# # lex <- splitLexis(lexis, breaks = seq(0,240,interval_length)) +# setDF(lex) +# cat("done splitting. \n") +# +# intervals<-ceiling(max(t/interval_length)) +# l <- NULL +# w <- NULL +# l_eff <- NULL +# deaths <- NULL +# p <- NULL +# +# cat("looping over intervals... \n") +# setDT(lex) +# l <- lex[.(entry = (1:(intervals+1)-1)*interval_length), +# .(obs = .N), on = "entry", by = .EACHI]$obs +# deaths <- lex[.(lex.Xst = 1, entry = (1:(intervals)-1)*interval_length), +# .(obs = .N), on = c("lex.Xst","entry"), by = .EACHI]$obs +# # for(j in 1:(intervals+1)) l[j] <- sum(lex$entry == (j-1)*interval_length) +# # for(j in 1:(intervals)) deaths[j] <- sum(lex$entry == (j-1)*interval_length & lex$lex.Xst == 1 ) +# +# w <- -diff(l)-deaths +# l <- l[-(intervals+1)] +# l_eff <- l - w/2 +# p <- 1 - deaths/l_eff +# +# cumul <- cumprod(p) +# +# ################################################### +# #Interval specific expected survival probabilities +# ################################################### +# cat("Interval-specific exp survs... \n") +# p_star <- matrix(rep(NA,(end+x)*12/interval_length*nrow(group)),ncol=(end+x)*12/interval_length) +# for(i in 1:((end+x)*12/interval_length)) { +# rows <- ceiling(diag_age+(interval_length/12)*(i-1)) +# columns <- floor(diag_time+(interval_length/12)*(i-1)-1950) +# rows[rows>nrow(popmort)] <- nrow(popmort) +# columns[columns>ncol(popmort)] <- ncol(popmort) +# p_star[,i] <- popmort[cbind(rows,columns)]^(interval_length/12) +# } +# +# # Cumulative survival proportions of a comparable group in the general population +# s_e <- c(1,apply(apply(p_star,1,cumprod),1,mean)) +# +# ########################################################### +# #Mean and median estimates, if extrapolation is NOT needed +# ########################################################### +# if (l[intervals]==deaths[intervals] & x*12/interval_length>intervals) { +# cat("mean and median estimates (no extrapolation)... \n") +# +# s_p <- c(1,cumul,rep(0,x*12/interval_length-intervals)) +# +# # Mean and median estimates +# E <- (0.5+sum(s_p[-1]))*interval_length/12 +# E_star <- (0.5+sum(s_e[-1],na.rm = T))*interval_length/12 +# +# time <- seq(0,length(s_p)-1,1) +# md <- (max(time[s_p >= 0.5]) + (min(s_p[s_p >= 0.5]) - 0.5)/(min(s_p[s_p >= 0.5]) - max(s_p[s_p < 0.5])))*interval_length/12 +# md_star <- (max(time[s_e >= 0.5]) + (min(s_e[s_e >= 0.5]) - 0.5)/(min(s_e[s_e >= 0.5]) - max(s_e[s_e < 0.5])))*interval_length/12 +# +# S[,age_c] <- c(s_p,rep(0,nrow(S)-length(s_p))) +# S_star[,age_c] <- s_e[1:nrow(S_star)] +# +# Results[,age_c] <- c(E,E_star,md,md_star) +# +# # Variance estimates +# f <- NULL +# for(k in 1:(intervals-2)) { +# f[k] <- c(1,cumul)[k]*( 1 + sum(cumprod(p[(k+1):(intervals-1)])) ) +# } +# f[intervals-1] <- c(1,cumul)[intervals-1] +# f[intervals] <- c(1,cumul)[intervals] +# +# var_p <- p*(1-p)/(l-w/2) +# var_E[age_c] <- sum(f^2*var_p) +# +# var_S[1,age_c] <- 0 +# var_S[2:intervals,age_c] <- cumul[1:(intervals-1)]^2 * cumsum( deaths/(l_eff*(l_eff-deaths)) )[1:(intervals-1)] +# var_S[(intervals+1):nrow(var_S),age_c] <- 0 +# +# } #if extrapolation is NOT needed +# +# ####################################################### +# #Mean and median estimates, if extrapolation is needed +# ####################################################### +# if (x*12/interval_length<=intervals) { +# +# cat("mean and median estimates (using extrapolation)... \n") +# +# # Cumulative survival probabilities for a patient alive at the beginning of (x*12/interval_length+1)th interval +# s_x_star <- r^(seq(1,end*12/interval_length)*interval_length/12) * +# apply(matrix(p_star[lex$lex.id[lex$entry==x*12],(x*12/interval_length+1):((end+x)*12/interval_length)],ncol=end*12/interval_length),1,cumprod) +# +# # Expected survival proportions for the whole patient group alive at the beginning of the (x*12/interval_length+1)th interval +# s_x_star_group <- apply(s_x_star,1,mean) +# +# # Extrapolated cumulative survival proportions for a patient group (extrapolated from x to x + end years) +# s_p <- c(1,cumul[1:(x*12/interval_length)],cumul[(x*12/interval_length)]*s_x_star_group) +# +# # Mean and median estimates +# E <- (0.5+sum(s_p[-1]))*interval_length/12 +# E_star <- (0.5+sum(s_e[-1],na.rm = T))*interval_length/12 +# +# time <- seq(0,nrow(S)-1,1) +# md <- (max(time[s_p >= 0.5]) + (min(s_p[s_p >= 0.5]) - 0.5)/(min(s_p[s_p >= 0.5]) - max(s_p[s_p < 0.5])))*interval_length/12 +# md_star <- (max(time[s_e >= 0.5]) + (min(s_e[s_e >= 0.5]) - 0.5)/(min(s_e[s_e >= 0.5]) - max(s_e[s_e < 0.5])))*interval_length/12 +# +# S[,age_c] <- s_p[1:nrow(S)] +# S_star[,age_c] <- s_e[1:nrow(S_star)] +# +# Results[,age_c] <- c(E,E_star,md,md_star) +# +# # Variance estimates +# f <- NULL +# for(k in 1:(x*12/interval_length-2)) { +# f[k] <- c(1,cumul)[k]*( 1 + sum(cumprod(p[(k+1):(x*12/interval_length-1)])) + prod(p[(k+1):(x*12/interval_length)])*(1+sum(s_x_star_group)) ) +# } +# f[x*12/interval_length-1] <- c(1,cumul)[x*12/interval_length-1]*( 1 + p[x*12/interval_length]*(1+sum(s_x_star_group)) ) +# f[x*12/interval_length] <- c(1,cumul)[x*12/interval_length]*(1+sum(s_x_star_group)) +# +# var_p <- p*(1-p)/(l-w/2) +# var_E[age_c] <- sum(f^2*var_p[1:(x*12/interval_length)]) +# +# var_S[1,age_c] <- 0 +# var_S[2:(x*12/interval_length+1),age_c] <- (cumul^2 * cumsum( deaths/(l_eff*(l_eff-deaths)) ) )[1:(x*12/interval_length)] +# var_S[(x*12/interval_length+2):nrow(var_S),age_c] <- s_x_star_group[-length(s_x_star_group)]^2 * (cumul^2 * cumsum( deaths/(l_eff*(l_eff-deaths)) ) )[x*12/interval_length] +# +# } #if extrapolation is needed +# +# } #for age classes +# +# ######################## +# #Bias-reduced estimates +# ######################## +# +# cat("overall bias-reduced estimates... \n") +# +# # Bias-reduced mean +# E_weighted <- NULL +# for(i in 1:2) { +# E_weighted <- c( E_weighted,weighted.mean(Results[i,2:length(age_classes)],weights[2:length(age_classes)]) ) +# } +# +# # Bias-reduced cumulative survival proportions +# S_weighted <- NULL +# for(i in 1:nrow(S)) { +# S_weighted <- c( S_weighted,weighted.mean(S[i,2:length(age_classes)],weights[2:length(age_classes)]) ) +# } +# +# # Bias-reduced median +# time <- seq(0,nrow(S)-1,1) +# md_bias_reduced <- ( max(time[S_weighted >= 0.5]) + (min(S_weighted[S_weighted >= 0.5]) - 0.5)/ +# (min(S_weighted[S_weighted >= 0.5]) - max(S_weighted[S_weighted < 0.5])) )*interval_length/12 +# +# # Standard errors of the mean +# SE_E <- sqrt( (1/weights[1]^2)*sum( (weights[2:length(age_classes)]^2)*var_E[2:length(age_classes)] ) )*interval_length/12 +# SE_E_crude <- sqrt(var_E[1])*interval_length/12 +# +# # Standard errors and 95% CI's for the cumulative survival proportions +# SE_s_p_ages <- matrix(rep(NA,(min(cut_point)+end)*12/interval_length*(length(age_classes)-1)),ncol=(length(age_classes)-1)) +# for(i in 2:length(age_classes)) { +# SE_s_p_ages[,i-1] <- weights[i]^2*var_S[,i] +# } +# SE_s_p <- sqrt( (1/weights[1]^2)*apply( SE_s_p_ages, 1, sum ) ) +# SE_s_p_crude <- sqrt( var_S[,1] ) +# +# S_l <- S_weighted - qnorm(0.975)*SE_s_p +# S_u <- S_weighted + qnorm(0.975)*SE_s_p +# S_l_crude <- S[,1] - qnorm(0.975)*SE_s_p_crude +# S_u_crude <- S[,1] + qnorm(0.975)*SE_s_p_crude +# +# # 95% CI's for the crude and the bias-reduced mean +# CI_E_crude <- c(Results[1,1]-qnorm(0.975)*SE_E_crude,Results[1,1]+qnorm(0.975)*SE_E_crude) +# CI_E_weighted <- c(E_weighted[1]-qnorm(0.975)*SE_E,E_weighted[1]+qnorm(0.975)*SE_E) +# +# # The first and the last point where the 95% CI of the cumulative survival proportion covers the value of 50% +# CI_md_crude <- c(min(time[S_l_crude < 0.5])*interval_length/12,max(time[S_u_crude > 0.5])*interval_length/12) +# CI_md_bias_reduced <- c(min(time[S_l < 0.5])*interval_length/12,max(time[S_u > 0.5])*interval_length/12) +# +# ######################## +# #Printing the estimates +# ######################## +# +# table <- rbind(t(Results[,2:length(age_classes)]),Results[,1],c(E_weighted,md_bias_reduced,Results[4,1])) +# +# stratification <- NULL +# for(i in 1:(length(age_classes)-1)) { +# stratification <- c(stratification,paste(age_classes[i],"-",age_classes[i+1]-1,sep="")) +# } +# +# list( +# matrix(cbind(round(cbind(table,table[,2]-table[,1],table[,4]-table[,3]),1),round(cbind((table[,2]-table[,1])/table[,2],(table[,4]-table[,3])/table[,4]),2)),ncol=8, +# dimnames=list(c(stratification,"Crude","Bias-reduced"),c("E","E*","Md","Md*","E*-E","Md*-Md","(E*-E)/E*","(Md*-Md)/Md*")) ), +# matrix(round(rbind(CI_E_crude,CI_E_weighted,CI_md_crude,CI_md_bias_reduced),1),ncol=2, +# dimnames=list(c("Crude mean","Bias-reduced mean","Crude median","Bias-reduced median"),c("95% CI","")) ) +# ) +# +# } #function lifetime +# diff --git a/R/ltable.R b/R/ltable.R index 0647b82..67c1538 100644 --- a/R/ltable.R +++ b/R/ltable.R @@ -1,251 +1,251 @@ -#' @title Tabulate Counts and Other Functions by Multiple Variables into a -#' Long-Format Table -#' @author Joonas Miettinen, Matti Rantanen -#' @description \code{ltable} makes use of \code{data.table} -#' capabilities to tabulate frequencies or -#' arbitrary functions of given variables into a long format -#' \code{data.table}/\code{data.frame}. \code{expr.by.cj} is the -#' equivalent for more advanced users. -#' @param data a \code{data.table}/\code{data.frame} -#' @param by.vars names of variables that are used for categorization, -#' as a character vector, e.g. \code{c('sex','agegroup')} -#' @param expr object or a list of objects where each object is a function -#' of a variable (see: details) -#' @param subset a logical condition; data is limited accordingly before -#' evaluating \code{expr} - but the result of \code{expr} is also -#' returned as \code{NA} for levels not existing in the subset. See Examples. -#' @param use.levels logical; if \code{TRUE}, uses factor levels of given -#' variables if present; if you want e.g. counts for levels -#' that actually have zero observations but are levels in a factor variable, -#' use this -#' @param na.rm logical; if \code{TRUE}, drops rows in table that have -#' \code{NA} as values in any of \code{by.vars} columns -#' @param robust logical; if \code{TRUE}, runs the output data's -#' \code{by.vars} columns through \code{robust_values} before outputting -#' @param .SDcols advanced; a character vector of column names -#' passed to inside the data.table's brackets -#' \code{DT[, , ...]}; see \code{\link{data.table}}; if \code{NULL}, -#' uses all appropriate columns. See Examples for usage. -#' @param enclos advanced; an environment; the enclosing -#' environment of the data. -#' @param ... advanced; other arguments passed to inside the -#' data.table's brackets \code{DT[, , ...]}; see \code{\link{data.table}} -#' -#' @import data.table -#' -#' @details -#' -#' Returns \code{expr} for each unique combination of given \code{by.vars}. -#' -#' By default makes use of any and all \code{\link{levels}} present for -#' each variable in \code{by.vars}. This is useful, -#' because even if a subset of the data does not contain observations -#' for e.g. a specific age group, those age groups are -#' nevertheless presented in the resulting table; e.g. with the default -#' \code{expr = list(obs = .N)} all age group levels -#' are represented by a row and can have \code{obs = 0}. -#' -#' The function differs from the -#' vanilla \code{\link{table}} by giving a long format table of values -#' regardless of the number of \code{by.vars} given. -#' Make use of e.g. \code{\link{cast_simple}} if data needs to be -#' presented in a wide format (e.g. a two-way table). -#' -#' The rows of the long-format table are effectively Cartesian products -#' of the levels of each variable in \code{by.vars}, -#' e.g. with \code{by.vars = c("sex", "area")} all levels of -#' \code{area} are repeated for both levels of \code{sex} -#' in the table. -#' -#' The \code{expr} allows the user to apply any function(s) on all -#' levels defined by \code{by.vars}. Here are some examples: -#' \itemize{ -#' \item .N or list(.N) is a function used inside a \code{data.table} to -#' calculate counts in each group -#' \item list(obs = .N), same as above but user assigned variable name -#' \item list(sum(obs), sum(pyrs), mean(dg_age)), multiple objects in a list -#' \item list(obs = sum(obs), pyrs = sum(pyrs)), same as above with user -#' defined variable names -#' } -#' -#' If \code{use.levels = FALSE}, no \code{levels} information will -#' be used. This means that if e.g. the \code{agegroup} -#' variable is a factor and has 18 levels defined, but only 15 levels -#' are present in the data, no rows for the missing -#' levels will be shown in the table. -#' -#' \code{na.rm} simply drops any rows from the resulting table where -#' any of the \code{by.vars} values was \code{NA}. -#' -#' @seealso -#' \code{\link{table}}, \code{\link{cast_simple}}, \code{\link{melt}} -#' -#' @export ltable -#' -#' @examples -#' data("sire", package = "popEpi") -#' sr <- sire -#' sr$agegroup <- cut(sr$dg_age, breaks=c(0,45,60,75,85,Inf)) -#' ## counts by default -#' ltable(sr, "agegroup") -#' -#' ## any expression can be given -#' ltable(sr, "agegroup", list(mage = mean(dg_age))) -#' ltable(sr, "agegroup", list(mage = mean(dg_age), vage = var(dg_age))) -#' -#' ## also returns levels where there are zero rows (expressions as NA) -#' ltable(sr, "agegroup", list(obs = .N, -#' minage = min(dg_age), -#' maxage = max(dg_age)), -#' subset = dg_age < 85) -#' -#' #### expr.by.cj -#' expr.by.cj(sr, "agegroup") -#' -#' ## any arbitrary expression can be given -#' expr.by.cj(sr, "agegroup", list(mage = mean(dg_age))) -#' expr.by.cj(sr, "agegroup", list(mage = mean(dg_age), vage = var(dg_age))) -#' -#' ## only uses levels of by.vars present in data -#' expr.by.cj(sr, "agegroup", list(mage = mean(dg_age), vage = var(dg_age)), -#' subset = dg_age < 70) -#' -#' ## .SDcols trick -#' expr.by.cj(sr, "agegroup", lapply(.SD, mean), -#' subset = dg_age < 70, .SDcols = c("dg_age", "status")) - -ltable <- function(data, - by.vars = NULL, - expr = list(obs = .N), - subset = NULL, - use.levels = TRUE, - na.rm = FALSE, - robust = TRUE) { - - PF <- parent.frame() - TF <- environment() - - e <- substitute(expr) - - ## eval subset --------------------------------------------------------------- - subset <- substitute(subset) - subset <- evalLogicalSubset(data, subset, enclos = PF) - - ## create table -------------------------------------------------------------- - res <- expr.by.cj(data = data, - by.vars = by.vars, - expr = e, - subset = subset, - use.levels = use.levels, - na.rm = na.rm, - robust = robust) - - - ## final touch --------------------------------------------------------------- - - if (!return_DT()) { - setDFpe(res) - } - res - -} - - - - -#' @describeIn ltable Somewhat more streamlined \code{ltable} with -#' defaults for speed. Explicit determination of enclosing environment -#' of data. -#' @export expr.by.cj - -expr.by.cj <- function(data, - by.vars = NULL, - expr = list(obs = .N), - subset = NULL, - use.levels = FALSE, - na.rm = FALSE, - robust = FALSE, - .SDcols = NULL, - enclos = parent.frame(1L), - ...) { - - PF <- enclos - TF <- environment() - - - ## checks -------------------------------------------------------------------- - if (!is.data.frame(data)) { - stop("Argument 'data' must be data.frame (data.table is fine too)") - } - - stopifnot(is.environment(enclos)) - stopifnot(is.logical(na.rm)) - stopifnot(is.logical(use.levels)) - - stopifnot(is.character(by.vars) || is.null(by.vars)) - all_names_present(data, c(by.vars)) - - stopifnot(is.character(.SDcols) || is.null(.SDcols)) - all_names_present(data, .SDcols) - - tab <- data.table(data[1:min(10, nrow(data)),]) - e <- substitute(expr) - e <- tab[, evalRecursive(e, env = .SD, enc = PF)$argSub] - - ## eval subset --------------------------------------------------------------- - subset <- substitute(subset) - subset <- evalLogicalSubset(data, subset, enclos = PF) - - ## retrieve data to use without taking copy ---------------------------------- - - tabVars <- unique(c(by.vars, all.vars(e), .SDcols)) - tabVars <- intersect(names(data), tabVars) - - tab <- mget(tabVars, envir = as.environment(data)) - setDT(tab) - - tmpDum <- makeTempVarName(data, pre = "dummy_") - if (!length(by.vars)) { - if (!length(tab)) { - ## no by.vars nor variables in expr - tab <- data.table(rep(1L, nrow(data))) - setnames(tab, "V1", tmpDum) - } else { - tab[, c(tmpDum) := 1L] - } - by.vars <- tmpDum - } - - ## create joining table ------------------------------------------------------ - lev_fun <- function(x) { - if (use.levels && is.factor(x)) { - factor(levels(x), levels = levels(x)) - } else { - sort(unique(x), na.last = TRUE) - } - } - cj <- lapply(as.list(tab)[by.vars], lev_fun) - cj <- do.call(CJ, c(cj, unique = FALSE, sorted = FALSE)) - if (na.rm) cj <- na.omit(cj) - - ## eval expression ----------------------------------------------------------- - tabe <- "tab[subset][cj, eval(e), - on = by.vars, - by = .EACHI, ..." - tabe <- if (is.null(.SDcols)) tabe else paste0(tabe, ", .SDcols = .SDcols") - tabe <- paste0(tabe ,"]") - res <- eval(parse(text = tabe)) - - setcolsnull(res, delete = tmpDum, soft = TRUE) - by.vars <- setdiff(by.vars, tmpDum) - - ## final touch --------------------------------------------------------------- - if (length(res)) setcolorder(res, c(by.vars, setdiff(names(res), by.vars))) - if (length(by.vars)) setkeyv(res, by.vars) - if (!return_DT()) { - setDFpe(res) - } - res - -} - +#' @title Tabulate Counts and Other Functions by Multiple Variables into a +#' Long-Format Table +#' @author Joonas Miettinen, Matti Rantanen +#' @description \code{ltable} makes use of \code{data.table} +#' capabilities to tabulate frequencies or +#' arbitrary functions of given variables into a long format +#' \code{data.table}/\code{data.frame}. \code{expr.by.cj} is the +#' equivalent for more advanced users. +#' @param data a \code{data.table}/\code{data.frame} +#' @param by.vars names of variables that are used for categorization, +#' as a character vector, e.g. \code{c('sex','agegroup')} +#' @param expr object or a list of objects where each object is a function +#' of a variable (see: details) +#' @param subset a logical condition; data is limited accordingly before +#' evaluating \code{expr} - but the result of \code{expr} is also +#' returned as \code{NA} for levels not existing in the subset. See Examples. +#' @param use.levels logical; if \code{TRUE}, uses factor levels of given +#' variables if present; if you want e.g. counts for levels +#' that actually have zero observations but are levels in a factor variable, +#' use this +#' @param na.rm logical; if \code{TRUE}, drops rows in table that have +#' \code{NA} as values in any of \code{by.vars} columns +#' @param robust logical; if \code{TRUE}, runs the output data's +#' \code{by.vars} columns through \code{robust_values} before outputting +#' @param .SDcols advanced; a character vector of column names +#' passed to inside the data.table's brackets +#' \code{DT[, , ...]}; see \code{\link{data.table}}; if \code{NULL}, +#' uses all appropriate columns. See Examples for usage. +#' @param enclos advanced; an environment; the enclosing +#' environment of the data. +#' @param ... advanced; other arguments passed to inside the +#' data.table's brackets \code{DT[, , ...]}; see \code{\link{data.table}} +#' +#' @import data.table +#' +#' @details +#' +#' Returns \code{expr} for each unique combination of given \code{by.vars}. +#' +#' By default makes use of any and all \code{\link{levels}} present for +#' each variable in \code{by.vars}. This is useful, +#' because even if a subset of the data does not contain observations +#' for e.g. a specific age group, those age groups are +#' nevertheless presented in the resulting table; e.g. with the default +#' \code{expr = list(obs = .N)} all age group levels +#' are represented by a row and can have \code{obs = 0}. +#' +#' The function differs from the +#' vanilla \code{\link{table}} by giving a long format table of values +#' regardless of the number of \code{by.vars} given. +#' Make use of e.g. \code{\link{cast_simple}} if data needs to be +#' presented in a wide format (e.g. a two-way table). +#' +#' The rows of the long-format table are effectively Cartesian products +#' of the levels of each variable in \code{by.vars}, +#' e.g. with \code{by.vars = c("sex", "area")} all levels of +#' \code{area} are repeated for both levels of \code{sex} +#' in the table. +#' +#' The \code{expr} allows the user to apply any function(s) on all +#' levels defined by \code{by.vars}. Here are some examples: +#' \itemize{ +#' \item .N or list(.N) is a function used inside a \code{data.table} to +#' calculate counts in each group +#' \item list(obs = .N), same as above but user assigned variable name +#' \item list(sum(obs), sum(pyrs), mean(dg_age)), multiple objects in a list +#' \item list(obs = sum(obs), pyrs = sum(pyrs)), same as above with user +#' defined variable names +#' } +#' +#' If \code{use.levels = FALSE}, no \code{levels} information will +#' be used. This means that if e.g. the \code{agegroup} +#' variable is a factor and has 18 levels defined, but only 15 levels +#' are present in the data, no rows for the missing +#' levels will be shown in the table. +#' +#' \code{na.rm} simply drops any rows from the resulting table where +#' any of the \code{by.vars} values was \code{NA}. +#' +#' @seealso +#' \code{\link{table}}, \code{\link{cast_simple}}, \code{\link{melt}} +#' +#' @export ltable +#' +#' @examples +#' data("sire", package = "popEpi") +#' sr <- sire +#' sr$agegroup <- cut(sr$dg_age, breaks=c(0,45,60,75,85,Inf)) +#' ## counts by default +#' ltable(sr, "agegroup") +#' +#' ## any expression can be given +#' ltable(sr, "agegroup", list(mage = mean(dg_age))) +#' ltable(sr, "agegroup", list(mage = mean(dg_age), vage = var(dg_age))) +#' +#' ## also returns levels where there are zero rows (expressions as NA) +#' ltable(sr, "agegroup", list(obs = .N, +#' minage = min(dg_age), +#' maxage = max(dg_age)), +#' subset = dg_age < 85) +#' +#' #### expr.by.cj +#' expr.by.cj(sr, "agegroup") +#' +#' ## any arbitrary expression can be given +#' expr.by.cj(sr, "agegroup", list(mage = mean(dg_age))) +#' expr.by.cj(sr, "agegroup", list(mage = mean(dg_age), vage = var(dg_age))) +#' +#' ## only uses levels of by.vars present in data +#' expr.by.cj(sr, "agegroup", list(mage = mean(dg_age), vage = var(dg_age)), +#' subset = dg_age < 70) +#' +#' ## .SDcols trick +#' expr.by.cj(sr, "agegroup", lapply(.SD, mean), +#' subset = dg_age < 70, .SDcols = c("dg_age", "status")) + +ltable <- function(data, + by.vars = NULL, + expr = list(obs = .N), + subset = NULL, + use.levels = TRUE, + na.rm = FALSE, + robust = TRUE) { + + PF <- parent.frame() + TF <- environment() + + e <- substitute(expr) + + ## eval subset --------------------------------------------------------------- + subset <- substitute(subset) + subset <- evalLogicalSubset(data, subset, enclos = PF) + + ## create table -------------------------------------------------------------- + res <- expr.by.cj(data = data, + by.vars = by.vars, + expr = e, + subset = subset, + use.levels = use.levels, + na.rm = na.rm, + robust = robust) + + + ## final touch --------------------------------------------------------------- + + if (!return_DT()) { + setDFpe(res) + } + res + +} + + + + +#' @describeIn ltable Somewhat more streamlined \code{ltable} with +#' defaults for speed. Explicit determination of enclosing environment +#' of data. +#' @export expr.by.cj + +expr.by.cj <- function(data, + by.vars = NULL, + expr = list(obs = .N), + subset = NULL, + use.levels = FALSE, + na.rm = FALSE, + robust = FALSE, + .SDcols = NULL, + enclos = parent.frame(1L), + ...) { + + PF <- enclos + TF <- environment() + + + ## checks -------------------------------------------------------------------- + if (!is.data.frame(data)) { + stop("Argument 'data' must be data.frame (data.table is fine too)") + } + + stopifnot(is.environment(enclos)) + stopifnot(is.logical(na.rm)) + stopifnot(is.logical(use.levels)) + + stopifnot(is.character(by.vars) || is.null(by.vars)) + all_names_present(data, c(by.vars)) + + stopifnot(is.character(.SDcols) || is.null(.SDcols)) + all_names_present(data, .SDcols) + + tab <- data.table(data[1:min(10, nrow(data)),]) + e <- substitute(expr) + e <- tab[, evalRecursive(e, env = .SD, enc = PF)$argSub] + + ## eval subset --------------------------------------------------------------- + subset <- substitute(subset) + subset <- evalLogicalSubset(data, subset, enclos = PF) + + ## retrieve data to use without taking copy ---------------------------------- + + tabVars <- unique(c(by.vars, all.vars(e), .SDcols)) + tabVars <- intersect(names(data), tabVars) + + tab <- mget(tabVars, envir = as.environment(data)) + setDT(tab) + + tmpDum <- makeTempVarName(data, pre = "dummy_") + if (!length(by.vars)) { + if (!length(tab)) { + ## no by.vars nor variables in expr + tab <- data.table(rep(1L, nrow(data))) + setnames(tab, "V1", tmpDum) + } else { + tab[, c(tmpDum) := 1L] + } + by.vars <- tmpDum + } + + ## create joining table ------------------------------------------------------ + lev_fun <- function(x) { + if (use.levels && is.factor(x)) { + factor(levels(x), levels = levels(x)) + } else { + sort(unique(x), na.last = TRUE) + } + } + cj <- lapply(as.list(tab)[by.vars], lev_fun) + cj <- do.call(CJ, c(cj, unique = FALSE, sorted = FALSE)) + if (na.rm) cj <- na.omit(cj) + + ## eval expression ----------------------------------------------------------- + tabe <- "tab[subset][cj, eval(e), + on = by.vars, + by = .EACHI, ..." + tabe <- if (is.null(.SDcols)) tabe else paste0(tabe, ", .SDcols = .SDcols") + tabe <- paste0(tabe ,"]") + res <- eval(parse(text = tabe)) + + setcolsnull(res, delete = tmpDum, soft = TRUE) + by.vars <- setdiff(by.vars, tmpDum) + + ## final touch --------------------------------------------------------------- + if (length(res)) setcolorder(res, c(by.vars, setdiff(names(res), by.vars))) + if (length(by.vars)) setkeyv(res, by.vars) + if (!return_DT()) { + setDFpe(res) + } + res + +} + diff --git a/R/mean_survival.R b/R/mean_survival.R index 3e20ac1..cc7420c 100644 --- a/R/mean_survival.R +++ b/R/mean_survival.R @@ -1,663 +1,663 @@ - - - -#' @title Compute Mean Survival Times Using Extrapolation -#' @description Computes mean survival times based on survival estimation up to -#' a point in follow-up time (e.g. 10 years), -#' after which survival is extrapolated -#' using an appropriate hazard data file (\code{pophaz}) to yield the "full" -#' survival curve. The area under the full survival curve is the mean survival. -#' @author Joonas Miettinen -#' @param formula a \code{formula}, e.g. \code{FUT ~ V1} or -#' \code{Surv(FUT, lex.Xst) ~ V1}. -#' Supplied in the same way as to \code{\link{survtab}}, see that help -#' for more info. -#' @param data a \code{Lexis} data set; see \code{\link[Epi]{Lexis}}. -#' @param adjust variables to adjust estimates by, e.g. \code{adjust = "agegr"}. -#' \link[=flexible_argument]{Flexible input}. -#' @param weights weights to use to adjust mean survival times. See the -#' \link[=direct_standardization]{dedicated help page} for more details on -#' weighting. \code{survmean} -#' computes curves separately by all variables to adjust by, computes mean -#' survival times, and computes weighted means of the mean survival times. -#' See Examples. -#' @param breaks a list of breaks defining the time window to compute -#' observed survival in, and the intervals used in estimation. E.g. -#' \code{list(FUT = 0:10)} when \code{FUT} is the follow-up time scale in your -#' data. -#' @param pophaz a data set of population hazards passed to -#' \code{\link{survtab}} (see the -#' \link[=pophaz]{dedicated help page} and the help page of -#' \code{survtab} for more information). Defines the -#' population hazard in the time window where observed survival is estimated. -#' @param e1.breaks \code{NULL} or a list of breaks defining the time -#' window to compute -#' \strong{expected} survival in, and the intervals used in estimation. E.g. -#' \code{list(FUT = 0:100)} when \code{FUT} is the follow-up time scale in your -#' data to extrapolate up to 100 years from where the observed survival -#' curve ends. \strong{NOTE:} the breaks on the survival time scale -#' MUST include the breaks supplied to argument \code{breaks}; see Examples. -#' If \code{NULL}, uses decent defaults (maximum follow-up time of 50 years). -#' @param e1.pophaz Same as \code{pophaz}, except this defines the -#' population hazard in the time window where \strong{expected} -#' survival is estimated. By default uses the same data as -#' argument \code{pophaz}. -#' @param r either a numeric multiplier such as \code{0.995}, \code{"auto"}, or -#' \code{"autoX"} where \code{X} is an integer; -#' used to determine the relative survival ratio (RSR) persisting after where -#' the estimated observed survival curve ends. See Details. -#' @param surv.method passed to \code{survtab}; see that help for more info. -#' @param subset a logical condition; e.g. \code{subset = sex == 1}; -#' subsets the data before computations -#' @param verbose \code{logical}; if \code{TRUE}, the function is returns -#' some messages and results along the run, which may be useful in debugging -#' @details -#' \strong{Basics} -#' -#' \code{survmean} computes mean survival times. For median survival times -#' (i.e. where 50 % of subjects have died or met some other event) -#' use \code{\link{survtab}}. -#' -#' The mean survival time is simply the area under the survival curve. -#' However, since full follow-up rarely happens, the observed survival curves -#' are extrapolated using expected survival: E.g. one might compute observed -#' survival till up to 10 years and extrapolate beyond that -#' (till e.g. 50 years) to yield an educated guess on the full observed survival -#' curve. -#' -#' The area is computed by trapezoidal integration of the area under the curve. -#' This function also computes the "full" expected survival curve from -#' T = 0 till e.g. T = 50 depending on supplied arguments. The -#' expected mean survival time is the area under the -#' mean expected survival curve. -#' This function returns the mean expected survival time to be compared with -#' the mean survival time and for computing years of potential life lost (YPLL). -#' -#' Results can be formed by strata and adjusted for e.g. age by using -#' the \code{formula} argument as in \code{survtab}. See also Examples. -#' -#' \strong{Extrapolation tweaks} -#' -#' Argument \code{r} controls the relative survival ratio (RSR) assumed to -#' persist beyond the time window where observed survival is computed -#' (defined by argument \code{breaks}; e.g. up to \code{FUT = 10}). -#' The RSR is simply \code{RSR_i = p_oi / p_ei} for a time interval \code{i}, -#' i.e. the observed divided by the expected -#' (conditional, not cumulative) probability of surviving from the beginning of -#' a time interval till its end. The cumulative product of \code{RSR_i} -#' over time is the (cumulative) relative survival curve. -#' -#' -#' If \code{r} is numeric, e.g. \code{r = 0.995}, that RSR level is assumed -#' to persist beyond the observed survival curve. -#' Numeric \code{r} should be \code{> 0} and expressed at the annual level -#' when using fractional years as the scale of the time variables. -#' E.g. if RSR is known to be \code{0.95} at the month level, then the -#' annualized RSR is \code{0.95^12}. This enables correct usage of the RSR -#' with survival intervals of varying lengths. When using day-level time -#' variables (such as \code{Dates}; see \code{as.Date}), numeric \code{r} -#' should be expressed at the day level, etc. -#' -#' If \code{r = "auto"} or \code{r = "auto1"}, this function computes -#' RSR estimates internally and automatically uses the \code{RSR_i} -#' in the last survival interval in each stratum (and adjusting group) -#' and assumes that to persist beyond the observed survival curve. -#' Automatic determination of \code{r} is a good starting point, -#' but in situations where the RSR estimate is uncertain it may produce poor -#' results. Using \code{"autoX"} such as \code{"auto6"} causes \code{survmean} -#' to use the mean of the estimated RSRs in the last X survival intervals, -#' which may be more stable. -#' Automatic determination will not use values \code{>1} but set them to 1. -#' Visual inspection of the produced curves is always recommended: see -#' Examples. -#' -#' One may also tweak the accuracy and length of extrapolation and -#' expected survival curve computation by using -#' \code{e1.breaks}. By default this is whatever was supplied to \code{breaks} -#' for the survival time scale, to which -#' -#' \code{c(seq(1/12, 1, 1/12), seq(1.2, 1.8, 0.2), 2:19, seq(20, 50, 5))} -#' -#' is added after the maximum value, e.g. with \code{breaks = list(FUT = 0:10)} -#' we have -#' -#' \code{..., 10+1/12, ..., 11, 11.2, ..., 2, 3, ..., 19, 20, 25, ... 50} -#' -#' as the \code{e1.breaks}. Supplying \code{e1.breaks} manually requires -#' the breaks over time survival time scale supplied to argument \code{breaks} -#' to be reiterated in \code{e1.breaks}; see Examples. \strong{NOTE}: the -#' default extrapolation breaks assume the time scales in the data to be -#' expressed as fractional years, meaning this will work extremely poorly -#' when using e.g. day-level time scales (such as \code{Date} variables). -#' Set the extrapolation breaks manually in such cases. -#' -#' @return -#' Returns a \code{data.frame} or \code{data.table} (depending on -#' \code{getOptions("popEpi.datatable")}; see \code{?popEpi}) containing the -#' following columns: -#' \itemize{ -#' \item{est}{: The estimated mean survival time} -#' \item{exp}{: The computed expected survival time} -#' \item{obs}{: Counts of subjects in data} -#' \item{YPLL}{: Years of Potential Life Lost, computed as -#' (\code{(exp-est)*obs}) - though your time data may be in e.g. days, -#' this column will have the same name regardless.} -#' } -#' The returned data also has columns named according to the variables -#' supplied to the right-hand-side of the formula. -#' -#' -#' @examples -#' -#' library(survival) -#' library(Epi) -#' ## take 500 subjects randomly for demonstration -#' data(sire) -#' sire <- sire[sire$dg_date < sire$ex_date, ] -#' set.seed(1L) -#' sire <- sire[sample(x = nrow(sire), size = 500),] -#' -#' ## NOTE: recommended to use factor status variable -#' x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), -#' exit = list(CAL = get.yrs(ex_date)), -#' data = sire, -#' exit.status = factor(status, levels = 0:2, -#' labels = c("alive", "canD", "othD")), -#' merge = TRUE) -#' -#' ## phony variable -#' set.seed(1L) -#' x$group <- rbinom(nrow(x), 1, 0.5) -#' ## age group -#' x$agegr <- cut(x$dg_age, c(0,45,60,Inf), right=FALSE) -#' -#' ## population hazards data set -#' pm <- data.frame(popEpi::popmort) -#' names(pm) <- c("sex", "CAL", "AGE", "haz") -#' -#' ## breaks to define observed survival estimation -#' BL <- list(FUT = seq(0, 10, 1/12)) -#' -#' ## crude mean survival -#' sm1 <- survmean(Surv(FUT, lex.Xst != "alive") ~ 1, -#' pophaz = pm, data = x, weights = NULL, -#' breaks = BL) -#' -#' sm1 <- survmean(FUT ~ 1, -#' pophaz = pm, data = x, weights = NULL, -#' breaks = BL) -#' \dontrun{ -#' ## mean survival by group -#' sm2 <- survmean(FUT ~ group, -#' pophaz = pm, data = x, weights = NULL, -#' breaks = BL) -#' -#' ## ... and adjusted for age using internal weights (counts of subjects) -#' ## note: need also longer extrapolation here so that all curves -#' ## converge to zero in the end. -#' eBL <- list(FUT = c(BL$FUT, 11:75)) -#' sm3 <- survmean(FUT ~ group + adjust(agegr), -#' pophaz = pm, data = x, weights = "internal", -#' breaks = BL, e1.breaks = eBL) -#' } - -#' ## visual inspection of how realistic extrapolation is for each stratum; -#' ## solid lines are observed + extrapolated survivals; -#' ## dashed lines are expected survivals -#' plot(sm1) -#' \dontrun{ -#' ## plotting object with both stratification and standardization -#' ## plots curves for each strata-std.group combination -#' plot(sm3) -#' -#' ## for finer control of plotting these curves, you may extract -#' ## from the survmean object using e.g. -#' attributes(sm3)$survmean.meta$curves -#' -#' -#' #### using Dates -#' -#' x <- Lexis(entry = list(FUT = 0L, AGE = dg_date-bi_date, CAL = dg_date), -#' exit = list(CAL = ex_date), -#' data = sire[sire$dg_date < sire$ex_date, ], -#' exit.status = factor(status, levels = 0:2, -#' labels = c("alive", "canD", "othD")), -#' merge = TRUE) -#' ## phony group variable -#' set.seed(1L) -#' x$group <- rbinom(nrow(x), 1, 0.5) -#' -#' -#' ## NOTE: population hazard should be reported at the same scale -#' ## as time variables in your Lexis data. -#' data(popmort, package = "popEpi") -#' pm <- data.frame(popmort) -#' names(pm) <- c("sex", "CAL", "AGE", "haz") -#' ## from year to day level -#' pm$haz <- pm$haz/365.25 -#' pm$CAL <- as.Date(paste0(pm$CAL, "-01-01")) -#' pm$AGE <- pm$AGE*365.25 -#' -#' BL <- list(FUT = seq(0, 8, 1/12)*365.25) -#' eBL <- list(FUT = c(BL$FUT, c(8.25,8.5,9:60)*365.25)) -#' smd <- survmean(FUT ~ group, data = x, -#' pophaz = pm, verbose = TRUE, r = "auto5", -#' breaks = BL, e1.breaks = eBL) -#' plot(smd) -#' } -#' - -#' -#' @export -#' @family survmean functions -#' @family main functions -#' - -survmean <- function(formula, data, adjust = NULL, weights = NULL, - breaks=NULL, pophaz = NULL, - e1.breaks = NULL, e1.pophaz = pophaz, r = "auto", - surv.method = "hazard", subset = NULL, verbose = FALSE) { - pt <- proc.time() - TF__ <- environment() - PF__ <- parent.frame(1L) - - attr_form <- copy(formula) - - surv.method <- match.arg(surv.method, c("hazard", "lifetable")) - - if(!requireNamespace("survival")) { - stop("Need package 'survival' to proceed") - } - - ## appease R CMD CHECK (due to using vars in DT[] only) - r.e2 <- last.p.e2 <- surv <- survmean_type <- est <- Tstart <- Tstop <- - lex.id <- surv.int <- delta <- surv.exp <- obs <- NULL - - checkLexisData(data, check.breaks = FALSE) - checkPophaz(data, pophaz, haz.name = "haz") - checkPophaz(data, e1.pophaz, haz.name = "haz") - pophaz <- setDT(copy(pophaz)) - e1.pophaz <- setDT(copy(e1.pophaz)) - - if (is.numeric(r) && r < 0L) stop("numeric r must be > 0, e.g. r = 0.95") - if (is.character(r)) { - if (substr(r, 1, 4) != "auto") { - stop("character string r must start with 'auto'; e.g. `auto` and ", - "`auto5` are accepted.") - } - if (r == "auto") r <- "auto1" - - auto_ints <- regmatches(r, regexec("\\d+", text = r)) - auto_ints <- as.integer(auto_ints) - r <- "auto" - } - - tscales_all <- attr(data, "time.scales") - breaks_old <- attr(data, "breaks") - - - - ## breaks -------------------------------------------------------------------- - - if (!is.null(breaks_old)) checkBreaksList(data, breaks_old) - if (is.null(breaks)) breaks <- breaks_old - - checkBreaksList(data, breaks) - - ## hmm - will later on set breaks on the found survival scale - if (!is.null(e1.breaks)) checkBreaksList(data, e1.breaks) - - ## prep & subset data -------------------------------------------------------- - subset <- substitute(subset) - subset <- evalLogicalSubset(data, subset) - - x <- setDT(data[subset, ]) - forceLexisDT(x, breaks = breaks_old, allScales = tscales_all) - - ## ensure variables to merge pophaz datas by are kept ------------------------ - ## NOTE: temp var names avoid conflicts down the line - avoid <- unique(c(names(data), names(x), names(pophaz), names(e1.pophaz))) - - pophaz_vars <- c(names(pophaz), names(e1.pophaz)) - pophaz_vars <- setdiff(pophaz_vars, c(tscales_all, "haz")) - pophaz_vars <- intersect(pophaz_vars, names(x)) - pophaz_vars_tmp <- makeTempVarName(names = avoid, pre = pophaz_vars) - if (!length(pophaz_vars)) { - pophaz_vars_tmp <- NULL - } else { - pophaz_vars_wh <- which(pophaz_vars %in% names(pophaz)) - if (sum(pophaz_vars_wh)) { - setnames(pophaz, old = pophaz_vars[pophaz_vars_wh], - new = pophaz_vars_tmp[pophaz_vars_wh]) - } - pophaz_vars_wh <- which(pophaz_vars %in% names(e1.pophaz)) - if (sum(pophaz_vars_wh)) { - setnames(e1.pophaz, old = pophaz_vars[pophaz_vars_wh], - new = pophaz_vars_tmp[pophaz_vars_wh]) - } - x[, (pophaz_vars_tmp) := copy(.SD), .SDcols = pophaz_vars] - } - - ## determine printing & adjusting vars --------------------------------------- - adSub <- substitute(adjust) - foList <- usePopFormula(formula, adjust = adSub, data = x, enclos = PF__, - Surv.response = "either") - - ## will avoid conflicts using temp names for tabulating variables - adjust_vars <- names(foList$adjust) - print_vars <- names(foList$print) - by_vars <- c(print_vars, adjust_vars) - - avoid <- unique(c(names(data), names(x), names(pophaz), names(e1.pophaz))) - adjust_vars_tmp <- makeTempVarName(names = avoid, pre = adjust_vars) - if (!length(adjust_vars)) adjust_vars_tmp <- NULL - avoid <- unique(c(names(data), names(x), names(pophaz), names(e1.pophaz))) - print_vars_tmp <- makeTempVarName(names = avoid, pre = print_vars) - if (!length(print_vars)) print_vars_tmp <- NULL - by_vars_tmp <- c(print_vars_tmp, adjust_vars_tmp) - - - lex_vars <- c("lex.id", tscales_all, "lex.dur", "lex.Cst", "lex.Xst") - setcolsnull(x, keep = c(lex_vars, pophaz_vars_tmp), soft = FALSE) - if (length(adjust_vars) > 0L) x[, (adjust_vars_tmp) := foList$adjust] - if (length(print_vars) > 0L) x[, (print_vars_tmp) := foList$print] - - ## formula for survtab: we estimate survivals by all levels of both - ## print and adjust; adjusting here means computing directly adjusted - ## estimates of the mean survival time, so mean survival times are - ## weighted later on. - - formula <- paste0(deparse(formula[[2L]]), " ~ ") - if (length(c(adjust_vars_tmp, print_vars_tmp)) > 0L) { - formula <- paste0(formula, paste0(c(print_vars_tmp, adjust_vars_tmp), - collapse = " + ")) - } else { - formula <- paste0(formula, "1") - } - formula <- as.formula(formula) - - ## detect survival time scale ------------------------------------------------ - tscale_surv <- detectSurvivalTimeScale(lex = x, values = foList$y$time) - - ## check weights & adjust ---------------------------------------------------- - test_obs <- x[, .(obs=.N), keyby=eval(TF__$by_vars_tmp)] - if (length(by_vars)) setnames(test_obs, by_vars_tmp, by_vars) - if (length(weights) && !length(adjust_vars)) { - weights <- NULL - warning("Replaced weights with NULL due to not supplying variables to ", - "adjust by.") - } - mwDTtest <- makeWeightsDT(test_obs, values = list("obs"), print = print_vars, - adjust = adjust_vars, weights = weights, - internal.weights.values = "obs") - if (length(by_vars)) setnames(test_obs, by_vars, by_vars_tmp) - - ## figure out extrapolation breaks ------------------------------------------- - ## now that the survival time scale is known this can actually be done. - - if (is.null(e1.breaks)) { - e1.breaks <- copy(breaks[tscale_surv]) - addBreaks <- max(e1.breaks[[tscale_surv]]) + - c(seq(0,1,1/12), seq(1.2, 1.8, 0.2), 2:19, seq(20, 50, 5)) - e1.breaks[[tscale_surv]] <- unique(c(e1.breaks[[tscale_surv]], addBreaks)) - - checkBreaksList(x, e1.breaks) - } - if (!tscale_surv %in% names(e1.breaks)) { - stop("The survival time scale must be included in the list of breaks ", - "to extrapolate by ('e1.breaks').") - } - if (!all(breaks[[tscale_surv]] %in% e1.breaks[[tscale_surv]])) { - stop("The vector of breaks in 'breaks' for the survival time scale MUST", - "be a subset of the breaks for the survival time scale in ", - "'e1.breaks'. E.g. the former could be 0:10 and the latter 0:100.") - } - - if (verbose) { - cat("Time taken by prepping data:", timetaken(pt), "\n") - } - - - ## compute observed survivals ------------------------------------------------ - ## NOTE: do not adjust here; adjust in original formula means weighting - ## the mean survival time results. - - st <- survtab(formula, data = x, breaks = breaks, - pophaz = pophaz, - relsurv.method = "e2", - surv.type = "surv.rel", - surv.method = surv.method) - - st_keep_vars <- c(by_vars_tmp, "Tstop", "r.e2", "surv.obs") - all_names_present( - st, st_keep_vars, - msg = paste0("Internal error: expected to have variables ", - "%%VARS%% after computing observed survivals ", - "but didn't. Blame the package maintainer if you ", - "see this.") - ) - setcolsnull(st, keep = st_keep_vars, colorder = TRUE) - setDT(st) - setkeyv(st, c(by_vars_tmp, "Tstop")) - st[, "Tstart" := c(0, Tstop[-.N]), by = eval(by_vars_tmp)] - - ## decumulate for later cumulation - st[, c("r.e2", "surv.obs") := lapply(.SD, function(col) col/c(1, col[-.N])), - by = eval(by_vars_tmp), - .SDcols = c("r.e2", "surv.obs") - ] - - - if (verbose) { - cat("Time taken by estimating relative survival curves:", - timetaken(pt), "\n") - } - - ## compute overall expected survival ----------------------------------------- - ## 1) take only those individuals that were diagnosed in the time window - ## defined by breaks list in argument 'breaks' - pt <- proc.time() - setkeyv(x, c("lex.id", tscale_surv)) - tol <- .Machine$double.eps^0.5 - xe <- unique(x, by = key(x))[x[[tscale_surv]] < TF__$tol, ] ## pick rows with entry to FU - - if (length(breaks) > 1L) { - ## e.g. a period window was defined and we only use subjects - ## entering follow-up in the time window. - breaks_drop_tmp <- setdiff(names(breaks), tscale_surv) - breaks_drop_tmp <- breaks[breaks_drop_tmp] - breaks_drop_tmp <- lapply(breaks_drop_tmp, range) - - expr <- mapply(function(ch, ra) { - paste0("between(", ch, ", ", ra[1], ", ", ra[2] - tol, ", incbounds = TRUE)") - }, ch = names(breaks_drop_tmp), ra = breaks_drop_tmp, SIMPLIFY = FALSE) - - expr <- lapply(expr, function(e) eval(parse(text = e), envir = xe)) - setDT(expr) - expr <- expr[, rowSums(.SD)] == ncol(expr) - xe <- xe[expr, ] - } - - xe <- x[lex.id %in% unique(xe[["lex.id"]])] - forceLexisDT(xe, breaks = breaks_old, allScales = tscales_all, key = FALSE) - - ## 2) compute Ederer I expected survival curves from T = 0 till e.g. T = 100 - e1 <- comp_e1(xe, breaks = e1.breaks, pophaz = e1.pophaz, immortal = TRUE, - survScale = tscale_surv, by = by_vars_tmp, id = "lex.id") - setnames(e1, tscale_surv, "Tstop") - e1[, "Tstart" := c(0, Tstop[-.N]), by = eval(by_vars_tmp)] - e1[, "surv.int" := cut(Tstart, breaks = e1.breaks[[tscale_surv]], - right = FALSE, labels = FALSE)] - e1[, "delta" := Tstop - Tstart] - - ## decumulate for later cumulation - e1[, "surv.exp" := surv.exp/c(1, surv.exp[-.N]), by = eval(by_vars_tmp)] - - if (verbose) { - cat("Time taken by computing overall expected survival curves:", - timetaken(pt), "\n") - } - - ## compute counts of subjects ------------------------------------------------ - ## these correspond to the counts of patients for which expected survival - ## was computed. If observed survival is e.g. a period estimated curve, - ## we only use subjects entering follow-up in the period window. - N_subjects <- xe[!duplicated(lex.id)][, - list(obs=.N), - keyby=eval(by_vars_tmp) - ] - - ## combine all estimates into one data set ----------------------------------- - pt <- proc.time() - - st[, "surv.int" := cut(Tstart, breaks = e1.breaks[[tscale_surv]], - right = FALSE, labels = FALSE)] - - x <- merge(e1, st[, .SD, .SDcols = c(by_vars_tmp, "surv.int", "r.e2", "surv.obs")], - by = c(by_vars_tmp,"surv.int"), all = TRUE) - setkeyv(x, c(by_vars_tmp, "surv.int")) - - ## extrapolation RSR definition ---------------------------------------------- - if (is.numeric(r)) { - ## manually given RSR for extrapolated part of the obs.surv curve - ## here it is assumed that r is annualized - set(x, j = "last.p.e2", value = r^x[["delta"]]) - - - } else { - ## add last non-NA values as separate column - - st <- st[, .SD[(.N-TF__$auto_ints+1):.N], by = eval(by_vars_tmp)] - - st[, "delta" := Tstop - Tstart] - st[, "r.e2" := r.e2^(1/delta)] ## "annualized" RSRs - - ## mean annualized RSR in last N intervas by strata - st <- st[, .(last.p.e2 = mean(r.e2)), by = eval(by_vars_tmp)] - st[, "last.p.e2" := pmin(1, last.p.e2)] - if (verbose) { - cat("Using following table of mean RSR estimates", - "(scaled to RSRs applicable to a time interval one", - "unit of time wide, e.g. one year or one day)", - "based on", auto_ints, "interval(s) from the end of the relative", - "survival curve by strata: \n") - prST <- data.table(st) - setnames(prST, c(by_vars_tmp, "last.p.e2"), c(by_vars, "RSR")) - print(prST) - } - - if (length(by_vars_tmp)) { - x <- merge(x, st, by = by_vars_tmp, all = TRUE) - } else { - set(x, j = "last.p.e2", value = st$last.p.e2) - } - x[, "last.p.e2" := last.p.e2^(delta)] ## back to non-annualized RSRs - ## enforce RSR in extrapolated part of observed curve to at most 1 - x[, "last.p.e2" := pmin(last.p.e2, 1)] - } - - x[is.na(r.e2), "r.e2" := last.p.e2] - x[, "surv" := r.e2*surv.exp] - # setnames(x, "surv.obs", "surv") - # x[is.na(surv), "surv" := surv.exp*last.p.e2] - - ## cumulate again - setkeyv(x, c(by_vars_tmp, "surv.int")) - x[, c("surv", "surv.exp") := lapply(.SD, cumprod), - .SDcols = c("surv", "surv.exp"), by = eval(by_vars_tmp)] - - x2 <- copy(x) - x[, "surv.exp" := NULL] - x2[, "surv" := NULL] - setnames(x2, "surv.exp", "surv") - x <- rbind(x, x2) - x[, "survmean_type" := rep(c("est", "exp"), each = nrow(x2))] - - setcolsnull( - x, - keep = c(by_vars_tmp, "survmean_type", - "surv.int", "Tstart", "Tstop", - "delta", "surv", "surv.exp"), - colorder = TRUE - ) - - ## check curve convergence to zero ------------------------------------------- - ## a good integration is based on curves that get very close to - ## zero in the end - mi <- x[, .(surv = round(min(surv),4)*100), - keyby = eval(c(by_vars_tmp, "survmean_type"))] - - if (any(mi$surv > 1)) { - warning("One or several of the curves used to compute mean survival times ", - "or expected mean survival times was > 1 % at the lowest point. ", - "Mean survival estimates may be significantly biased. To avoid ", - "this, supply breaks to 'e1.breaks' which make the curves longer ", - ", e.g. e1.breaks = list(FUT = 0:150) where time scale FUT ", - "is the survival time scale (yours may have a different name).") - } - mi[, "surv" := paste0(formatC(surv, digits = 2, format = "f"), " %")] - mi[, "survmean_type" := factor(survmean_type, c("est", "exp"), - c("Observed", "Expected"))] - setnames(mi, c("survmean_type", "surv"), - c("Obs./Exp. curve", "Lowest value")) - if (length(by_vars)) setnames(mi, by_vars_tmp, by_vars) - if (verbose) { - cat("Lowest points in observed / expected survival curves by strata:\n") - print(mi) - } - - ## integrating by trapezoid areas -------------------------------------------- - ## trapezoid area: WIDTH*(HEIGHT1 + HEIGHT2)/2 - ## so we compute "average interval survivals" for each interval t_i - ## and multiply with interval length. - - setkeyv(x, c(by_vars_tmp, "survmean_type", "Tstop")) - sm <- x[, .(survmean = sum(delta*(surv + c(1, surv[-.N]))/2L)), - keyby = c(by_vars_tmp, "survmean_type")] - - ## cast ---------------------------------------------------------------------- - - sm <- cast_simple(sm, columns = "survmean_type", - rows = by_vars_tmp, values = "survmean") - - ## add numbers of subjects, compute YPLL ------------------------------------- - setkeyv(sm, by_vars_tmp); setkeyv(N_subjects, by_vars_tmp) - sm[, "obs" := N_subjects$obs] - sm[, "YPLL" := (exp-est)*obs] - - - ## adjusting ----------------------------------------------------------------- - - sm <- makeWeightsDT(sm, values = list(c("est", "exp", "obs", "YPLL")), - print = print_vars_tmp, adjust = adjust_vars_tmp, - weights = weights, internal.weights.values = "obs") - if (length(adjust_vars)) { - vv <- c("est", "exp", "obs", "YPLL") - sm[, c("est", "exp") := lapply(.SD, function(col) col*sm$weights), - .SDcols = c("est", "exp")] - sm <- sm[, lapply(.SD, sum), .SDcols = vv, by = eval(print_vars_tmp)] - } - - if (verbose) { - cat("Time taken by final touches:", timetaken(pt), "\n") - } - - ## final touch --------------------------------------------------------------- - if (length(print_vars)) setnames(sm, print_vars_tmp, print_vars) - - at <- list(call = match.call(), - formula = attr_form, - print = print_vars, - adjust = adjust_vars, - tprint = print_vars_tmp, - tadjust = adjust_vars_tmp, - breaks = breaks, - e1.breaks = e1.breaks, - survScale = tscale_surv, - curves = copy(x)) - setattr(sm, "class", c("survmean","data.table", "data.frame")) - setattr(sm, "survmean.meta", at) - if (!return_DT()) setDFpe(sm) - return(sm[]) -} - + + + +#' @title Compute Mean Survival Times Using Extrapolation +#' @description Computes mean survival times based on survival estimation up to +#' a point in follow-up time (e.g. 10 years), +#' after which survival is extrapolated +#' using an appropriate hazard data file (\code{pophaz}) to yield the "full" +#' survival curve. The area under the full survival curve is the mean survival. +#' @author Joonas Miettinen +#' @param formula a \code{formula}, e.g. \code{FUT ~ V1} or +#' \code{Surv(FUT, lex.Xst) ~ V1}. +#' Supplied in the same way as to \code{\link{survtab}}, see that help +#' for more info. +#' @param data a \code{Lexis} data set; see \code{\link[Epi]{Lexis}}. +#' @param adjust variables to adjust estimates by, e.g. \code{adjust = "agegr"}. +#' \link[=flexible_argument]{Flexible input}. +#' @param weights weights to use to adjust mean survival times. See the +#' \link[=direct_standardization]{dedicated help page} for more details on +#' weighting. \code{survmean} +#' computes curves separately by all variables to adjust by, computes mean +#' survival times, and computes weighted means of the mean survival times. +#' See Examples. +#' @param breaks a list of breaks defining the time window to compute +#' observed survival in, and the intervals used in estimation. E.g. +#' \code{list(FUT = 0:10)} when \code{FUT} is the follow-up time scale in your +#' data. +#' @param pophaz a data set of population hazards passed to +#' \code{\link{survtab}} (see the +#' \link[=pophaz]{dedicated help page} and the help page of +#' \code{survtab} for more information). Defines the +#' population hazard in the time window where observed survival is estimated. +#' @param e1.breaks \code{NULL} or a list of breaks defining the time +#' window to compute +#' \strong{expected} survival in, and the intervals used in estimation. E.g. +#' \code{list(FUT = 0:100)} when \code{FUT} is the follow-up time scale in your +#' data to extrapolate up to 100 years from where the observed survival +#' curve ends. \strong{NOTE:} the breaks on the survival time scale +#' MUST include the breaks supplied to argument \code{breaks}; see Examples. +#' If \code{NULL}, uses decent defaults (maximum follow-up time of 50 years). +#' @param e1.pophaz Same as \code{pophaz}, except this defines the +#' population hazard in the time window where \strong{expected} +#' survival is estimated. By default uses the same data as +#' argument \code{pophaz}. +#' @param r either a numeric multiplier such as \code{0.995}, \code{"auto"}, or +#' \code{"autoX"} where \code{X} is an integer; +#' used to determine the relative survival ratio (RSR) persisting after where +#' the estimated observed survival curve ends. See Details. +#' @param surv.method passed to \code{survtab}; see that help for more info. +#' @param subset a logical condition; e.g. \code{subset = sex == 1}; +#' subsets the data before computations +#' @param verbose \code{logical}; if \code{TRUE}, the function is returns +#' some messages and results along the run, which may be useful in debugging +#' @details +#' \strong{Basics} +#' +#' \code{survmean} computes mean survival times. For median survival times +#' (i.e. where 50 % of subjects have died or met some other event) +#' use \code{\link{survtab}}. +#' +#' The mean survival time is simply the area under the survival curve. +#' However, since full follow-up rarely happens, the observed survival curves +#' are extrapolated using expected survival: E.g. one might compute observed +#' survival till up to 10 years and extrapolate beyond that +#' (till e.g. 50 years) to yield an educated guess on the full observed survival +#' curve. +#' +#' The area is computed by trapezoidal integration of the area under the curve. +#' This function also computes the "full" expected survival curve from +#' T = 0 till e.g. T = 50 depending on supplied arguments. The +#' expected mean survival time is the area under the +#' mean expected survival curve. +#' This function returns the mean expected survival time to be compared with +#' the mean survival time and for computing years of potential life lost (YPLL). +#' +#' Results can be formed by strata and adjusted for e.g. age by using +#' the \code{formula} argument as in \code{survtab}. See also Examples. +#' +#' \strong{Extrapolation tweaks} +#' +#' Argument \code{r} controls the relative survival ratio (RSR) assumed to +#' persist beyond the time window where observed survival is computed +#' (defined by argument \code{breaks}; e.g. up to \code{FUT = 10}). +#' The RSR is simply \code{RSR_i = p_oi / p_ei} for a time interval \code{i}, +#' i.e. the observed divided by the expected +#' (conditional, not cumulative) probability of surviving from the beginning of +#' a time interval till its end. The cumulative product of \code{RSR_i} +#' over time is the (cumulative) relative survival curve. +#' +#' +#' If \code{r} is numeric, e.g. \code{r = 0.995}, that RSR level is assumed +#' to persist beyond the observed survival curve. +#' Numeric \code{r} should be \code{> 0} and expressed at the annual level +#' when using fractional years as the scale of the time variables. +#' E.g. if RSR is known to be \code{0.95} at the month level, then the +#' annualized RSR is \code{0.95^12}. This enables correct usage of the RSR +#' with survival intervals of varying lengths. When using day-level time +#' variables (such as \code{Dates}; see \code{as.Date}), numeric \code{r} +#' should be expressed at the day level, etc. +#' +#' If \code{r = "auto"} or \code{r = "auto1"}, this function computes +#' RSR estimates internally and automatically uses the \code{RSR_i} +#' in the last survival interval in each stratum (and adjusting group) +#' and assumes that to persist beyond the observed survival curve. +#' Automatic determination of \code{r} is a good starting point, +#' but in situations where the RSR estimate is uncertain it may produce poor +#' results. Using \code{"autoX"} such as \code{"auto6"} causes \code{survmean} +#' to use the mean of the estimated RSRs in the last X survival intervals, +#' which may be more stable. +#' Automatic determination will not use values \code{>1} but set them to 1. +#' Visual inspection of the produced curves is always recommended: see +#' Examples. +#' +#' One may also tweak the accuracy and length of extrapolation and +#' expected survival curve computation by using +#' \code{e1.breaks}. By default this is whatever was supplied to \code{breaks} +#' for the survival time scale, to which +#' +#' \code{c(seq(1/12, 1, 1/12), seq(1.2, 1.8, 0.2), 2:19, seq(20, 50, 5))} +#' +#' is added after the maximum value, e.g. with \code{breaks = list(FUT = 0:10)} +#' we have +#' +#' \code{..., 10+1/12, ..., 11, 11.2, ..., 2, 3, ..., 19, 20, 25, ... 50} +#' +#' as the \code{e1.breaks}. Supplying \code{e1.breaks} manually requires +#' the breaks over time survival time scale supplied to argument \code{breaks} +#' to be reiterated in \code{e1.breaks}; see Examples. \strong{NOTE}: the +#' default extrapolation breaks assume the time scales in the data to be +#' expressed as fractional years, meaning this will work extremely poorly +#' when using e.g. day-level time scales (such as \code{Date} variables). +#' Set the extrapolation breaks manually in such cases. +#' +#' @return +#' Returns a \code{data.frame} or \code{data.table} (depending on +#' \code{getOptions("popEpi.datatable")}; see \code{?popEpi}) containing the +#' following columns: +#' \itemize{ +#' \item{est}{: The estimated mean survival time} +#' \item{exp}{: The computed expected survival time} +#' \item{obs}{: Counts of subjects in data} +#' \item{YPLL}{: Years of Potential Life Lost, computed as +#' (\code{(exp-est)*obs}) - though your time data may be in e.g. days, +#' this column will have the same name regardless.} +#' } +#' The returned data also has columns named according to the variables +#' supplied to the right-hand-side of the formula. +#' +#' +#' @examples +#' +#' library(survival) +#' library(Epi) +#' ## take 500 subjects randomly for demonstration +#' data(sire) +#' sire <- sire[sire$dg_date < sire$ex_date, ] +#' set.seed(1L) +#' sire <- sire[sample(x = nrow(sire), size = 500),] +#' +#' ## NOTE: recommended to use factor status variable +#' x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), +#' exit = list(CAL = get.yrs(ex_date)), +#' data = sire, +#' exit.status = factor(status, levels = 0:2, +#' labels = c("alive", "canD", "othD")), +#' merge = TRUE) +#' +#' ## phony variable +#' set.seed(1L) +#' x$group <- rbinom(nrow(x), 1, 0.5) +#' ## age group +#' x$agegr <- cut(x$dg_age, c(0,45,60,Inf), right=FALSE) +#' +#' ## population hazards data set +#' pm <- data.frame(popEpi::popmort) +#' names(pm) <- c("sex", "CAL", "AGE", "haz") +#' +#' ## breaks to define observed survival estimation +#' BL <- list(FUT = seq(0, 10, 1/12)) +#' +#' ## crude mean survival +#' sm1 <- survmean(Surv(FUT, lex.Xst != "alive") ~ 1, +#' pophaz = pm, data = x, weights = NULL, +#' breaks = BL) +#' +#' sm1 <- survmean(FUT ~ 1, +#' pophaz = pm, data = x, weights = NULL, +#' breaks = BL) +#' \dontrun{ +#' ## mean survival by group +#' sm2 <- survmean(FUT ~ group, +#' pophaz = pm, data = x, weights = NULL, +#' breaks = BL) +#' +#' ## ... and adjusted for age using internal weights (counts of subjects) +#' ## note: need also longer extrapolation here so that all curves +#' ## converge to zero in the end. +#' eBL <- list(FUT = c(BL$FUT, 11:75)) +#' sm3 <- survmean(FUT ~ group + adjust(agegr), +#' pophaz = pm, data = x, weights = "internal", +#' breaks = BL, e1.breaks = eBL) +#' } + +#' ## visual inspection of how realistic extrapolation is for each stratum; +#' ## solid lines are observed + extrapolated survivals; +#' ## dashed lines are expected survivals +#' plot(sm1) +#' \dontrun{ +#' ## plotting object with both stratification and standardization +#' ## plots curves for each strata-std.group combination +#' plot(sm3) +#' +#' ## for finer control of plotting these curves, you may extract +#' ## from the survmean object using e.g. +#' attributes(sm3)$survmean.meta$curves +#' +#' +#' #### using Dates +#' +#' x <- Lexis(entry = list(FUT = 0L, AGE = dg_date-bi_date, CAL = dg_date), +#' exit = list(CAL = ex_date), +#' data = sire[sire$dg_date < sire$ex_date, ], +#' exit.status = factor(status, levels = 0:2, +#' labels = c("alive", "canD", "othD")), +#' merge = TRUE) +#' ## phony group variable +#' set.seed(1L) +#' x$group <- rbinom(nrow(x), 1, 0.5) +#' +#' +#' ## NOTE: population hazard should be reported at the same scale +#' ## as time variables in your Lexis data. +#' data(popmort, package = "popEpi") +#' pm <- data.frame(popmort) +#' names(pm) <- c("sex", "CAL", "AGE", "haz") +#' ## from year to day level +#' pm$haz <- pm$haz/365.25 +#' pm$CAL <- as.Date(paste0(pm$CAL, "-01-01")) +#' pm$AGE <- pm$AGE*365.25 +#' +#' BL <- list(FUT = seq(0, 8, 1/12)*365.25) +#' eBL <- list(FUT = c(BL$FUT, c(8.25,8.5,9:60)*365.25)) +#' smd <- survmean(FUT ~ group, data = x, +#' pophaz = pm, verbose = TRUE, r = "auto5", +#' breaks = BL, e1.breaks = eBL) +#' plot(smd) +#' } +#' + +#' +#' @export +#' @family survmean functions +#' @family main functions +#' + +survmean <- function(formula, data, adjust = NULL, weights = NULL, + breaks=NULL, pophaz = NULL, + e1.breaks = NULL, e1.pophaz = pophaz, r = "auto", + surv.method = "hazard", subset = NULL, verbose = FALSE) { + pt <- proc.time() + TF__ <- environment() + PF__ <- parent.frame(1L) + + attr_form <- copy(formula) + + surv.method <- match.arg(surv.method, c("hazard", "lifetable")) + + if(!requireNamespace("survival")) { + stop("Need package 'survival' to proceed") + } + + ## appease R CMD CHECK (due to using vars in DT[] only) + r.e2 <- last.p.e2 <- surv <- survmean_type <- est <- Tstart <- Tstop <- + lex.id <- surv.int <- delta <- surv.exp <- obs <- NULL + + checkLexisData(data, check.breaks = FALSE) + checkPophaz(data, pophaz, haz.name = "haz") + checkPophaz(data, e1.pophaz, haz.name = "haz") + pophaz <- setDT(copy(pophaz)) + e1.pophaz <- setDT(copy(e1.pophaz)) + + if (is.numeric(r) && r < 0L) stop("numeric r must be > 0, e.g. r = 0.95") + if (is.character(r)) { + if (substr(r, 1, 4) != "auto") { + stop("character string r must start with 'auto'; e.g. `auto` and ", + "`auto5` are accepted.") + } + if (r == "auto") r <- "auto1" + + auto_ints <- regmatches(r, regexec("\\d+", text = r)) + auto_ints <- as.integer(auto_ints) + r <- "auto" + } + + tscales_all <- attr(data, "time.scales") + breaks_old <- attr(data, "breaks") + + + + ## breaks -------------------------------------------------------------------- + + if (!is.null(breaks_old)) checkBreaksList(data, breaks_old) + if (is.null(breaks)) breaks <- breaks_old + + checkBreaksList(data, breaks) + + ## hmm - will later on set breaks on the found survival scale + if (!is.null(e1.breaks)) checkBreaksList(data, e1.breaks) + + ## prep & subset data -------------------------------------------------------- + subset <- substitute(subset) + subset <- evalLogicalSubset(data, subset) + + x <- setDT(data[subset, ]) + forceLexisDT(x, breaks = breaks_old, allScales = tscales_all) + + ## ensure variables to merge pophaz datas by are kept ------------------------ + ## NOTE: temp var names avoid conflicts down the line + avoid <- unique(c(names(data), names(x), names(pophaz), names(e1.pophaz))) + + pophaz_vars <- c(names(pophaz), names(e1.pophaz)) + pophaz_vars <- setdiff(pophaz_vars, c(tscales_all, "haz")) + pophaz_vars <- intersect(pophaz_vars, names(x)) + pophaz_vars_tmp <- makeTempVarName(names = avoid, pre = pophaz_vars) + if (!length(pophaz_vars)) { + pophaz_vars_tmp <- NULL + } else { + pophaz_vars_wh <- which(pophaz_vars %in% names(pophaz)) + if (sum(pophaz_vars_wh)) { + setnames(pophaz, old = pophaz_vars[pophaz_vars_wh], + new = pophaz_vars_tmp[pophaz_vars_wh]) + } + pophaz_vars_wh <- which(pophaz_vars %in% names(e1.pophaz)) + if (sum(pophaz_vars_wh)) { + setnames(e1.pophaz, old = pophaz_vars[pophaz_vars_wh], + new = pophaz_vars_tmp[pophaz_vars_wh]) + } + x[, (pophaz_vars_tmp) := copy(.SD), .SDcols = pophaz_vars] + } + + ## determine printing & adjusting vars --------------------------------------- + adSub <- substitute(adjust) + foList <- usePopFormula(formula, adjust = adSub, data = x, enclos = PF__, + Surv.response = "either") + + ## will avoid conflicts using temp names for tabulating variables + adjust_vars <- names(foList$adjust) + print_vars <- names(foList$print) + by_vars <- c(print_vars, adjust_vars) + + avoid <- unique(c(names(data), names(x), names(pophaz), names(e1.pophaz))) + adjust_vars_tmp <- makeTempVarName(names = avoid, pre = adjust_vars) + if (!length(adjust_vars)) adjust_vars_tmp <- NULL + avoid <- unique(c(names(data), names(x), names(pophaz), names(e1.pophaz))) + print_vars_tmp <- makeTempVarName(names = avoid, pre = print_vars) + if (!length(print_vars)) print_vars_tmp <- NULL + by_vars_tmp <- c(print_vars_tmp, adjust_vars_tmp) + + + lex_vars <- c("lex.id", tscales_all, "lex.dur", "lex.Cst", "lex.Xst") + setcolsnull(x, keep = c(lex_vars, pophaz_vars_tmp), soft = FALSE) + if (length(adjust_vars) > 0L) x[, (adjust_vars_tmp) := foList$adjust] + if (length(print_vars) > 0L) x[, (print_vars_tmp) := foList$print] + + ## formula for survtab: we estimate survivals by all levels of both + ## print and adjust; adjusting here means computing directly adjusted + ## estimates of the mean survival time, so mean survival times are + ## weighted later on. + + formula <- paste0(deparse(formula[[2L]]), " ~ ") + if (length(c(adjust_vars_tmp, print_vars_tmp)) > 0L) { + formula <- paste0(formula, paste0(c(print_vars_tmp, adjust_vars_tmp), + collapse = " + ")) + } else { + formula <- paste0(formula, "1") + } + formula <- as.formula(formula) + + ## detect survival time scale ------------------------------------------------ + tscale_surv <- detectSurvivalTimeScale(lex = x, values = foList$y$time) + + ## check weights & adjust ---------------------------------------------------- + test_obs <- x[, .(obs=.N), keyby=eval(TF__$by_vars_tmp)] + if (length(by_vars)) setnames(test_obs, by_vars_tmp, by_vars) + if (length(weights) && !length(adjust_vars)) { + weights <- NULL + warning("Replaced weights with NULL due to not supplying variables to ", + "adjust by.") + } + mwDTtest <- makeWeightsDT(test_obs, values = list("obs"), print = print_vars, + adjust = adjust_vars, weights = weights, + internal.weights.values = "obs") + if (length(by_vars)) setnames(test_obs, by_vars, by_vars_tmp) + + ## figure out extrapolation breaks ------------------------------------------- + ## now that the survival time scale is known this can actually be done. + + if (is.null(e1.breaks)) { + e1.breaks <- copy(breaks[tscale_surv]) + addBreaks <- max(e1.breaks[[tscale_surv]]) + + c(seq(0,1,1/12), seq(1.2, 1.8, 0.2), 2:19, seq(20, 50, 5)) + e1.breaks[[tscale_surv]] <- unique(c(e1.breaks[[tscale_surv]], addBreaks)) + + checkBreaksList(x, e1.breaks) + } + if (!tscale_surv %in% names(e1.breaks)) { + stop("The survival time scale must be included in the list of breaks ", + "to extrapolate by ('e1.breaks').") + } + if (!all(breaks[[tscale_surv]] %in% e1.breaks[[tscale_surv]])) { + stop("The vector of breaks in 'breaks' for the survival time scale MUST", + "be a subset of the breaks for the survival time scale in ", + "'e1.breaks'. E.g. the former could be 0:10 and the latter 0:100.") + } + + if (verbose) { + cat("Time taken by prepping data:", timetaken(pt), "\n") + } + + + ## compute observed survivals ------------------------------------------------ + ## NOTE: do not adjust here; adjust in original formula means weighting + ## the mean survival time results. + + st <- survtab(formula, data = x, breaks = breaks, + pophaz = pophaz, + relsurv.method = "e2", + surv.type = "surv.rel", + surv.method = surv.method) + + st_keep_vars <- c(by_vars_tmp, "Tstop", "r.e2", "surv.obs") + all_names_present( + st, st_keep_vars, + msg = paste0("Internal error: expected to have variables ", + "%%VARS%% after computing observed survivals ", + "but didn't. Blame the package maintainer if you ", + "see this.") + ) + setcolsnull(st, keep = st_keep_vars, colorder = TRUE) + setDT(st) + setkeyv(st, c(by_vars_tmp, "Tstop")) + st[, "Tstart" := c(0, Tstop[-.N]), by = eval(by_vars_tmp)] + + ## decumulate for later cumulation + st[, c("r.e2", "surv.obs") := lapply(.SD, function(col) col/c(1, col[-.N])), + by = eval(by_vars_tmp), + .SDcols = c("r.e2", "surv.obs") + ] + + + if (verbose) { + cat("Time taken by estimating relative survival curves:", + timetaken(pt), "\n") + } + + ## compute overall expected survival ----------------------------------------- + ## 1) take only those individuals that were diagnosed in the time window + ## defined by breaks list in argument 'breaks' + pt <- proc.time() + setkeyv(x, c("lex.id", tscale_surv)) + tol <- .Machine$double.eps^0.5 + xe <- unique(x, by = key(x))[x[[tscale_surv]] < TF__$tol, ] ## pick rows with entry to FU + + if (length(breaks) > 1L) { + ## e.g. a period window was defined and we only use subjects + ## entering follow-up in the time window. + breaks_drop_tmp <- setdiff(names(breaks), tscale_surv) + breaks_drop_tmp <- breaks[breaks_drop_tmp] + breaks_drop_tmp <- lapply(breaks_drop_tmp, range) + + expr <- mapply(function(ch, ra) { + paste0("between(", ch, ", ", ra[1], ", ", ra[2] - tol, ", incbounds = TRUE)") + }, ch = names(breaks_drop_tmp), ra = breaks_drop_tmp, SIMPLIFY = FALSE) + + expr <- lapply(expr, function(e) eval(parse(text = e), envir = xe)) + setDT(expr) + expr <- expr[, rowSums(.SD)] == ncol(expr) + xe <- xe[expr, ] + } + + xe <- x[lex.id %in% unique(xe[["lex.id"]])] + forceLexisDT(xe, breaks = breaks_old, allScales = tscales_all, key = FALSE) + + ## 2) compute Ederer I expected survival curves from T = 0 till e.g. T = 100 + e1 <- comp_e1(xe, breaks = e1.breaks, pophaz = e1.pophaz, immortal = TRUE, + survScale = tscale_surv, by = by_vars_tmp, id = "lex.id") + setnames(e1, tscale_surv, "Tstop") + e1[, "Tstart" := c(0, Tstop[-.N]), by = eval(by_vars_tmp)] + e1[, "surv.int" := cut(Tstart, breaks = e1.breaks[[tscale_surv]], + right = FALSE, labels = FALSE)] + e1[, "delta" := Tstop - Tstart] + + ## decumulate for later cumulation + e1[, "surv.exp" := surv.exp/c(1, surv.exp[-.N]), by = eval(by_vars_tmp)] + + if (verbose) { + cat("Time taken by computing overall expected survival curves:", + timetaken(pt), "\n") + } + + ## compute counts of subjects ------------------------------------------------ + ## these correspond to the counts of patients for which expected survival + ## was computed. If observed survival is e.g. a period estimated curve, + ## we only use subjects entering follow-up in the period window. + N_subjects <- xe[!duplicated(lex.id)][, + list(obs=.N), + keyby=eval(by_vars_tmp) + ] + + ## combine all estimates into one data set ----------------------------------- + pt <- proc.time() + + st[, "surv.int" := cut(Tstart, breaks = e1.breaks[[tscale_surv]], + right = FALSE, labels = FALSE)] + + x <- merge(e1, st[, .SD, .SDcols = c(by_vars_tmp, "surv.int", "r.e2", "surv.obs")], + by = c(by_vars_tmp,"surv.int"), all = TRUE) + setkeyv(x, c(by_vars_tmp, "surv.int")) + + ## extrapolation RSR definition ---------------------------------------------- + if (is.numeric(r)) { + ## manually given RSR for extrapolated part of the obs.surv curve + ## here it is assumed that r is annualized + set(x, j = "last.p.e2", value = r^x[["delta"]]) + + + } else { + ## add last non-NA values as separate column + + st <- st[, .SD[(.N-TF__$auto_ints+1):.N], by = eval(by_vars_tmp)] + + st[, "delta" := Tstop - Tstart] + st[, "r.e2" := r.e2^(1/delta)] ## "annualized" RSRs + + ## mean annualized RSR in last N intervas by strata + st <- st[, .(last.p.e2 = mean(r.e2)), by = eval(by_vars_tmp)] + st[, "last.p.e2" := pmin(1, last.p.e2)] + if (verbose) { + cat("Using following table of mean RSR estimates", + "(scaled to RSRs applicable to a time interval one", + "unit of time wide, e.g. one year or one day)", + "based on", auto_ints, "interval(s) from the end of the relative", + "survival curve by strata: \n") + prST <- data.table(st) + setnames(prST, c(by_vars_tmp, "last.p.e2"), c(by_vars, "RSR")) + print(prST) + } + + if (length(by_vars_tmp)) { + x <- merge(x, st, by = by_vars_tmp, all = TRUE) + } else { + set(x, j = "last.p.e2", value = st$last.p.e2) + } + x[, "last.p.e2" := last.p.e2^(delta)] ## back to non-annualized RSRs + ## enforce RSR in extrapolated part of observed curve to at most 1 + x[, "last.p.e2" := pmin(last.p.e2, 1)] + } + + x[is.na(r.e2), "r.e2" := last.p.e2] + x[, "surv" := r.e2*surv.exp] + # setnames(x, "surv.obs", "surv") + # x[is.na(surv), "surv" := surv.exp*last.p.e2] + + ## cumulate again + setkeyv(x, c(by_vars_tmp, "surv.int")) + x[, c("surv", "surv.exp") := lapply(.SD, cumprod), + .SDcols = c("surv", "surv.exp"), by = eval(by_vars_tmp)] + + x2 <- copy(x) + x[, "surv.exp" := NULL] + x2[, "surv" := NULL] + setnames(x2, "surv.exp", "surv") + x <- rbind(x, x2) + x[, "survmean_type" := rep(c("est", "exp"), each = nrow(x2))] + + setcolsnull( + x, + keep = c(by_vars_tmp, "survmean_type", + "surv.int", "Tstart", "Tstop", + "delta", "surv", "surv.exp"), + colorder = TRUE + ) + + ## check curve convergence to zero ------------------------------------------- + ## a good integration is based on curves that get very close to + ## zero in the end + mi <- x[, .(surv = round(min(surv),4)*100), + keyby = eval(c(by_vars_tmp, "survmean_type"))] + + if (any(mi$surv > 1)) { + warning("One or several of the curves used to compute mean survival times ", + "or expected mean survival times was > 1 % at the lowest point. ", + "Mean survival estimates may be significantly biased. To avoid ", + "this, supply breaks to 'e1.breaks' which make the curves longer ", + ", e.g. e1.breaks = list(FUT = 0:150) where time scale FUT ", + "is the survival time scale (yours may have a different name).") + } + mi[, "surv" := paste0(formatC(surv, digits = 2, format = "f"), " %")] + mi[, "survmean_type" := factor(survmean_type, c("est", "exp"), + c("Observed", "Expected"))] + setnames(mi, c("survmean_type", "surv"), + c("Obs./Exp. curve", "Lowest value")) + if (length(by_vars)) setnames(mi, by_vars_tmp, by_vars) + if (verbose) { + cat("Lowest points in observed / expected survival curves by strata:\n") + print(mi) + } + + ## integrating by trapezoid areas -------------------------------------------- + ## trapezoid area: WIDTH*(HEIGHT1 + HEIGHT2)/2 + ## so we compute "average interval survivals" for each interval t_i + ## and multiply with interval length. + + setkeyv(x, c(by_vars_tmp, "survmean_type", "Tstop")) + sm <- x[, .(survmean = sum(delta*(surv + c(1, surv[-.N]))/2L)), + keyby = c(by_vars_tmp, "survmean_type")] + + ## cast ---------------------------------------------------------------------- + + sm <- cast_simple(sm, columns = "survmean_type", + rows = by_vars_tmp, values = "survmean") + + ## add numbers of subjects, compute YPLL ------------------------------------- + setkeyv(sm, by_vars_tmp); setkeyv(N_subjects, by_vars_tmp) + sm[, "obs" := N_subjects$obs] + sm[, "YPLL" := (exp-est)*obs] + + + ## adjusting ----------------------------------------------------------------- + + sm <- makeWeightsDT(sm, values = list(c("est", "exp", "obs", "YPLL")), + print = print_vars_tmp, adjust = adjust_vars_tmp, + weights = weights, internal.weights.values = "obs") + if (length(adjust_vars)) { + vv <- c("est", "exp", "obs", "YPLL") + sm[, c("est", "exp") := lapply(.SD, function(col) col*sm$weights), + .SDcols = c("est", "exp")] + sm <- sm[, lapply(.SD, sum), .SDcols = vv, by = eval(print_vars_tmp)] + } + + if (verbose) { + cat("Time taken by final touches:", timetaken(pt), "\n") + } + + ## final touch --------------------------------------------------------------- + if (length(print_vars)) setnames(sm, print_vars_tmp, print_vars) + + at <- list(call = match.call(), + formula = attr_form, + print = print_vars, + adjust = adjust_vars, + tprint = print_vars_tmp, + tadjust = adjust_vars_tmp, + breaks = breaks, + e1.breaks = e1.breaks, + survScale = tscale_surv, + curves = copy(x)) + setattr(sm, "class", c("survmean","data.table", "data.frame")) + setattr(sm, "survmean.meta", at) + if (!return_DT()) setDFpe(sm) + return(sm[]) +} + diff --git a/R/popEpi_package.r b/R/popEpi_package.r index 7964ef3..af98773 100644 --- a/R/popEpi_package.r +++ b/R/popEpi_package.r @@ -1,30 +1,30 @@ -#' popEpi -#' -#' @name popEpi -#' @docType package -#' @title popEpi: Functions for large-scale epidemiological analysis -#' @description -#' \pkg{popEpi} is built for the needs of registry-based (large-scale) -#' epidemiological analysis. This is in most part enabled by the -#' efficient \pkg{data.table} package for handling and aggregating large data sets. -#' -#' \pkg{popEpi} currently supplies some utility functions such as \code{\link{splitMulti}} -#' and \code{\link{get.yrs}} for preparing large data sets for epidemiological analysis. -#' Included are also a a few functions that can be used in -#' epidemiological analysis such as \code{\link{sir}} for estimating -#' standardized incidence/mortality ratios (SIRs/SMRs) and \code{\link{survtab}} for -#' estimating observed and relative/net survival as well as cumulative incidence -#' functions (CIFs). -#' -#' Since there are many benefits to using \code{data.tables}, \pkg{popEpi} returns -#' outputs by default in the \code{data.table} format where appropriate. -#' Since \code{data.table} -#' objects are usually modified by reference, this may have surprising side -#' effects for users uninitiated in using \code{data.table}. To ensure -#' that appropriate outputs are in the \code{data.frame} format, set -#' \code{options("popEpi.datatable" = FALSE)}. However, \code{data.table} -#' usage is recommended due to better performance and testing coverage. -#' \code{data.table} is used -#' by most functions internally in both cases. -#' -NULL +#' popEpi +#' +#' @name popEpi +#' @docType package +#' @title popEpi: Functions for large-scale epidemiological analysis +#' @description +#' \pkg{popEpi} is built for the needs of registry-based (large-scale) +#' epidemiological analysis. This is in most part enabled by the +#' efficient \pkg{data.table} package for handling and aggregating large data sets. +#' +#' \pkg{popEpi} currently supplies some utility functions such as \code{\link{splitMulti}} +#' and \code{\link{get.yrs}} for preparing large data sets for epidemiological analysis. +#' Included are also a a few functions that can be used in +#' epidemiological analysis such as \code{\link{sir}} for estimating +#' standardized incidence/mortality ratios (SIRs/SMRs) and \code{\link{survtab}} for +#' estimating observed and relative/net survival as well as cumulative incidence +#' functions (CIFs). +#' +#' Since there are many benefits to using \code{data.tables}, \pkg{popEpi} returns +#' outputs by default in the \code{data.table} format where appropriate. +#' Since \code{data.table} +#' objects are usually modified by reference, this may have surprising side +#' effects for users uninitiated in using \code{data.table}. To ensure +#' that appropriate outputs are in the \code{data.frame} format, set +#' \code{options("popEpi.datatable" = FALSE)}. However, \code{data.table} +#' usage is recommended due to better performance and testing coverage. +#' \code{data.table} is used +#' by most functions internally in both cases. +#' +NULL diff --git a/R/pophaz.R b/R/pophaz.R index 71f832c..edcde2c 100644 --- a/R/pophaz.R +++ b/R/pophaz.R @@ -1,68 +1,68 @@ - - - -#' @title Expected / Population Hazard Data Sets Usage in \pkg{popEpi} -#' @author Joonas Miettinen -#' @name pophaz -#' @description -#' -#' Several functions in \pkg{popEpi} make use of population or expected -#' hazards in computing the intended estimates (e.g. \code{\link{survtab}}). -#' This document explains using such data sets in this package. -#' -#' @details -#' -#' Population hazard data sets (pophaz for short) in \pkg{popEpi} should -#' be \code{data.frame}s in the "long" format where one of the columns must be -#' named \code{haz} (for hazard), and other columns define the values or -#' levels in variables relating to subjects in your data. For example, -#' \code{\link{popmort}} contains Finnish population mortality hazards -#' by sex, calendar year, and 1-year age group. -#' -#' \tabular{rrrr}{ -#' \code{sex} \tab \code{year} \tab \code{agegroup} \tab \code{haz} \cr -#' 0 \tab 1951 \tab 0 \tab 0.036363176\cr -#' 0 \tab 1951 \tab 1 \tab 0.003616547\cr -#' 0 \tab 1951 \tab 2 \tab 0.002172384\cr -#' 0 \tab 1951 \tab 3 \tab 0.001581249\cr -#' 0 \tab 1951 \tab 4 \tab 0.001180690\cr -#' 0 \tab 1951 \tab 5 \tab 0.001070595 -#' } -#' -#' The names of the columns should match to the names of the variables -#' that you have in your subject-level data. Time variables in your pophaz -#' may also correspond to \code{Lexis} time scales; see -#' \code{\link{survtab}}. -#' -#' Any time variables (as they usually have) should be coded consistently: -#' When using fractional years in your data, the time variables in your pophaz -#' must also be coded in fractional years. When using e.g. \code{Date}s in your -#' data, ensure that the pophaz time variables are coded at the level of days -#' (or \code{Date}s for calendar time). -#' -#' The \code{haz} variable in your pophaz should also be coded consistently -#' with the used time variables. E.g. \code{haz} values in life-tables -#' reported as deaths per person-year should be multiplied by 365.25 when -#' using day-level time variables. -#' -#' If you have your population hazards in a \code{ratetable} object -#' usable by functions in \pkg{survival} and \pkg{relsurv}, you may -#' transform them to long-format \code{data.frame}s using -#' \code{\link{as.data.frame.ratetable}}. Ensure, however, that the -#' created \code{haz} column is coded at the right level (events per -#' days or years typically). -#' -#' National statistical institutions, the WHO, and e.g. the Human -#' Life-Table Database supply life-table data. -#' - -NULL - - - - - - - - - + + + +#' @title Expected / Population Hazard Data Sets Usage in \pkg{popEpi} +#' @author Joonas Miettinen +#' @name pophaz +#' @description +#' +#' Several functions in \pkg{popEpi} make use of population or expected +#' hazards in computing the intended estimates (e.g. \code{\link{survtab}}). +#' This document explains using such data sets in this package. +#' +#' @details +#' +#' Population hazard data sets (pophaz for short) in \pkg{popEpi} should +#' be \code{data.frame}s in the "long" format where one of the columns must be +#' named \code{haz} (for hazard), and other columns define the values or +#' levels in variables relating to subjects in your data. For example, +#' \code{\link{popmort}} contains Finnish population mortality hazards +#' by sex, calendar year, and 1-year age group. +#' +#' \tabular{rrrr}{ +#' \code{sex} \tab \code{year} \tab \code{agegroup} \tab \code{haz} \cr +#' 0 \tab 1951 \tab 0 \tab 0.036363176\cr +#' 0 \tab 1951 \tab 1 \tab 0.003616547\cr +#' 0 \tab 1951 \tab 2 \tab 0.002172384\cr +#' 0 \tab 1951 \tab 3 \tab 0.001581249\cr +#' 0 \tab 1951 \tab 4 \tab 0.001180690\cr +#' 0 \tab 1951 \tab 5 \tab 0.001070595 +#' } +#' +#' The names of the columns should match to the names of the variables +#' that you have in your subject-level data. Time variables in your pophaz +#' may also correspond to \code{Lexis} time scales; see +#' \code{\link{survtab}}. +#' +#' Any time variables (as they usually have) should be coded consistently: +#' When using fractional years in your data, the time variables in your pophaz +#' must also be coded in fractional years. When using e.g. \code{Date}s in your +#' data, ensure that the pophaz time variables are coded at the level of days +#' (or \code{Date}s for calendar time). +#' +#' The \code{haz} variable in your pophaz should also be coded consistently +#' with the used time variables. E.g. \code{haz} values in life-tables +#' reported as deaths per person-year should be multiplied by 365.25 when +#' using day-level time variables. +#' +#' If you have your population hazards in a \code{ratetable} object +#' usable by functions in \pkg{survival} and \pkg{relsurv}, you may +#' transform them to long-format \code{data.frame}s using +#' \code{\link{as.data.frame.ratetable}}. Ensure, however, that the +#' created \code{haz} column is coded at the right level (events per +#' days or years typically). +#' +#' National statistical institutions, the WHO, and e.g. the Human +#' Life-Table Database supply life-table data. +#' + +NULL + + + + + + + + + diff --git a/R/prevalence.R b/R/prevalence.R index 89a9b64..61a5a4b 100644 --- a/R/prevalence.R +++ b/R/prevalence.R @@ -1,141 +1,141 @@ - - - - -prevtab <- function( - formula, - data, - meanpop = NULL, - breaks = NULL, - adjust = NULL, - weights = NULL, - subset = NULL, - verbose = FALSE -) { - - PF <- parent.frame(1L) - TF <- environment() - - checkLexisData(data) - checkPophaz(data, meanpop, haz.name = "meanpop") - meanpop <- data.table(meanpop) - - allScales <- attr(data, "time.scales") - oldBreaks <- attr(data, "breaks") - - lexis_vars <- c(allScales, "lex.dur", "lex.id", "lex.Cst", "lex.Xst") - lexis_vars <- intersect(names(data), lexis_vars) - - meanpop_vars <- setdiff(names(meanpop), "meanpop") - - print_vars <- print_vars_tmp <- NULL - adjust_vars <- adjust_vars_tmp <- NULL - - all_vars <- unique(intersect(names(data), c(lexis_vars, print_vars, adjust_vars, meanpop_vars))) - - ## appease R CMD CHECK ------------------------------------------------------- - at.risk <- NULL - - - ## subsetting ---------------------------------------------------------------- - sb <- substitute(subset) - subset <- evalLogicalSubset(data, substiset = sb, enclos = PF) - - ## data with only time scales and by variables ------------------------------- - - data = data[subset, ] - by_data <- usePopFormula(formula, Surv.response = FALSE, - data = data, enclos = PF) - x <- setDT(mget(lexis_vars, as.environment(data))) - if (!is.null(by_data[["print"]])) { - print_vars <- names(by_data[["print"]]) - print_vars_tmp <- makeTempVarName(names = all_vars, pre = print_vars) - set(x, j = print_vars_tmp, value = by_data[["print"]]) - } - if (!is.null(by_data[["adjust"]])) { - adjust_vars <- names(by_data[["adjust"]]) - adjust_vars_tmp <- makeTempVarName(names = all_vars, pre = adjust_vars) - set(x, j = adjust_vars_tmp, value = by_data[["adjust"]]) - } - forceLexisDT(x, breaks = oldBreaks, allScales = allScales) - rm("by_data") - x[, c("lex.Cst", "lex.Xst") := 0L] - - by_vars <- c(print_vars, adjust_vars) - by_vars_tmp <- c(print_vars_tmp, adjust_vars_tmp) - - meanpop_vars_tmp <- makeTempVarName( - names = c(names(x), names(data), names(meanpop)), pre = meanpop_vars - ) - meanpop_vars_tmp <- unlist(lapply(seq_along(meanpop_vars), function(i) { - if (!meanpop_vars[i] %in% by_vars) { - return(meanpop_vars_tmp[i]) - } - wh <- which(by_vars == meanpop_vars[i]) - by_vars_tmp[wh] - })) - - lapply(seq_along(meanpop_vars), function(i) { - set(x, j = meanpop_vars_tmp[i], value = data[[meanpop_vars[i]]]) - }) - - ## Splitting to ensure breaks exist; also takes copy ------------------------- - x <- splitMulti(x, breaks = breaks, drop = TRUE, merge = TRUE) - forceLexisDT(x, breaks = attr(x, "breaks"), allScales = allScales) - - newBreaks <- copy(attr(x, "breaks")) - - - ## detect prevalence time scale ---------------------------------------------- - prevScale <- detectSurvivalTimeScale(data, eval(formula[[2]], envir = data)) - - - ## limit to prevalence time points ------------------------------------------- - ## since we want prevalence at certain points of time along the prevalence - ## time scale, prevScale values not at the breaks are not considered at all. - j <- list(newBreaks[[prevScale]]) - names(j) <- prevScale - x <- x[j, on = prevScale, nomatch = 0L] - - ## aggregate ----------------------------------------------------------------- - aggre_vars <- unique(c(print_vars_tmp, adjust_vars_tmp, meanpop_vars_tmp)) - print(aggre_vars) - ag <- aggre(x, by = aggre_vars) - ag <- setDT(ag) - - setkeyv( - ag, c(aggre_vars, setdiff(aggre_vars, intersect(lexis_vars, print_vars_tmp))) - ) - ag[, "n" := cumsum(at.risk), by = eval(aggre_vars)] - ag <- setDT(mget(c(aggre_vars, meanpop_vars_tmp, "n"), as.environment(ag))) - - ## compute prevalence rates if appropriate ----------------------------------- - print(ag) - ag <- merge(ag, meanpop, by = meanpop_vars_tmp) - - if (length(c(print_vars_tmp, adjust_vars_tmp))) { - setnames(ag, c(print_vars_tmp, adjust_vars_tmp), - c(print_vars, adjust_vars_tmp)) - } - ag <- rate(data = ag, obs = "n", pyrs = "meanpop", print = print_vars, - adjust = adjust_vars, weights = weights) - - return(ag) -} - - - - -prevtab_ag <- function( - formula, - data, - meanpop = NULL, - adjust = NULL, - weights = NULL, - subset = NULL, - verbose = FALSE -) { - ## prevtab(per ~ sex + fot) - - -} + + + + +prevtab <- function( + formula, + data, + meanpop = NULL, + breaks = NULL, + adjust = NULL, + weights = NULL, + subset = NULL, + verbose = FALSE +) { + + PF <- parent.frame(1L) + TF <- environment() + + checkLexisData(data) + checkPophaz(data, meanpop, haz.name = "meanpop") + meanpop <- data.table(meanpop) + + allScales <- attr(data, "time.scales") + oldBreaks <- attr(data, "breaks") + + lexis_vars <- c(allScales, "lex.dur", "lex.id", "lex.Cst", "lex.Xst") + lexis_vars <- intersect(names(data), lexis_vars) + + meanpop_vars <- setdiff(names(meanpop), "meanpop") + + print_vars <- print_vars_tmp <- NULL + adjust_vars <- adjust_vars_tmp <- NULL + + all_vars <- unique(intersect(names(data), c(lexis_vars, print_vars, adjust_vars, meanpop_vars))) + + ## appease R CMD CHECK ------------------------------------------------------- + at.risk <- NULL + + + ## subsetting ---------------------------------------------------------------- + sb <- substitute(subset) + subset <- evalLogicalSubset(data, substiset = sb, enclos = PF) + + ## data with only time scales and by variables ------------------------------- + + data = data[subset, ] + by_data <- usePopFormula(formula, Surv.response = FALSE, + data = data, enclos = PF) + x <- setDT(mget(lexis_vars, as.environment(data))) + if (!is.null(by_data[["print"]])) { + print_vars <- names(by_data[["print"]]) + print_vars_tmp <- makeTempVarName(names = all_vars, pre = print_vars) + set(x, j = print_vars_tmp, value = by_data[["print"]]) + } + if (!is.null(by_data[["adjust"]])) { + adjust_vars <- names(by_data[["adjust"]]) + adjust_vars_tmp <- makeTempVarName(names = all_vars, pre = adjust_vars) + set(x, j = adjust_vars_tmp, value = by_data[["adjust"]]) + } + forceLexisDT(x, breaks = oldBreaks, allScales = allScales) + rm("by_data") + x[, c("lex.Cst", "lex.Xst") := 0L] + + by_vars <- c(print_vars, adjust_vars) + by_vars_tmp <- c(print_vars_tmp, adjust_vars_tmp) + + meanpop_vars_tmp <- makeTempVarName( + names = c(names(x), names(data), names(meanpop)), pre = meanpop_vars + ) + meanpop_vars_tmp <- unlist(lapply(seq_along(meanpop_vars), function(i) { + if (!meanpop_vars[i] %in% by_vars) { + return(meanpop_vars_tmp[i]) + } + wh <- which(by_vars == meanpop_vars[i]) + by_vars_tmp[wh] + })) + + lapply(seq_along(meanpop_vars), function(i) { + set(x, j = meanpop_vars_tmp[i], value = data[[meanpop_vars[i]]]) + }) + + ## Splitting to ensure breaks exist; also takes copy ------------------------- + x <- splitMulti(x, breaks = breaks, drop = TRUE, merge = TRUE) + forceLexisDT(x, breaks = attr(x, "breaks"), allScales = allScales) + + newBreaks <- copy(attr(x, "breaks")) + + + ## detect prevalence time scale ---------------------------------------------- + prevScale <- detectSurvivalTimeScale(data, eval(formula[[2]], envir = data)) + + + ## limit to prevalence time points ------------------------------------------- + ## since we want prevalence at certain points of time along the prevalence + ## time scale, prevScale values not at the breaks are not considered at all. + j <- list(newBreaks[[prevScale]]) + names(j) <- prevScale + x <- x[j, on = prevScale, nomatch = 0L] + + ## aggregate ----------------------------------------------------------------- + aggre_vars <- unique(c(print_vars_tmp, adjust_vars_tmp, meanpop_vars_tmp)) + print(aggre_vars) + ag <- aggre(x, by = aggre_vars) + ag <- setDT(ag) + + setkeyv( + ag, c(aggre_vars, setdiff(aggre_vars, intersect(lexis_vars, print_vars_tmp))) + ) + ag[, "n" := cumsum(at.risk), by = eval(aggre_vars)] + ag <- setDT(mget(c(aggre_vars, meanpop_vars_tmp, "n"), as.environment(ag))) + + ## compute prevalence rates if appropriate ----------------------------------- + print(ag) + ag <- merge(ag, meanpop, by = meanpop_vars_tmp) + + if (length(c(print_vars_tmp, adjust_vars_tmp))) { + setnames(ag, c(print_vars_tmp, adjust_vars_tmp), + c(print_vars, adjust_vars_tmp)) + } + ag <- rate(data = ag, obs = "n", pyrs = "meanpop", print = print_vars, + adjust = adjust_vars, weights = weights) + + return(ag) +} + + + + +prevtab_ag <- function( + formula, + data, + meanpop = NULL, + adjust = NULL, + weights = NULL, + subset = NULL, + verbose = FALSE +) { + ## prevtab(per ~ sex + fot) + + +} diff --git a/R/relative_poisson.R b/R/relative_poisson.R index de0ca6a..f58abdd 100644 --- a/R/relative_poisson.R +++ b/R/relative_poisson.R @@ -1,610 +1,610 @@ -#' @title Excess hazard Poisson model -#' @author Joonas Miettinen, Karri Seppa -#' @description Estimate a Poisson piecewise constant excess -#' hazards model -#' @param data a dataset split with e.g. \code{\link{lexpand}}; -#' must have expected hazard merged within -#' @param formula a formula which is passed on to \code{glm}; see Details -#' @param fot.breaks optional; a numeric vector of [a,b) breaks to specify -#' survival intervals over the follow-up time; if \code{NULL}, the -#' existing breaks along the mandatory \code{fot} time scale in \code{data} -#' are used (e.g. the breaks for \code{fot} supplied to \code{lexpand}) -#' @param subset a logical vector or condition; e.g. \code{subset = sex == 1}; -#' limits the data before estimation -#' @param check logical; if \code{TRUE}, tabulates excess cases by all -#' factor variables in the formula to check for negative / \code{NA} -#' excess cases before fitting the GLM -#' @param ... any argument passed on to \code{glm} -#' @import stats -#' @details -#' -#' \strong{Basics} -#' -#' \code{relpois} employs a custom link function of the Poisson variety -#' to estimate piecewise constant parametric excess hazards. The pieces -#' are determined by \code{fot.breaks}. A \code{log(person-years)} offset -#' is passed automatically to the \code{glm} call. -#' -#' \strong{Formula usage} -#' -#' The formula can be used like any ordinary \code{glm} formula. The user must -#' define the outcome in some manner, which is usually \code{lex.Xst} after splitting -#' with e.g. \code{lexpand}. The exception is the possibility of including -#' the baseline excess hazard terms by including the -#' reserved term \code{FOT} in the formula. -#' -#' For example, \code{lex.Xst != 0 ~ FOT + agegr} estimates a model with constant -#' excess hazards at the follow-up intervals as specified by -#' the pertinent breaks used in splitting \code{data}, -#' as well as for the different age groups. -#' \code{FOT} is created ad hoc if it is used in the formula. -#' If you leave out \code{FOT}, the hazard is effectively -#' assumed to be constant across the whole follow-up time. -#' -#' You can also simply use your own follow-up time interval variable that -#' you have created before calling \code{relpois}. However, when using -#' \code{FOT}, \code{relpois} automatically checks for e.g. -#' negative excess cases in follow-up intervals, -#' allowing for quickly finding splitting breaks -#' where model estimation is possible. It also drops any data outside the -#' follow-up time window. -#' -#' \strong{Splitting and merging population hazard} -#' -#' The easiest way to both split and to include population hazard information is -#' by using \code{\link{lexpand}}. You may also fairly easily do it by hand -#' by splitting first and then merging in your population hazard information. -#' -#' -#' \strong{Data requirements} -#' -#' The population hazard information must be available for each record and named -#' \code{pop.haz}. The follow-up time variable must be named \code{"fot"} e.g. -#' as a result of using \code{lexpand}. The \code{lex.dur} variable must also -#' be present, containing person-year information. -#' -#' -#' @return -#' A \code{glm} object created using a custom Poisson family construct. Some -#' \code{glm} methods are applicable. -#' -#' @seealso -#' \code{\link{lexpand}}, \code{\link{poisson}}, \code{\link{glm}} -#' @family main functions -#' @family relpois functions -#' @export relpois -#' -#' @examples -#' ## use the simulated rectal cancer cohort -#' data("sire", package = "popEpi") -#' sire$agegr <- cut(sire$dg_age, c(0,45,60,Inf), right=FALSE) -#' -#' ## usable straight away after splitting -#' fb <- c(0,3/12,6/12,1,2,3,4,5) -#' x <- lexpand(sire, birth = bi_date, entry = dg_date, -#' exit = ex_date, status=status, -#' breaks = list(fot=fb), pophaz=popmort) -#' rpm <- relpois(x, formula = lex.Xst %in% 1:2 ~ FOT + agegr) -#' -#' ## some methods for glm work. e.g. test for interaction -#' \dontrun{ -#' rpm2 <- relpois(x, formula = lex.Xst %in% 1:2 ~ FOT*agegr) -#' anova(rpm, rpm2, test="LRT") -#' AIC(rpm, rpm2) -#' ## update won't work currently -#' } - - -relpois <- function(data, - formula, - fot.breaks = NULL, subset = NULL, check=TRUE, ...) { - ## R CMD CHECK appeasement - lex.dur <- NULL - - ## prep arguments ------------------------------------------------------------ - excess_cases <- fot <- pop.haz <- NULL ## appease R CMD CHECK - - ## somehow the class of the data is being altered by this function - oldClass <- class(data) - - if (missing(formula) || !inherits(formula, "formula")) stop("formula not defined") - - form_vars <- all.vars(formula) - dataname <- as.name(deparse(substitute(data))) - - if (!inherits(data, "Lexis")) { - stop("data is not a Lexis object; data must be a result of splitting or using Lexis") - } - - if ("FOT" %in% names(data)) { - stop("FOT is a reserved name but you have a variable with that name in data; rename/delete it first") - } - - # wasDF <- FALSE - if (!is.data.table(data)) { - data <- copy(data) - setDT(data) - message("Took a copy of your data because it was a data.frame and not a data.table. This may take up a lot of memory.") - message("It is recommended to convert your data to data.table before using this function using as.data.table or setDT") - } - - req_vars <- unique(c("lex.id", "fot", "lex.dur", "pop.haz", setdiff(form_vars, "FOT"))) - all_names_present(data, req_vars) - - surv.breaks <- attr(data, "breaks")$fot - if (is.null(surv.breaks)) { - stop("did not find any breaks information in data attributes named 'fot'; - probable reason: split data was edited after splitting - ", - "don't do that") - } else { - if (!is.null(fot.breaks)) { - if (any(!fot.breaks %in% surv.breaks)) { - stop("fot.breaks must be a subset of the breaks for 'fot' used in splitting; - type attr(data, 'breaks')$fot to see the breaks you used in splitting") - } else { - surv.breaks <- fot.breaks - } - } - - } - - ## prep & subset data -------------------------------------------------------- - subset <- substitute(subset) - subset <- evalLogicalSubset(data, subset) - - if (any(is.na(data[subset, ]$pop.haz))) { - stop("some pop.haz are NA") - } - - on.exit({ - setcolsnull(data, c("FOT", tmpdexp), soft = TRUE) - }, add = TRUE) - - if ("FOT" %in% form_vars) { - data[, "FOT" := cut(fot, breaks = surv.breaks, right = FALSE)] - # set(data, j = "FOT", value = cut(data$fot, breaks = surv.breaks, right=FALSE)) - subset <- subset & !is.na(data$FOT) - } - tmpdexp <- makeTempVarName(data, pre = "TEMP_d.exp_") - data[, c(tmpdexp) := pop.haz*lex.dur] - # set(data, j = tmpdexp, value = data$pop.haz * data$lex.dur) - - - if (check) { - ## test for negative excess cases in factor variable combinations ------------ - - ## determine factor variables for cross-tabulating - - fac_vars <- colnames(attr(terms.formula(formula), "factors")) - - fac_list <- paste0(fac_vars, collapse=", ") - fac_list <- paste0("list(", fac_list, ")") - fac_list <- parse(text=fac_list) - - wh_fac <- as.data.table(data)[subset, unlist(lapply(eval(fac_list), is.factor))] - fac_vars <- fac_vars[wh_fac] - - if (length(fac_vars) == 0) fac_vars <- NULL - fac_list <- paste0(fac_vars, collapse=", ") - fac_list <- paste0("list(", fac_list, ")") - fac_list <- parse(text=fac_list) - - - ## test negativity of excess cases - LHS <- as.character(formula) - LHS <- LHS[2] - LHS <- parse(text = LHS) - - excas <- as.data.table(data)[subset, list(excess_cases = sum(eval(LHS)-get(tmpdexp))), keyby=eval(fac_list)] - setnames(excas, 1:ncol(excas), c(fac_vars, "excess_cases")) - - - if (any(is.na(excas$excess_cases))) { - stop("some excess cases were NA; is pop.haz available for all records?") - } - excas <- excas[excess_cases <= 0] - if (any(excas$excess_cases<=0)) { - print(excas) - warning("negative excess cases found in some combinations of factor variables; - see printed table and try e.g. wider FOT intervals") - } - } - - - ## custom poisson family ----------------------------------------------------- - RPL <- copy(poisson()) - RPL$link <- "glm relative survival model with Poisson error" - RPL$linkfun <- function(mu, d.exp = data[[tmpdexp]][subset]) log(mu - d.exp) - RPL$linkinv <- function(eta, d.exp = data[[tmpdexp]][subset]) d.exp + exp(eta) - - - RPL$initialize <- substitute( { - if (any(y < 0)) stop(paste("Negative values not allowed for", - "the Poisson family")) - n <- rep.int(1, nobs) - mustart <- pmax(y, d.exp) + 0.1 - }, list(d.exp = data[[tmpdexp]][subset]) ) - - - - ## glm call ------------------------------------------------------------------ - ## update() won't work - ## anova() works - - - ml <- glm(formula = formula, data=data[subset,], offset=log(lex.dur), - # subset = subset, ## Error in xj[i] : invalid subscript type 'closure' - family = RPL, ...) - - ## final touches ------------------------------------------------------------- - ml$d.exp <- data[subset, ][[tmpdexp]] - ml$FOT <- data[subset, ]$FOT - ml$fot.breaks <- surv.breaks - ml$call$data <- dataname - ml$call$formula <- formula - setattr(ml, "class", c("relpois", "glm", "lm")) - - setattr(data, "class", oldClass) ## see beginning of function - - - ml -} - - - -#' @title Excess hazard Poisson model -#' @author Joonas Miettinen, Karri Seppa -#' @description Estimate a Poisson Piecewise Constant Excess -#' Hazards Model -#' @param formula a formula with the counts of events as the response. -#' Passed on to \code{glm}. May contain usage of the \code{offset()} function -#' instead of supplying the offset for the Poisson model via the argument -#' \code{offset}. -#' @param data an \code{aggre} object (an aggregated data set; -#' see \code{\link{as.aggre}} and \code{\link{aggre}}) -#' @param d.exp the counts of expected cases. Mandatory. -#' E.g. \code{d.exp = EXC_CASES}, where \code{EXC_CASES} is a column in data. -#' @param offset the offset for the Poisson model, supplied as e.g. -#' \code{offset = log(PTIME)}, where \code{PTIME} is a subject-time -#' variable in data. Not mandatory, but almost always should be supplied. -#' @param breaks optional; a numeric vector of [a,b) breaks to specify -#' survival intervals over the follow-up time; if \code{NULL}, the -#' existing breaks along the mandatory time scale mentioned in \code{formula} -#' are used -#' @param subset a logical vector or condition; e.g. \code{subset = sex == 1}; -#' limits the data before estimation -#' @param piecewise \code{logical}; if \code{TRUE}, and if any time scale -#' from data is used (mentioned) in the formula, the time scale is -#' transformed into a factor variable indicating intervals on the time scale. -#' Otherwise the time scale left as it is, usually a numeric variable. -#' E.g. if \code{formula = counts ~ TS1*VAR1}, \code{TS1} is transformed -#' into a factor before fitting model. -#' @param check \code{logical}; if \code{TRUE}, performs check on the -#' negativity excess cases by factor-like covariates in formula - -#' negative excess cases will very likely lead to non-converging model -#' @param ... any other argument passed on to \code{\link[stats]{glm}} such as -#' \code{control} or \code{weights} -#' @import stats -#' -#' @return -#' A \code{relpois} object created using a custom Poisson family construct. -#' -#' @seealso -#' \code{\link{lexpand}}, \code{\link{poisson}}, \code{\link{glm}} -#' @family main functions -#' @family relpois functions -#' @examples -#' ## use the simulated rectal cancer cohort -#' data(sire, package = "popEpi") -#' sire$agegr <- cut(sire$dg_age, c(0,45,60,Inf), right=FALSE) -#' -#' ## create aggregated example data -#' fb <- c(0,3/12,6/12,1,2,3,4,5) -#' x <- lexpand(sire, birth = bi_date, entry = dg_date, -#' exit = ex_date, status=status %in% 1:2, -#' breaks = list(fot=fb), -#' pophaz=popmort, pp = FALSE, -#' aggre = list(agegr, fot)) -#' -#' ## fit model using aggregated data -#' rpm <- relpois_ag(formula = from0to1 ~ fot + agegr, data = x, -#' d.exp = d.exp, offset = log(pyrs)) -#' summary(rpm) -#' -#' ## the usual functions for handling glm models work -#' rpm2 <- update(rpm, . ~ fot*agegr) -#' anova(rpm, rpm2, test="LRT") -#' AIC(rpm, rpm2) -#' -#' ## other features such as residuals or predicting are not guaranteed -#' ## to work as intended. -#' @export - -relpois_ag <- function(formula, data, d.exp, offset = NULL, breaks = NULL, subset = NULL, piecewise = TRUE, check = TRUE, ...) { - - TF <- environment() - PF <- parent.frame(1L) - original_formula <- formula - - if (!inherits(data, "aggre")) { - stop("data is not an aggre object. Please aggregate your data first using ", - "e.g. lexpand(). If your data is pre-aggregated, use as.aggre() to ", - "mark it as such.") - } - - formula <- evalRecursive(formula, env = TF, enc = PF)$arg - if (missing(formula) || !inherits(formula, "formula")) stop("formula not defined") - - - - - ## detect survival time scale ------------------------------------------------ - oldBreaks <- copy(attr(data, "breaks")) - allScales <- names(oldBreaks) - if (is.null(oldBreaks)) { - stop("data does not have breaks information. Is it a result of using ", - "aggre() or as.aggre()?") - } - survScale <- intersect(all.vars(formula), allScales) - if (length(survScale) > 1L) { - stop("Found several used time scales in formula, which is not supported ", - "(found ", paste0("'", survScale, "'", collapse = ", "), ")") - } - - ## check supplied breaks ----------------------------------------------------- - if (is.numeric(breaks)) { - breaks <- list(breaks) - names(breaks) <- survScale - } - - if (!is.null(breaks)) { - if (!all_breaks_in(breaks, oldBreaks)) { - stop("Supplied breaks must be subset of the breaks used in splitting/", - "aggregating data. See the latter using e.g. ", - "attributes(x)$aggre.meta$breaks where x is your aggregated data.") - } - } - - ## pre-find args ------------------------------------------------------------- - desub <- substitute(d.exp) - sub_d.exp <- evalRecursive(desub, env = data[1L, ], enc = PF)$argSub - offsub <- substitute(offset) - sub_offset <- evalRecursive(offsub, env = data[1L, ], enc = PF)$argSub - - ## prep & subset data -------------------------------------------------------- - subset <- substitute(subset) - subset <- evalLogicalSubset(data, subset) - - av <- c(all.vars(formula), all.vars(sub_d.exp), all.vars(sub_offset)) - av <- intersect(names(data), av) - - x <- subsetDTorDF(data, subset = subset, select = av) - - setDT(x) - setattr(x, "class", c("aggre", "data.table", "data.frame")) - - ## handle breaks ------------------------------------------------------------- - if (!is.null(breaks)) { - if (!piecewise) { - stop("Supplied breaks but piecewise = FALSE. Please select piecewise = ", - "TRUE if you want piecewise estimates defined by the breaks.") - } - - } - - cutBreaks <- breaks - othScales <- setdiff(names(oldBreaks), names(cutBreaks)) - cutBreaks[othScales] <- oldBreaks[othScales] - cutBreaks[sapply(cutBreaks, length) < 2L] <- NULL - - if (piecewise && length(cutBreaks)) { - - for (sc in names(cutBreaks)) { - set(x, j = sc, value = cut(x[[sc]], breaks = cutBreaks[[sc]], - right = FALSE, labels = FALSE)) - - pieces <- round(cutBreaks[[sc]], 2L) - pieces <- paste0("[", pieces[-length(pieces)], ", ", pieces[-1L], ")") - set(x, j = sc, value = pieces[x[[sc]]]) - } - - } - - ## eval value args ----------------------------------------------------------- - d.exp <- evalPopArg(x, sub_d.exp, enclos = PF, - DT = TRUE, recursive = TRUE) - if (is.null(d.exp)) stop("argument d.exp was not supplied") - d.exp <- rowSums(d.exp) - if (length(d.exp) == nrow(data)) d.exp <- d.exp[subset] - - offset <- evalPopArg(x, sub_offset, enclos = PF, - DT = TRUE, recursive = TRUE) - if (!is.null(offset)) offset <- rowSums(offset) - if (length(offset) == nrow(data)) offset <- offset[subset] - - ## check excess cases -------------------------------------------------------- - d <- eval(formula[[2]], envir = x, enclos = PF) - check_excess_cases(d = d, d.exp = d.exp, data = x, - formula = formula, enclos = PF) - - ## custom poisson family ----------------------------------------------------- - RPL <- copy(poisson()) - RPL$link <- "glm relative survival model with Poisson error" - RPL$linkfun <- function(mu, d.exp = TF$d.exp) { - log(mu - d.exp) - } - RPL$linkinv <- function(eta, d.exp = TF$d.exp) { - d.exp + exp(eta) - } - - RPL$initialize <- substitute( { - if (any(y < 0)) stop(paste("Negative values not allowed for", - "the Poisson family")) - n <- rep.int(1, nobs) - mustart <- pmax(y, d.exp) + 0.1 - }, list(d.exp = TF$d.exp) ) - - ## glm call ------------------------------------------------------------------ - - ## NOTE: parent.frame(3L) to find this (this function's) environment - ml <- glm(formula = formula, data=x, offset = parent.frame(3L)$offset, family = RPL, ...) - - ## final touches ------------------------------------------------------------- - - ml$call <- match.call() - setattr(ml, "class", c("relpois", "glm", "lm")) - - ml -} - - - - - - -check_excess_cases <- function(d, d.exp, formula, data, enclos = parent.frame(1)) { - # @title Check Excess Counts for a Relative Poisson Model - # @description Checks that the excess counts by strata all exceed 0. - # @param d a vector of observed counts of cases - # @param d.exp a vector of expected counts of cases - # @param a formula, the right side of which is inspected for factor-like - # stratifying variables (factors and character variables) - # @param data a data set to eval formula in its context - # @param enclos passed on to RHS2DT() to evaluate formula to columns; - # enclosing environment of data - PF <- parent.frame(1) - tF <- environment() - - d.exc <- NULL - - by <- RHS2DT(formula, data = data, enclos = enclos) - if (!length(by)) by <- list() - facVars <- names(by)[sapply(by, function(col) is.factor(col) || is.character(col))] - - d <- substitute(d) - d <- eval(d, envir = data, enclos = PF) - d.exp <- substitute(d.exp) - d.exp <- eval(d.exp, envir = data, enclos = PF) - - if (length(facVars)) { - by <- setDT(mget(facVars, as.environment(by))) - } else { - by <- list() - } - - dt <- data.table(d = d, d.exp = d.exp) - dt[, d.exc := d - d.exp] - - for (k in seq_along(names(by))) { - bycol <- names(by)[k] - - tab <- dt[, lapply(.SD, sum), keyby = .(by[[bycol]])][d.exc <= 0L, ] - setnames(tab, 1, bycol) - if (nrow(tab)) { - on.exit(print(tab)) - stop("There are negative excess cases in the data calculated separately ", - "by the factor-like variables ", - paste0("'", facVars, "'", collapse = ", "), ". The model is not ", - "estimable with negative excess cases in strata. ", - "Infracting levels:") - } - - } - - if (!length(by)) { - tab <- dt[, lapply(.SD, sum)] - if (tab$d.exc <= 0L) { - stop("The marginal sum of excess cases is negative; the model cannot ", - "be fitted. ") - } - } - - - - - invisible(NULL) -} - - - - - - -relpois_lex <- function(formula, - data, - pophaz = NULL, - breaks = NULL, - subset = NULL, - check = TRUE, - ...) { - PF <- parent.frame(1) - TF <- environment() - - form <- agVars <- NULL - - - ## checks -------------------------------------------------------------------- - - checkLexisData(data) - checkPophaz(lex = data, ph = pophaz) - if (!is.null(breaks)) checkBreaksList(breaks) - - oldBreaks <- copy(attr(data, "breaks")) - allScales <- copy(attr(data, "time.scales")) - - - ## detect which time scale used ---------------------------------------------- - - survScale <- intersect(all.vars(formula), allScales) - if (length(survScale) > 1L) { - stop("Found several used time scales in formula, which is not supported ", - "(found ", paste0("'", survScale, "'", collapse = ", "), ")") - } - ## subset -------------------------------------------------------------------- - - sb <- substitute(subset) - subset <- evalLogicalSubset(data, sb, enclos = PF) - x <- data[subset, ] - - ## essentially same steps as in survtab() here, maybe make that - ## into a function / generalize lexpand. - - ## splitting ----------------------------------------------------------------- - if (is.numeric(breaks) && length(survScale)) { - breaks <- list(breaks) - names(breaks) <- survScale - } - if (!is.null(breaks)) x <- splitMulti(x, breaks = breaks, drop = TRUE) - newBreaks <- copy(attr(x, "breaks")) - - ## merge in pophaz ----------------------------------------------------------- - haz <- makeTempVarName(x, pre = "haz_") - ph <- data.table(pophaz) - phVars <- setdiff(names(ph), "haz") - setnames(ph, "haz", haz) - x <- cutLowMerge(x, pophaz, by = phVars, all.x = TRUE, all.y = FALSE, - old.nums = TRUE, mid.scales = intersect(allScales, phVars)) - - # expected cases - d.exp <- makeTempVarName(x, pre = "d.exp_") - set(x, j = d.exp, value = x$lex.dur * x[[haz]]) - - ## aggregating --------------------------------------------------------------- - ag <- model.frame(formula[-2], data = x) ## without response - setDT(ag) - set(ag, j = d.exp, value = x[[d.exp]]) - d <- makeTempVarName(x, pre = "d_") - set(ag, j = d, value = eval(form)) - ag <- aggre(x, by = agVars, sum.values = d.exp) - rm(x) - - ag_form <- formula - ag_form[[2]] <- quote(from0to1) - - rp <- relpois_ag(ag_form, data = data, breaks = NULL) - - rp$call <- match.call() - rp$formula <- formula - - rp -} +#' @title Excess hazard Poisson model +#' @author Joonas Miettinen, Karri Seppa +#' @description Estimate a Poisson piecewise constant excess +#' hazards model +#' @param data a dataset split with e.g. \code{\link{lexpand}}; +#' must have expected hazard merged within +#' @param formula a formula which is passed on to \code{glm}; see Details +#' @param fot.breaks optional; a numeric vector of [a,b) breaks to specify +#' survival intervals over the follow-up time; if \code{NULL}, the +#' existing breaks along the mandatory \code{fot} time scale in \code{data} +#' are used (e.g. the breaks for \code{fot} supplied to \code{lexpand}) +#' @param subset a logical vector or condition; e.g. \code{subset = sex == 1}; +#' limits the data before estimation +#' @param check logical; if \code{TRUE}, tabulates excess cases by all +#' factor variables in the formula to check for negative / \code{NA} +#' excess cases before fitting the GLM +#' @param ... any argument passed on to \code{glm} +#' @import stats +#' @details +#' +#' \strong{Basics} +#' +#' \code{relpois} employs a custom link function of the Poisson variety +#' to estimate piecewise constant parametric excess hazards. The pieces +#' are determined by \code{fot.breaks}. A \code{log(person-years)} offset +#' is passed automatically to the \code{glm} call. +#' +#' \strong{Formula usage} +#' +#' The formula can be used like any ordinary \code{glm} formula. The user must +#' define the outcome in some manner, which is usually \code{lex.Xst} after splitting +#' with e.g. \code{lexpand}. The exception is the possibility of including +#' the baseline excess hazard terms by including the +#' reserved term \code{FOT} in the formula. +#' +#' For example, \code{lex.Xst != 0 ~ FOT + agegr} estimates a model with constant +#' excess hazards at the follow-up intervals as specified by +#' the pertinent breaks used in splitting \code{data}, +#' as well as for the different age groups. +#' \code{FOT} is created ad hoc if it is used in the formula. +#' If you leave out \code{FOT}, the hazard is effectively +#' assumed to be constant across the whole follow-up time. +#' +#' You can also simply use your own follow-up time interval variable that +#' you have created before calling \code{relpois}. However, when using +#' \code{FOT}, \code{relpois} automatically checks for e.g. +#' negative excess cases in follow-up intervals, +#' allowing for quickly finding splitting breaks +#' where model estimation is possible. It also drops any data outside the +#' follow-up time window. +#' +#' \strong{Splitting and merging population hazard} +#' +#' The easiest way to both split and to include population hazard information is +#' by using \code{\link{lexpand}}. You may also fairly easily do it by hand +#' by splitting first and then merging in your population hazard information. +#' +#' +#' \strong{Data requirements} +#' +#' The population hazard information must be available for each record and named +#' \code{pop.haz}. The follow-up time variable must be named \code{"fot"} e.g. +#' as a result of using \code{lexpand}. The \code{lex.dur} variable must also +#' be present, containing person-year information. +#' +#' +#' @return +#' A \code{glm} object created using a custom Poisson family construct. Some +#' \code{glm} methods are applicable. +#' +#' @seealso +#' \code{\link{lexpand}}, \code{\link{poisson}}, \code{\link{glm}} +#' @family main functions +#' @family relpois functions +#' @export relpois +#' +#' @examples +#' ## use the simulated rectal cancer cohort +#' data("sire", package = "popEpi") +#' sire$agegr <- cut(sire$dg_age, c(0,45,60,Inf), right=FALSE) +#' +#' ## usable straight away after splitting +#' fb <- c(0,3/12,6/12,1,2,3,4,5) +#' x <- lexpand(sire, birth = bi_date, entry = dg_date, +#' exit = ex_date, status=status, +#' breaks = list(fot=fb), pophaz=popmort) +#' rpm <- relpois(x, formula = lex.Xst %in% 1:2 ~ FOT + agegr) +#' +#' ## some methods for glm work. e.g. test for interaction +#' \dontrun{ +#' rpm2 <- relpois(x, formula = lex.Xst %in% 1:2 ~ FOT*agegr) +#' anova(rpm, rpm2, test="LRT") +#' AIC(rpm, rpm2) +#' ## update won't work currently +#' } + + +relpois <- function(data, + formula, + fot.breaks = NULL, subset = NULL, check=TRUE, ...) { + ## R CMD CHECK appeasement + lex.dur <- NULL + + ## prep arguments ------------------------------------------------------------ + excess_cases <- fot <- pop.haz <- NULL ## appease R CMD CHECK + + ## somehow the class of the data is being altered by this function + oldClass <- class(data) + + if (missing(formula) || !inherits(formula, "formula")) stop("formula not defined") + + form_vars <- all.vars(formula) + dataname <- as.name(deparse(substitute(data))) + + if (!inherits(data, "Lexis")) { + stop("data is not a Lexis object; data must be a result of splitting or using Lexis") + } + + if ("FOT" %in% names(data)) { + stop("FOT is a reserved name but you have a variable with that name in data; rename/delete it first") + } + + # wasDF <- FALSE + if (!is.data.table(data)) { + data <- copy(data) + setDT(data) + message("Took a copy of your data because it was a data.frame and not a data.table. This may take up a lot of memory.") + message("It is recommended to convert your data to data.table before using this function using as.data.table or setDT") + } + + req_vars <- unique(c("lex.id", "fot", "lex.dur", "pop.haz", setdiff(form_vars, "FOT"))) + all_names_present(data, req_vars) + + surv.breaks <- attr(data, "breaks")$fot + if (is.null(surv.breaks)) { + stop("did not find any breaks information in data attributes named 'fot'; + probable reason: split data was edited after splitting - ", + "don't do that") + } else { + if (!is.null(fot.breaks)) { + if (any(!fot.breaks %in% surv.breaks)) { + stop("fot.breaks must be a subset of the breaks for 'fot' used in splitting; + type attr(data, 'breaks')$fot to see the breaks you used in splitting") + } else { + surv.breaks <- fot.breaks + } + } + + } + + ## prep & subset data -------------------------------------------------------- + subset <- substitute(subset) + subset <- evalLogicalSubset(data, subset) + + if (any(is.na(data[subset, ]$pop.haz))) { + stop("some pop.haz are NA") + } + + on.exit({ + setcolsnull(data, c("FOT", tmpdexp), soft = TRUE) + }, add = TRUE) + + if ("FOT" %in% form_vars) { + data[, "FOT" := cut(fot, breaks = surv.breaks, right = FALSE)] + # set(data, j = "FOT", value = cut(data$fot, breaks = surv.breaks, right=FALSE)) + subset <- subset & !is.na(data$FOT) + } + tmpdexp <- makeTempVarName(data, pre = "TEMP_d.exp_") + data[, c(tmpdexp) := pop.haz*lex.dur] + # set(data, j = tmpdexp, value = data$pop.haz * data$lex.dur) + + + if (check) { + ## test for negative excess cases in factor variable combinations ------------ + + ## determine factor variables for cross-tabulating + + fac_vars <- colnames(attr(terms.formula(formula), "factors")) + + fac_list <- paste0(fac_vars, collapse=", ") + fac_list <- paste0("list(", fac_list, ")") + fac_list <- parse(text=fac_list) + + wh_fac <- as.data.table(data)[subset, unlist(lapply(eval(fac_list), is.factor))] + fac_vars <- fac_vars[wh_fac] + + if (length(fac_vars) == 0) fac_vars <- NULL + fac_list <- paste0(fac_vars, collapse=", ") + fac_list <- paste0("list(", fac_list, ")") + fac_list <- parse(text=fac_list) + + + ## test negativity of excess cases + LHS <- as.character(formula) + LHS <- LHS[2] + LHS <- parse(text = LHS) + + excas <- as.data.table(data)[subset, list(excess_cases = sum(eval(LHS)-get(tmpdexp))), keyby=eval(fac_list)] + setnames(excas, 1:ncol(excas), c(fac_vars, "excess_cases")) + + + if (any(is.na(excas$excess_cases))) { + stop("some excess cases were NA; is pop.haz available for all records?") + } + excas <- excas[excess_cases <= 0] + if (any(excas$excess_cases<=0)) { + print(excas) + warning("negative excess cases found in some combinations of factor variables; + see printed table and try e.g. wider FOT intervals") + } + } + + + ## custom poisson family ----------------------------------------------------- + RPL <- copy(poisson()) + RPL$link <- "glm relative survival model with Poisson error" + RPL$linkfun <- function(mu, d.exp = data[[tmpdexp]][subset]) log(mu - d.exp) + RPL$linkinv <- function(eta, d.exp = data[[tmpdexp]][subset]) d.exp + exp(eta) + + + RPL$initialize <- substitute( { + if (any(y < 0)) stop(paste("Negative values not allowed for", + "the Poisson family")) + n <- rep.int(1, nobs) + mustart <- pmax(y, d.exp) + 0.1 + }, list(d.exp = data[[tmpdexp]][subset]) ) + + + + ## glm call ------------------------------------------------------------------ + ## update() won't work + ## anova() works + + + ml <- glm(formula = formula, data=data[subset,], offset=log(lex.dur), + # subset = subset, ## Error in xj[i] : invalid subscript type 'closure' + family = RPL, ...) + + ## final touches ------------------------------------------------------------- + ml$d.exp <- data[subset, ][[tmpdexp]] + ml$FOT <- data[subset, ]$FOT + ml$fot.breaks <- surv.breaks + ml$call$data <- dataname + ml$call$formula <- formula + setattr(ml, "class", c("relpois", "glm", "lm")) + + setattr(data, "class", oldClass) ## see beginning of function + + + ml +} + + + +#' @title Excess hazard Poisson model +#' @author Joonas Miettinen, Karri Seppa +#' @description Estimate a Poisson Piecewise Constant Excess +#' Hazards Model +#' @param formula a formula with the counts of events as the response. +#' Passed on to \code{glm}. May contain usage of the \code{offset()} function +#' instead of supplying the offset for the Poisson model via the argument +#' \code{offset}. +#' @param data an \code{aggre} object (an aggregated data set; +#' see \code{\link{as.aggre}} and \code{\link{aggre}}) +#' @param d.exp the counts of expected cases. Mandatory. +#' E.g. \code{d.exp = EXC_CASES}, where \code{EXC_CASES} is a column in data. +#' @param offset the offset for the Poisson model, supplied as e.g. +#' \code{offset = log(PTIME)}, where \code{PTIME} is a subject-time +#' variable in data. Not mandatory, but almost always should be supplied. +#' @param breaks optional; a numeric vector of [a,b) breaks to specify +#' survival intervals over the follow-up time; if \code{NULL}, the +#' existing breaks along the mandatory time scale mentioned in \code{formula} +#' are used +#' @param subset a logical vector or condition; e.g. \code{subset = sex == 1}; +#' limits the data before estimation +#' @param piecewise \code{logical}; if \code{TRUE}, and if any time scale +#' from data is used (mentioned) in the formula, the time scale is +#' transformed into a factor variable indicating intervals on the time scale. +#' Otherwise the time scale left as it is, usually a numeric variable. +#' E.g. if \code{formula = counts ~ TS1*VAR1}, \code{TS1} is transformed +#' into a factor before fitting model. +#' @param check \code{logical}; if \code{TRUE}, performs check on the +#' negativity excess cases by factor-like covariates in formula - +#' negative excess cases will very likely lead to non-converging model +#' @param ... any other argument passed on to \code{\link[stats]{glm}} such as +#' \code{control} or \code{weights} +#' @import stats +#' +#' @return +#' A \code{relpois} object created using a custom Poisson family construct. +#' +#' @seealso +#' \code{\link{lexpand}}, \code{\link{poisson}}, \code{\link{glm}} +#' @family main functions +#' @family relpois functions +#' @examples +#' ## use the simulated rectal cancer cohort +#' data(sire, package = "popEpi") +#' sire$agegr <- cut(sire$dg_age, c(0,45,60,Inf), right=FALSE) +#' +#' ## create aggregated example data +#' fb <- c(0,3/12,6/12,1,2,3,4,5) +#' x <- lexpand(sire, birth = bi_date, entry = dg_date, +#' exit = ex_date, status=status %in% 1:2, +#' breaks = list(fot=fb), +#' pophaz=popmort, pp = FALSE, +#' aggre = list(agegr, fot)) +#' +#' ## fit model using aggregated data +#' rpm <- relpois_ag(formula = from0to1 ~ fot + agegr, data = x, +#' d.exp = d.exp, offset = log(pyrs)) +#' summary(rpm) +#' +#' ## the usual functions for handling glm models work +#' rpm2 <- update(rpm, . ~ fot*agegr) +#' anova(rpm, rpm2, test="LRT") +#' AIC(rpm, rpm2) +#' +#' ## other features such as residuals or predicting are not guaranteed +#' ## to work as intended. +#' @export + +relpois_ag <- function(formula, data, d.exp, offset = NULL, breaks = NULL, subset = NULL, piecewise = TRUE, check = TRUE, ...) { + + TF <- environment() + PF <- parent.frame(1L) + original_formula <- formula + + if (!inherits(data, "aggre")) { + stop("data is not an aggre object. Please aggregate your data first using ", + "e.g. lexpand(). If your data is pre-aggregated, use as.aggre() to ", + "mark it as such.") + } + + formula <- evalRecursive(formula, env = TF, enc = PF)$arg + if (missing(formula) || !inherits(formula, "formula")) stop("formula not defined") + + + + + ## detect survival time scale ------------------------------------------------ + oldBreaks <- copy(attr(data, "breaks")) + allScales <- names(oldBreaks) + if (is.null(oldBreaks)) { + stop("data does not have breaks information. Is it a result of using ", + "aggre() or as.aggre()?") + } + survScale <- intersect(all.vars(formula), allScales) + if (length(survScale) > 1L) { + stop("Found several used time scales in formula, which is not supported ", + "(found ", paste0("'", survScale, "'", collapse = ", "), ")") + } + + ## check supplied breaks ----------------------------------------------------- + if (is.numeric(breaks)) { + breaks <- list(breaks) + names(breaks) <- survScale + } + + if (!is.null(breaks)) { + if (!all_breaks_in(breaks, oldBreaks)) { + stop("Supplied breaks must be subset of the breaks used in splitting/", + "aggregating data. See the latter using e.g. ", + "attributes(x)$aggre.meta$breaks where x is your aggregated data.") + } + } + + ## pre-find args ------------------------------------------------------------- + desub <- substitute(d.exp) + sub_d.exp <- evalRecursive(desub, env = data[1L, ], enc = PF)$argSub + offsub <- substitute(offset) + sub_offset <- evalRecursive(offsub, env = data[1L, ], enc = PF)$argSub + + ## prep & subset data -------------------------------------------------------- + subset <- substitute(subset) + subset <- evalLogicalSubset(data, subset) + + av <- c(all.vars(formula), all.vars(sub_d.exp), all.vars(sub_offset)) + av <- intersect(names(data), av) + + x <- subsetDTorDF(data, subset = subset, select = av) + + setDT(x) + setattr(x, "class", c("aggre", "data.table", "data.frame")) + + ## handle breaks ------------------------------------------------------------- + if (!is.null(breaks)) { + if (!piecewise) { + stop("Supplied breaks but piecewise = FALSE. Please select piecewise = ", + "TRUE if you want piecewise estimates defined by the breaks.") + } + + } + + cutBreaks <- breaks + othScales <- setdiff(names(oldBreaks), names(cutBreaks)) + cutBreaks[othScales] <- oldBreaks[othScales] + cutBreaks[sapply(cutBreaks, length) < 2L] <- NULL + + if (piecewise && length(cutBreaks)) { + + for (sc in names(cutBreaks)) { + set(x, j = sc, value = cut(x[[sc]], breaks = cutBreaks[[sc]], + right = FALSE, labels = FALSE)) + + pieces <- round(cutBreaks[[sc]], 2L) + pieces <- paste0("[", pieces[-length(pieces)], ", ", pieces[-1L], ")") + set(x, j = sc, value = pieces[x[[sc]]]) + } + + } + + ## eval value args ----------------------------------------------------------- + d.exp <- evalPopArg(x, sub_d.exp, enclos = PF, + DT = TRUE, recursive = TRUE) + if (is.null(d.exp)) stop("argument d.exp was not supplied") + d.exp <- rowSums(d.exp) + if (length(d.exp) == nrow(data)) d.exp <- d.exp[subset] + + offset <- evalPopArg(x, sub_offset, enclos = PF, + DT = TRUE, recursive = TRUE) + if (!is.null(offset)) offset <- rowSums(offset) + if (length(offset) == nrow(data)) offset <- offset[subset] + + ## check excess cases -------------------------------------------------------- + d <- eval(formula[[2]], envir = x, enclos = PF) + check_excess_cases(d = d, d.exp = d.exp, data = x, + formula = formula, enclos = PF) + + ## custom poisson family ----------------------------------------------------- + RPL <- copy(poisson()) + RPL$link <- "glm relative survival model with Poisson error" + RPL$linkfun <- function(mu, d.exp = TF$d.exp) { + log(mu - d.exp) + } + RPL$linkinv <- function(eta, d.exp = TF$d.exp) { + d.exp + exp(eta) + } + + RPL$initialize <- substitute( { + if (any(y < 0)) stop(paste("Negative values not allowed for", + "the Poisson family")) + n <- rep.int(1, nobs) + mustart <- pmax(y, d.exp) + 0.1 + }, list(d.exp = TF$d.exp) ) + + ## glm call ------------------------------------------------------------------ + + ## NOTE: parent.frame(3L) to find this (this function's) environment + ml <- glm(formula = formula, data=x, offset = parent.frame(3L)$offset, family = RPL, ...) + + ## final touches ------------------------------------------------------------- + + ml$call <- match.call() + setattr(ml, "class", c("relpois", "glm", "lm")) + + ml +} + + + + + + +check_excess_cases <- function(d, d.exp, formula, data, enclos = parent.frame(1)) { + # @title Check Excess Counts for a Relative Poisson Model + # @description Checks that the excess counts by strata all exceed 0. + # @param d a vector of observed counts of cases + # @param d.exp a vector of expected counts of cases + # @param a formula, the right side of which is inspected for factor-like + # stratifying variables (factors and character variables) + # @param data a data set to eval formula in its context + # @param enclos passed on to RHS2DT() to evaluate formula to columns; + # enclosing environment of data + PF <- parent.frame(1) + tF <- environment() + + d.exc <- NULL + + by <- RHS2DT(formula, data = data, enclos = enclos) + if (!length(by)) by <- list() + facVars <- names(by)[sapply(by, function(col) is.factor(col) || is.character(col))] + + d <- substitute(d) + d <- eval(d, envir = data, enclos = PF) + d.exp <- substitute(d.exp) + d.exp <- eval(d.exp, envir = data, enclos = PF) + + if (length(facVars)) { + by <- setDT(mget(facVars, as.environment(by))) + } else { + by <- list() + } + + dt <- data.table(d = d, d.exp = d.exp) + dt[, d.exc := d - d.exp] + + for (k in seq_along(names(by))) { + bycol <- names(by)[k] + + tab <- dt[, lapply(.SD, sum), keyby = .(by[[bycol]])][d.exc <= 0L, ] + setnames(tab, 1, bycol) + if (nrow(tab)) { + on.exit(print(tab)) + stop("There are negative excess cases in the data calculated separately ", + "by the factor-like variables ", + paste0("'", facVars, "'", collapse = ", "), ". The model is not ", + "estimable with negative excess cases in strata. ", + "Infracting levels:") + } + + } + + if (!length(by)) { + tab <- dt[, lapply(.SD, sum)] + if (tab$d.exc <= 0L) { + stop("The marginal sum of excess cases is negative; the model cannot ", + "be fitted. ") + } + } + + + + + invisible(NULL) +} + + + + + + +relpois_lex <- function(formula, + data, + pophaz = NULL, + breaks = NULL, + subset = NULL, + check = TRUE, + ...) { + PF <- parent.frame(1) + TF <- environment() + + form <- agVars <- NULL + + + ## checks -------------------------------------------------------------------- + + checkLexisData(data) + checkPophaz(lex = data, ph = pophaz) + if (!is.null(breaks)) checkBreaksList(breaks) + + oldBreaks <- copy(attr(data, "breaks")) + allScales <- copy(attr(data, "time.scales")) + + + ## detect which time scale used ---------------------------------------------- + + survScale <- intersect(all.vars(formula), allScales) + if (length(survScale) > 1L) { + stop("Found several used time scales in formula, which is not supported ", + "(found ", paste0("'", survScale, "'", collapse = ", "), ")") + } + ## subset -------------------------------------------------------------------- + + sb <- substitute(subset) + subset <- evalLogicalSubset(data, sb, enclos = PF) + x <- data[subset, ] + + ## essentially same steps as in survtab() here, maybe make that + ## into a function / generalize lexpand. + + ## splitting ----------------------------------------------------------------- + if (is.numeric(breaks) && length(survScale)) { + breaks <- list(breaks) + names(breaks) <- survScale + } + if (!is.null(breaks)) x <- splitMulti(x, breaks = breaks, drop = TRUE) + newBreaks <- copy(attr(x, "breaks")) + + ## merge in pophaz ----------------------------------------------------------- + haz <- makeTempVarName(x, pre = "haz_") + ph <- data.table(pophaz) + phVars <- setdiff(names(ph), "haz") + setnames(ph, "haz", haz) + x <- cutLowMerge(x, pophaz, by = phVars, all.x = TRUE, all.y = FALSE, + old.nums = TRUE, mid.scales = intersect(allScales, phVars)) + + # expected cases + d.exp <- makeTempVarName(x, pre = "d.exp_") + set(x, j = d.exp, value = x$lex.dur * x[[haz]]) + + ## aggregating --------------------------------------------------------------- + ag <- model.frame(formula[-2], data = x) ## without response + setDT(ag) + set(ag, j = d.exp, value = x[[d.exp]]) + d <- makeTempVarName(x, pre = "d_") + set(ag, j = d, value = eval(form)) + ag <- aggre(x, by = agVars, sum.values = d.exp) + rm(x) + + ag_form <- formula + ag_form[[2]] <- quote(from0to1) + + rp <- relpois_ag(ag_form, data = data, breaks = NULL) + + rp$call <- match.call() + rp$formula <- formula + + rp +} diff --git a/R/relative_poisson_net_survival.R b/R/relative_poisson_net_survival.R index 9125165..44c7806 100644 --- a/R/relative_poisson_net_survival.R +++ b/R/relative_poisson_net_survival.R @@ -1,176 +1,176 @@ -#' @title Marginal piecewise parametric relative survival curve -#' @author Joonas Miettinen -#' @description Fit a marginal relative survival curve based on a \code{relpois} fit -#' @param object a \code{relpois} object -#' @details -#' \pkg{popEpi} version 0.2.1 supported confidence intervals but due to lack -#' of testing this is disabled until the intervals are subjected to more rigorous testing. -#' -#' Currently only estimates a marginal curve, i.e. the average of all -#' possible individual curves. -#' -#' Only supported when the reserved \code{FOT} variable was used in \code{relpois}. -#' Computes a curve for each unique combination of covariates (e.g. 4 sets) -#' and returns a weighted average curve based on the counts -#' of subjects for each combination (e.g. 1000, 125, 50, 25 respectively). -#' Fairly fast when only factor variables have been used, otherwise -#' go get a cup of coffee. -#' -#' If delayed entry is present in data due to period analysis limiting, -#' the marginal curve is constructed only for those whose follow-up started -#' in the respective period. -#' -#' @export -#' @family relpois functions -#' -#' @import data.table -#' @import Epi -#' @import stats -#' -#' @examples -#' \dontrun{ -#' ## use the simulated rectal cancer cohort -#' data("sire", package = "popEpi") -#' ab <- c(0,45,55,65,70,Inf) -#' sire$agegr <- cut(sire$dg_age, breaks = ab, right = FALSE) -#' -#' BL <- list(fot= seq(0,10,1/12)) -#' pm <- data.frame(popEpi::popmort) -#' x <- lexpand(sire, breaks=BL, pophaz=pm, -#' birth = bi_date, -#' entry = dg_date, exit = ex_date, -#' status = status %in% 1:2) -#' -#' rpm <- relpois(x, formula = lex.Xst %in% 1:2 ~ -1+ FOT + agegr, -#' fot.breaks=c(0,0.25,0.5,1:8,10)) -#' pmc <- rpcurve(rpm) -#' -#' ## compare with non-parametric estimates -#' names(pm) <- c("sex", "per", "age", "haz") -#' x$agegr <- cut(x$dg_age, c(0,45,55,65,75,Inf), right = FALSE) -#' st <- survtab(fot ~ adjust(agegr), data = x, weights = "internal", -#' pophaz = pm) -#' -#' -#' plot(st, y = "r.e2.as") -#' lines(y = pmc$est, x = pmc$Tstop, col="red") -#' } -#' -#' -#' - -rpcurve <- function(object = NULL) { - - Tstart <- FOT <- uni_id <- uni_n <- uni_w <- - lo <- hi <- lex.Xst <- NULL ## APPEASE R CMD CHECK - ## sanity checks ------------------------------------------------------------- - if (is.null(object)) stop("no relative Poisson excess hazard model given") - - if (!inherits(object, "relpois")) stop("not a relpois object") - - if (!"FOT" %in% all.vars(object$formula)) stop("No FOT variable in model formula") - - est <- fot <- pop.haz <- delta <- Tstop <- Tstar <- lex.id <- - fot <- lex.multi <- pyrs <- NULL ## appease R CMD CHECK - - ## collate surv.ints, breaks, deltas ----------------------------------------- - fotlevs <- as.factor(sort(as.character(unique(object$model$FOT)))) - fb <- sort(object$fot.breaks) - fb <- data.table(Tstart = fb[-length(fb)], Tstop = fb[-1]) - fb[, FOT := fotlevs] - fb[, delta := Tstop-Tstart] - n_ints <- nrow(fb) - - ## model data / model matrix construction ------------------------------------ - modmat <- data.table(object$data) - if (!"lex.multi" %in% names(modmat)) { - setkey(modmat, lex.id, fot) - modmat[, lex.multi := 1:.N, by = lex.id] - } - setkey(modmat, lex.id, lex.multi) - modmat <- unique(modmat, by = "lex.id") - modmat <- modmat[fot == 0] ## with period data, only non-delayed entries used - modmat <- modmat[rep(1:.N, each = n_ints)] - IDs <- modmat$lex.id - n_matrows <- length(IDs) - - setcolsnull(modmat, keep = c(all.vars(object$formula))) - setcolsnull(modmat, "FOT") - modmat <- cbind(fb[, list(FOT=FOT, lex.dur = delta)], modmat) - modmat[, lex.Xst := factor(levels(as.factor(lex.Xst))[1])] - modmat[, order := 1:.N] - - ## unique sets of covariates only - umodmat <- unique(modmat, by = setdiff(names(modmat), c("lex.dur","lex.Xst","order"))) - umodmat[, uni_id := rep(1:(nrow(umodmat)/n_ints), each=n_ints)] - - setkeyv(umodmat, setdiff(names(modmat), c("lex.dur","lex.Xst","order","uni_id"))) - setkeyv(modmat, setdiff(names(modmat), c("lex.dur","lex.Xst","order","uni_id"))) - - umodmat[, uni_n := umodmat[modmat, list(uni_n = .N/n_ints), by=uni_id]$uni_n] - - setkeyv(umodmat, c("uni_id", "order")) - mean_weights <- umodmat$uni_n - IDs <- umodmat$uni_id - - setcolsnull(umodmat, delete=c("order","uni_id","uni_n")) - - modmat <- stats::model.matrix(object, data=umodmat) - mmattrs <- attributes(modmat) - mmattrs$dimnames <- mmattrs$dim <- NULL - - - ## (unique covariate) subject-specific curve fits ---------------------------- - l <- split(data.table(modmat), IDs) - l <- lapply(l, as.matrix) - attrsetter <- function(obj) { - mostattributes(obj) <- c(attributes(obj), mmattrs) - obj - } - l <- lapply(l, attrsetter) - - epicumgetter <- function(x, ...) { - Epi::ci.cum(ctr.mat = x, ..., alpha = 1-0.95, Exp = TRUE, ci.Exp = TRUE) - } - - tab <- lapply(l, epicumgetter, obj=object, intl = fb$delta); rm(l) - - ## collate & compute relative survivals -------------------------------------- - tab <- lapply(tab, as.data.table) - tab <- rbindlist(tab) - setnames(tab, names(tab), c("est", "lo", "hi", "SE")) - tab[, FOT := fotlevs] - tab[, uni_id := IDs] - tab[, uni_w := mean_weights] - Haz2RS <- function(x) { - sum(exp(-x)*tab$uni_w)/n_matrows - } - tab <- tab[, lapply(list(est=est,lo=lo,hi=hi), `-`)] - tab <- tab[, lapply(list(est=est,lo=lo,hi=hi), exp)] - tab[, `:=`(est=est*mean_weights,lo=lo*mean_weights,hi=hi*mean_weights)] - tab[, FOT := fotlevs] - tab <- tab[, lapply(list(est=est, lo=lo, hi=hi), sum), by = FOT] - tab <- tab[, lapply(list(est=est, lo=lo, hi=hi), function(x){x/(n_matrows/n_ints)}), by = FOT] - - setkey(tab, FOT); setkey(fb, FOT) - tab <- fb[tab] - - ## disabled CI computation in 0.2.2 due to lack of testing & certainty of correctness - setcolsnull(tab, c("lo", "hi")) - - setattr(tab, "class", c("data.table", "data.frame")) - if (!return_DT()) setDFpe(tab) - tab[] -} - -#' @title Relative Poisson family object -#' @author Karri Seppa -#' @description A family object for GLM fitting of relative Poisson models -#' @format -#' A list very similar to that created by \code{poisson()}. -#' @export -#' @family relpois functions -RPL <- copy(poisson()) -RPL$link <- "glm relative survival model with Poisson error" -RPL$linkfun <- function(mu, d.exp) log(mu - d.exp) -RPL$linkinv <- function(eta, d.exp) d.exp + exp(eta) +#' @title Marginal piecewise parametric relative survival curve +#' @author Joonas Miettinen +#' @description Fit a marginal relative survival curve based on a \code{relpois} fit +#' @param object a \code{relpois} object +#' @details +#' \pkg{popEpi} version 0.2.1 supported confidence intervals but due to lack +#' of testing this is disabled until the intervals are subjected to more rigorous testing. +#' +#' Currently only estimates a marginal curve, i.e. the average of all +#' possible individual curves. +#' +#' Only supported when the reserved \code{FOT} variable was used in \code{relpois}. +#' Computes a curve for each unique combination of covariates (e.g. 4 sets) +#' and returns a weighted average curve based on the counts +#' of subjects for each combination (e.g. 1000, 125, 50, 25 respectively). +#' Fairly fast when only factor variables have been used, otherwise +#' go get a cup of coffee. +#' +#' If delayed entry is present in data due to period analysis limiting, +#' the marginal curve is constructed only for those whose follow-up started +#' in the respective period. +#' +#' @export +#' @family relpois functions +#' +#' @import data.table +#' @import Epi +#' @import stats +#' +#' @examples +#' \dontrun{ +#' ## use the simulated rectal cancer cohort +#' data("sire", package = "popEpi") +#' ab <- c(0,45,55,65,70,Inf) +#' sire$agegr <- cut(sire$dg_age, breaks = ab, right = FALSE) +#' +#' BL <- list(fot= seq(0,10,1/12)) +#' pm <- data.frame(popEpi::popmort) +#' x <- lexpand(sire, breaks=BL, pophaz=pm, +#' birth = bi_date, +#' entry = dg_date, exit = ex_date, +#' status = status %in% 1:2) +#' +#' rpm <- relpois(x, formula = lex.Xst %in% 1:2 ~ -1+ FOT + agegr, +#' fot.breaks=c(0,0.25,0.5,1:8,10)) +#' pmc <- rpcurve(rpm) +#' +#' ## compare with non-parametric estimates +#' names(pm) <- c("sex", "per", "age", "haz") +#' x$agegr <- cut(x$dg_age, c(0,45,55,65,75,Inf), right = FALSE) +#' st <- survtab(fot ~ adjust(agegr), data = x, weights = "internal", +#' pophaz = pm) +#' +#' +#' plot(st, y = "r.e2.as") +#' lines(y = pmc$est, x = pmc$Tstop, col="red") +#' } +#' +#' +#' + +rpcurve <- function(object = NULL) { + + Tstart <- FOT <- uni_id <- uni_n <- uni_w <- + lo <- hi <- lex.Xst <- NULL ## APPEASE R CMD CHECK + ## sanity checks ------------------------------------------------------------- + if (is.null(object)) stop("no relative Poisson excess hazard model given") + + if (!inherits(object, "relpois")) stop("not a relpois object") + + if (!"FOT" %in% all.vars(object$formula)) stop("No FOT variable in model formula") + + est <- fot <- pop.haz <- delta <- Tstop <- Tstar <- lex.id <- + fot <- lex.multi <- pyrs <- NULL ## appease R CMD CHECK + + ## collate surv.ints, breaks, deltas ----------------------------------------- + fotlevs <- as.factor(sort(as.character(unique(object$model$FOT)))) + fb <- sort(object$fot.breaks) + fb <- data.table(Tstart = fb[-length(fb)], Tstop = fb[-1]) + fb[, FOT := fotlevs] + fb[, delta := Tstop-Tstart] + n_ints <- nrow(fb) + + ## model data / model matrix construction ------------------------------------ + modmat <- data.table(object$data) + if (!"lex.multi" %in% names(modmat)) { + setkey(modmat, lex.id, fot) + modmat[, lex.multi := 1:.N, by = lex.id] + } + setkey(modmat, lex.id, lex.multi) + modmat <- unique(modmat, by = "lex.id") + modmat <- modmat[fot == 0] ## with period data, only non-delayed entries used + modmat <- modmat[rep(1:.N, each = n_ints)] + IDs <- modmat$lex.id + n_matrows <- length(IDs) + + setcolsnull(modmat, keep = c(all.vars(object$formula))) + setcolsnull(modmat, "FOT") + modmat <- cbind(fb[, list(FOT=FOT, lex.dur = delta)], modmat) + modmat[, lex.Xst := factor(levels(as.factor(lex.Xst))[1])] + modmat[, order := 1:.N] + + ## unique sets of covariates only + umodmat <- unique(modmat, by = setdiff(names(modmat), c("lex.dur","lex.Xst","order"))) + umodmat[, uni_id := rep(1:(nrow(umodmat)/n_ints), each=n_ints)] + + setkeyv(umodmat, setdiff(names(modmat), c("lex.dur","lex.Xst","order","uni_id"))) + setkeyv(modmat, setdiff(names(modmat), c("lex.dur","lex.Xst","order","uni_id"))) + + umodmat[, uni_n := umodmat[modmat, list(uni_n = .N/n_ints), by=uni_id]$uni_n] + + setkeyv(umodmat, c("uni_id", "order")) + mean_weights <- umodmat$uni_n + IDs <- umodmat$uni_id + + setcolsnull(umodmat, delete=c("order","uni_id","uni_n")) + + modmat <- stats::model.matrix(object, data=umodmat) + mmattrs <- attributes(modmat) + mmattrs$dimnames <- mmattrs$dim <- NULL + + + ## (unique covariate) subject-specific curve fits ---------------------------- + l <- split(data.table(modmat), IDs) + l <- lapply(l, as.matrix) + attrsetter <- function(obj) { + mostattributes(obj) <- c(attributes(obj), mmattrs) + obj + } + l <- lapply(l, attrsetter) + + epicumgetter <- function(x, ...) { + Epi::ci.cum(ctr.mat = x, ..., alpha = 1-0.95, Exp = TRUE, ci.Exp = TRUE) + } + + tab <- lapply(l, epicumgetter, obj=object, intl = fb$delta); rm(l) + + ## collate & compute relative survivals -------------------------------------- + tab <- lapply(tab, as.data.table) + tab <- rbindlist(tab) + setnames(tab, names(tab), c("est", "lo", "hi", "SE")) + tab[, FOT := fotlevs] + tab[, uni_id := IDs] + tab[, uni_w := mean_weights] + Haz2RS <- function(x) { + sum(exp(-x)*tab$uni_w)/n_matrows + } + tab <- tab[, lapply(list(est=est,lo=lo,hi=hi), `-`)] + tab <- tab[, lapply(list(est=est,lo=lo,hi=hi), exp)] + tab[, `:=`(est=est*mean_weights,lo=lo*mean_weights,hi=hi*mean_weights)] + tab[, FOT := fotlevs] + tab <- tab[, lapply(list(est=est, lo=lo, hi=hi), sum), by = FOT] + tab <- tab[, lapply(list(est=est, lo=lo, hi=hi), function(x){x/(n_matrows/n_ints)}), by = FOT] + + setkey(tab, FOT); setkey(fb, FOT) + tab <- fb[tab] + + ## disabled CI computation in 0.2.2 due to lack of testing & certainty of correctness + setcolsnull(tab, c("lo", "hi")) + + setattr(tab, "class", c("data.table", "data.frame")) + if (!return_DT()) setDFpe(tab) + tab[] +} + +#' @title Relative Poisson family object +#' @author Karri Seppa +#' @description A family object for GLM fitting of relative Poisson models +#' @format +#' A list very similar to that created by \code{poisson()}. +#' @export +#' @family relpois functions +RPL <- copy(poisson()) +RPL$link <- "glm relative survival model with Poisson error" +RPL$linkfun <- function(mu, d.exp) log(mu - d.exp) +RPL$linkinv <- function(eta, d.exp) d.exp + exp(eta) diff --git a/R/sir.R b/R/sir.R index 0a9fb30..67a291b 100644 --- a/R/sir.R +++ b/R/sir.R @@ -1,1450 +1,1450 @@ -#' @title Calculate SIR or SMR -#' @author Matti Rantanen, Joonas Miettinen -#' @description Poisson modelled standardised incidence or mortality ratios (SIRs / SMRs) i.e. -#' indirect method for calculating standardised rates. SIR is a ratio of observed and expected cases. -#' Expected cases are derived by multiplying the strata-specific population rate with the -#' corresponding person-years of the cohort. -#' -#' @details \code{sir} is a comprehensive tool for modelling SIRs/SMRs with flexible -#' options to adjust and print SIRs, test homogeneity and utilize -#' multi-state data. The cohort data and the variable names for observation -#' counts and person-years are required. -#' The reference data is optional, since the cohort data -#' can be stratified (\code{print}) and compared to total. -#' -#' -#' \strong{Adjust and print} -#' -#' A SIR can be adjusted or standardised using the covariates found in both \code{coh.data} and \code{ref.data}. -#' Variable to adjust are given in \code{adjust}. -#' Variable names needs to match in both \code{coh.data} and \code{ref.data}. -#' Typical variables to adjust by are gender, age group and calendar period. -#' -#' \code{print} is used to stratify the SIR output. In other words, the variables -#' assigned to \code{print} are the covariates of the Poisson model. -#' Variable levels are treated as categorical. -#' Variables can be assigned in both \code{print} and \code{adjust}. -#' This means the output it adjusted and printed by these variables. -#' -#' \code{print} can also be a list of expressions. This enables changing variable -#' names or transforming variables with functions such as \code{cut} and \code{round}. -#' For example, the existing variables \code{agegroup} and \code{year} could be -#' transformed to new levels using \code{cut} by -#' -#' \code{print = list( age.category = cut(agegroup, breaks = c(0,50,75,100)), -#' year.cat = cut(year, seq(1950,2010,20)))} -#' -#' -#' \strong{ref.rate or ref.obs & ref.pyrs} -#' -#' The population rate variable can be given to the \code{ref.rate} parameter. -#' That is, when using e.g. the \code{popmort} or a comparable data file, one may -#' supply \code{ref.rate} instead of \code{ref.obs} and \code{ref.pyrs}, which -#' will be ignored if \code{ref.rate} is supplied. -#' -#' -#' Note that if all the stratifying variables in -#' \code{ref.data} are not listed in \code{adjust}, -#' or when the categories are otherwise combined, -#' the (unweighted) mean of rates is used for computing expected cases. -#' This might incur a small bias in comparison to when exact numbers of observations -#' and person-years are available. -#' -#' -#' -#' \strong{mstate} -#' -#' E.g. using \code{lexpand} it's possible to compute counts for several outcomes -#' so that the population at risk is same for each -#' outcome such as a certain kind of cancer. -#' The transition counts are in wide data format, -#' and the relevant columns can be supplied to \code{sir} -#' in a vector via the \code{coh.obs} argument. -#' The name of the corresponding new column in \code{ref.data} is given in -#' \code{mstate}. It's recommended to include the \code{mstate} variable in \code{adjust}, -#' so the corresponding information should also be available in \code{ref.data}. -#' More examples in sir-vignette. -#' -#' This approach is analogous to where SIRs are calculated separately their -#' own function calls. -#' -#' -#' \strong{Other parameters} -#' -#' \code{univariate} confidence intervals are calculated using exact -#' Poisson intervals (\code{poisson.ci}). The options \code{profile} and \code{wald} are -#' is based on a Poisson regression model: profile-likelihood confidence intervals -#' or Wald's normal-approximation. P-value is Poisson model based \code{conf.type} -#' or calculated using the method described by Breslow and Day. Function automatically -#' switches to another \code{conf.type} if calculation is not possible with a message. -#' Usually model fit fails if there is print stratum with zero expected values. -#' -#' -#' The LRT p-value tests the levels of \code{print}. The test can be either -#' \code{"homogeneity"}, a likelihood ratio test where the model variables defined in -#' \code{print} (factor) is compared to the constant model. -#' Option \code{"trend"} tests if the linear trend of the continuous variable in -#' \code{print} is significant (using model comparison). -#' -#' -#' \strong{EAR: Excess Absolute Risk} -#' -#' Excess Absolute Risk is a simple way to quantify the absolute difference between cohort risk and -#' population risk. -#' Make sure that the person-years are calculated accordingly before using EAR. (when using mstate) -#' -#' Formula for EAR: -#' \deqn{EAR = \frac{observed - expected}{person years} \times 1000.}{EAR = (obs - exp)/pyrs * 1000.} -#' -#' \strong{Data format} -#' -#' The data should be given in tabulated format. That is the number of observations -#' and person-years are represented for each stratum. -#' Note that also individual data is allowed as long as each observations, -#' person-years, and print and adjust variables are presented in columns. -#' The extra variables and levels are reduced automatically before estimating SIRs. -#' Example of data format: -#' -#' \tabular{rrrrr}{ -#' sex \tab age \tab period \tab obs \tab pyrs \cr -#' 0 \tab 1 \tab 2010 \tab 0 \tab 390 \cr -#' 0 \tab 2 \tab 2010 \tab 5 \tab 385 \cr -#' 1 \tab 1 \tab 2010 \tab 3 \tab 308 \cr -#' 1 \tab 2 \tab 2010 \tab 12 \tab 315 -#' } -#' -#' -#' @param coh.data aggregated cohort data, see e.g. \code{\link{lexpand}} -#' @param coh.pyrs variable name for person years in cohort data; quoted or unquoted -#' @param coh.obs variable name for observed cases; quoted or unquoted. A vector when using \code{mstata}. -#' @param ref.data population data. Can be left NULL if \code{coh.data} is stratified in \code{print}. -#' @param ref.rate population rate variable (cases/person-years). Overwrites arguments -#' \code{ref.pyrs} and \code{ref.obs}; quoted or unquoted -#' @param ref.pyrs variable name for person-years in population data; quoted or unquoted -#' @param ref.obs variable name for observed cases; quoted or unquoted -#' @param subset logical condition to select data from \code{coh.data} before any computations -#' @param adjust variable names for adjusting without stratifying output; quoted vector or unquoted list -#' @param print variable names to stratify results; quoted vector or unquoted named list with functions -#' @param mstate set column names for cause specific observations; quoted or unquoted. Relevant only -#' when \code{coh.obs} length is two or more. See details. -#' @param test.type Test for equal SIRs. Test available are 'homogeneity' and 'trend'. -#' @param conf.type Confidence interval type: 'profile'(=default), 'wald' or 'univariate'. -#' @param conf.level Level of type-I error in confidence intervals, default 0.05 is 95\% CI. -#' @param EAR logical; TRUE calculates Excess Absolute Risks for univariate SIRs. -#' (see details) - -#' -#' @examples -#' data(popmort) -#' data(sire) -#' c <- lexpand( sire, status = status, birth = bi_date, exit = ex_date, entry = dg_date, -#' breaks = list(per = 1950:2013, age = 1:100, fot = c(0,10,20,Inf)), -#' aggre = list(fot, agegroup = age, year = per, sex) ) -#' ## SMR due other causes: status = 2 -#' se <- sir( coh.data = c, coh.obs = 'from0to2', coh.pyrs = 'pyrs', -#' ref.data = popmort, ref.rate = 'haz', -#' adjust = c('agegroup', 'year', 'sex'), print = 'fot') -#' se -#' ## for examples see: vignette('sir') -#' -#' -#' @seealso \code{\link{lexpand}} -#' \href{../doc/sir.html}{A SIR calculation vignette} -#' @family sir functions -#' @family main functions -#' -#' @return A sir-object that is a \code{data.table} with meta information in the attributes. -#' -#' @export -#' -#' @import data.table -#' @import stats - - - - -sir <- function( coh.data, - coh.obs, - coh.pyrs, - ref.data = NULL, - ref.obs = NULL, - ref.pyrs = NULL, ref.rate = NULL, - subset = NULL, - print = NULL, - adjust = NULL, - mstate = NULL, - test.type = 'homogeneity', - conf.type = 'profile', - conf.level = 0.95, - EAR = FALSE){ - - coh.data <- data.table(coh.data) - - ## subsetting--------------------------------------------------------------- - ## no copy taken of data! - subset <- substitute(subset) - subset <- evalLogicalSubset(data = coh.data, substiset = subset) - coh.data <- coh.data[subset,] - - - # print list -------------------------------------------------------------- - - # env1 <- environment() # set environment where to assign new print - # coh.data <- data_list(data = coh.data, arg.list = substitute(print), env = env1) - - mstate <- as.character(substitute(mstate)) - if(length(mstate) == 0) { - mstate <- NULL - } - if(!is.null(mstate)) { - coh.data[,(mstate) := 0L] - } - - # evalPopArg - coh.obs <- substitute(coh.obs) - c.obs <- evalPopArg(data = coh.data, arg = coh.obs) - coh.obs <- names(c.obs) - - coh.pyrs <- substitute(coh.pyrs) - c.pyr <- evalPopArg(data = coh.data, arg = coh.pyrs) - coh.pyrs <- names(c.pyr) - - print <- substitute(print) - c.pri <- evalPopArg(data = coh.data, arg = print) - print <- names(c.pri) - - adjust <- substitute(adjust) - c.adj <- evalPopArg(data = coh.data, arg = adjust) - adjust <- names(c.adj) - - # collect data - coh.data <- cbind(c.obs, c.pyr) - if(!is.null(print)) coh.data <- cbind(coh.data, c.pri) - if(!is.null(adjust)) coh.data <- cbind(coh.data, c.adj) - - if( !is.null(ref.data) ){ - ref.obs <- as.character(substitute(ref.obs)) - ref.pyrs <- as.character(substitute(ref.pyrs)) - ref.rate <- as.character(substitute(ref.rate)) - - if (length(ref.obs) == 0) ref.obs <- NULL - if (length(ref.pyrs) == 0) ref.pyrs <- NULL - if (length(ref.rate) == 0) ref.rate <- NULL - } - - - # print(coh.data) - - st <- sir_table( coh.data = coh.data, - coh.obs = coh.obs, - coh.pyrs = coh.pyrs, - ref.data = ref.data, - ref.obs = ref.obs, - ref.pyrs = ref.pyrs, - ref.rate = ref.rate, - print = print, - adjust = adjust, - mstate = mstate) - - results <- sir_est( table = st, - print = print, - adjust = adjust, - conf.type = conf.type, - test.type = test.type, - conf.level = conf.level, - EAR = EAR) - - ## final touch --------------------------------------------------------------- - - - #setDT(data) - if (!return_DT()) { - for (i in 1:3) { - if (!is.null(results[[i]])) { - setDFpe(results[[i]]) - } - } - } - - data <- copy(results[[2]]) - setattr(data, name = 'sir.meta', value = list(adjust = adjust, - print = print, - call = match.call(), - lrt.test= results$'lrt.test', - conf.type = results$'conf.type', - conf.level = conf.level, - lrt.test.type = results$'test.type', - pooled.sir = results[[1]])) - setattr(data, "class", c("sir", "data.table", "data.frame")) - return(data) -} - - -#' @title Estimate splines for SIR or SMR -#' @author Matti Rantanen, Joonas Miettinen -#' -#' @description Splines for standardised incidence or mortality ratio. A useful -#' tool to e.g. check whether a constant SIR can be assumed for all calendar periods, -#' age groups or follow-up intervals. Splines can be fitted for these time dimensions -#' separately or in the same model. -#' -#' @param coh.data cohort data with observations and at risk time variables -#' @param coh.pyrs variable name for person-years in cohort data -#' @param coh.obs variable name for observed cases -#' @param ref.data aggregated population data -#' @param ref.rate population rate observed/expected. This overwrites the parameters -#' \code{ref.pyrs} and \code{ref.obs}. -#' @param ref.pyrs variable name for person-years in population data -#' @param ref.obs variable name for observed cases -#' @param subset logical condition to subset \code{coh.data} before any computations -#' @param adjust variable names for adjusting the expected cases -#' @param print variable names for which to estimate SIRs/SMRs and -#' associated splines separately -#' @param mstate set column names for cause specific observations. Relevant only -#' when coh.obs length is two or more. See help for \code{sir}. -#' @param spline variable name(s) for the splines -#' @param knots number knots (vector), pre-defined knots (list of vectors) or for optimal number of knots left NULL -#' @param dependent.splines logical; if TRUE, all splines are fitted in same model. -#' @param reference.points fixed reference values for rate ratios. If left \code{NULL} -#' the smallest value is the reference point (where SIR = 1). -#' Ignored if \code{dependent.splines = FALSE} -#' -#' -#' @details -#' -#' See \code{\link{sir}} for help on SIR/SMR estimation in general; usage of splines -#' is discussed below. -#' -#' \strong{The spline variables} -#' -#' The model can include one, two or three splines variables. -#' Variables can be included in the same model selecting \code{dependent.splines = TRUE} -#' and SIR ratios are calculated (first one is the SIR, others SIR ratios). -#' Reference points vector can be set via \code{reference.points} -#' where first element of the vector is the reference point for first ratio. -#' -#' Variable(s) to fit splines are given as a vector in argument \code{spline}. -#' Order will affect the results. -#' -#' -#' \strong{dependent.splines} -#' -#' By default dependent.splines is FALSE and all splines are fitted in separate models. -#' If TRUE, the first variable in \code{spline} is a function of a SIR and other(s) are ratios. -#' -#' \strong{knots} -#' -#' There are three options to set knots to splines: -#' -#' Set the number of knots for each spline variable with a \strong{vector}. -#' The knots are automatically placed to the quantiles of observed cases in cohort data. -#' The first and last knots are always the maximum and minimum values, so knot -#' value needs to be at least two. -#' -#' Predefined knot places can be set with a \strong{list} of vectors. -#' The vector for each spline in the list specifies the knot places. The lowest -#' and the largest values are the boundary knots and these should be checked beforehand. -#' -#' If \code{knots} is left \strong{NULL}, the model searches the optimal number -#' of knots by model AIC by fitting models iteratively from 2 to 15 knots and -#' the one with smallest AIC is selected. -#' If \code{dependent.splines = TRUE}, the number of knots is searched by fitting each spline -#' variable separately. -#' -#' -#' \strong{print} -#' -#' Splines can be stratified by the levels of variable given in \code{print}. If -#' \code{print} is a vector, only the first variable is accounted for. The knots -#' are placed globally for all levels of \code{print}. This also ensures that the likelihood -#' ratio test is valid. -#' Splines are also fitted independently for each level of \code{print}. -#' This allows for searching interactions, e.g. by fitting spline for period -#' (\code{splines='period'}) for each age group (\code{print = 'agegroup'}). -#' -#' -#' \strong{p-values} -#' -#' The output p-value is a test of whether the splines are equal (homogenous) -#' at different levels of \code{print}. -#' The test is based on the likelihood ratio test, where the full model -#' includes \code{print} and is -#' compared to a null model without it. -#' When \code{(dependent.splines = TRUE)} the p-value returned is a global p-value. -#' Otherwise the p-value is spline-specific. -#' -#' -#' @return A list of data.frames and vectors. -#' Three spline estimates are named as \code{spline.est.A/B/C} and the corresponding values -#' in \code{spline.seq.A/B/C} for manual plotting -#' -#' -#' @seealso \code{\link{splitMulti}} -#' \href{../doc/sir.html}{A SIR calculation vignette} -#' @family sir functions -#' @family main functions -#' -#' @export sirspline -#' @import data.table -#' @import splines -#' @import stats -#' -#' @examples \dontrun{ -#' ## for examples see: vignette('sir') -#' } - -sirspline <- function( coh.data, - coh.obs, - coh.pyrs, - ref.data = NULL, - ref.obs = NULL, - ref.pyrs = NULL, - ref.rate = NULL, - subset = NULL, - print = NULL, - adjust = NULL, - mstate = NULL, - spline, - knots = NULL, - reference.points = NULL, - dependent.splines = TRUE){ - - coh.data <- data.table(coh.data) - - ## subsetting----------------------------------------------------------------- - ## no copy taken of data! - subset <- substitute(subset) - subset <- evalLogicalSubset(data = coh.data, substiset = subset) - coh.data <- coh.data[subset,] - - # print list -------------------------------------------------------------- - - env1 <- environment() - coh.data <- data_list(data = coh.data, arg.list = substitute(print), env = env1) - - mstate <- as.character(substitute(mstate)) - if(length(mstate) == 0) { - mstate <- NULL - } - if(!is.null(mstate)) { - coh.data[,(mstate) := 0L] - } - - # evalPopArg - - spline <- substitute(spline) - c.spl <- evalPopArg(data = coh.data, arg = spline) - spline <- names(c.spl) - - coh.obs <- substitute(coh.obs) - c.obs <- evalPopArg(data = coh.data, arg = coh.obs) - coh.obs <- names(c.obs) - - coh.pyrs <- substitute(coh.pyrs) - c.pyr <- evalPopArg(data = coh.data, arg = coh.pyrs) - coh.pyrs <- names(c.pyr) - - print <- substitute(print) - c.pri <- evalPopArg(data = coh.data, arg = print) - print <- names(c.pri) - - adjust <- substitute(adjust) - c.adj <- evalPopArg(data = coh.data, arg = adjust) - adjust <- names(c.adj) - - # collect data - coh.data <- cbind(c.obs, c.pyr, c.spl) - if(!is.null(print)) { - coh.data <- cbind(coh.data, c.pri[, print[!print %in% spline], with=FALSE]) - } - if(!is.null(adjust)) { - coh.data <- cbind(coh.data, c.adj[, adjust[!adjust %in% spline], with=FALSE]) - } - - if( !is.null(ref.data) ){ - ref.obs <- as.character(substitute(ref.obs)) - ref.pyrs <- as.character(substitute(ref.pyrs)) - ref.rate <- as.character(substitute(ref.rate)) - - if (length(ref.obs) == 0) ref.obs <- NULL - if (length(ref.pyrs) == 0) ref.pyrs <- NULL - if (length(ref.rate) == 0) ref.rate <- NULL - } - - st <- sir_table( coh.data = coh.data, - coh.obs = coh.obs, - coh.pyrs = coh.pyrs, - ref.data = ref.data, - ref.obs = ref.obs, - ref.pyrs = ref.pyrs, ref.rate = ref.rate, - print = print, - adjust = adjust, - mstate = mstate, - spline = spline) - - results <- sir_spline( table = st, - print = print, - adjust = adjust, - spline = spline, - knots = knots, - reference.points = reference.points, - dependent.splines = dependent.splines) - - setclass(results, c('sirspline', 'pe', class(results))) - return(results) -} - - - - - - -# Input: two data.table:s -# output: one data.table including rates -#' @import stats -#' @import data.table -sir_table <- function( coh.data, - coh.obs, - coh.pyrs, - ref.data = NULL, - ref.obs = NULL, - ref.pyrs = NULL, - ref.rate = NULL, - print = NULL, - adjust = NULL, - spline = NULL, - mstate = NULL) { - - - # initial checks ------------------------------------------------- - - if(is.null(ref.data)) { - if(is.null(print)){ - stop('Both ref.data and print cannot be NULL.') - } - ref.data <- data.table(coh.data) - ref.obs <- coh.obs - ref.pyrs <- coh.pyrs - } - - coh.data <- data.table(coh.data) - ref.data <- data.table(ref.data) - - vl <- unique( c(coh.pyrs, coh.obs, adjust, print) ) - if( !is.null(mstate) ) { - vl <- vl[which( vl != mstate )] - } - all_names_present(coh.data, vl ) - - if ( !is.null(ref.pyrs) & !is.null(ref.obs) ) { - all_names_present(ref.data, c(ref.pyrs, ref.obs, adjust)) - } - - # Melt lexpand data ------------------------------------------------------- - - if( length(coh.obs) > 1 ) { - if( is.null(mstate) ){ - stop('coh.obs length is > 1. Set variable name for mstate.') - } - if( !mstate %in% names(ref.data) ){ - warning('mstate variable name does not match names in ref.data.') - } - - aggre <- unique(c(adjust, print, spline, coh.pyrs)) - aggre <- aggre[which(aggre != mstate)] - - coh.data <- melt( data = coh.data, id.vars = aggre, measure.vars = coh.obs, - value.name = 'coh.observations', - variable.name = mstate, variable.factor = FALSE) - coh.obs <- 'coh.observations' - - # parse Y name form string 'formXtoY' - q <- quote( - robust_values(substr(get(mstate), - start = regexpr( pattern = 'to', text = get(mstate) ) + 2, - stop = nchar(x = get(mstate) ))) - ) - coh.data[,(mstate) := eval(q) ] - - if( !(mstate %in% adjust)) { - warning('Consider including mstate variable also in adjust. See help(sir) for details.') - } - } - - # prepare data steps, reduce dimensions ----------------------------------- - - setnames(coh.data, c(coh.obs, coh.pyrs), c('coh.observations','coh.personyears')) - - - coh.data <- expr.by.cj(data = coh.data, - by.vars = unique( sort(c(adjust, print, spline)) ), - expr = list(coh.observations = sum(coh.observations), - coh.personyears = sum(coh.personyears))) - #coh.data <- na2zero(coh.data) - #coh.data <- na.omit(coh.data) - - coh.data[is.na(coh.observations), coh.observations := 0] - coh.data[is.na(coh.personyears), coh.personyears := 0] - coh.data <- na.omit(coh.data) - - # rates - if( !is.null(ref.rate) ){ - setnames(ref.data, ref.rate, 'ref.rate') - ref.data <- expr.by.cj(data = ref.data, by.vars = c(adjust), - expr = list(ref.rate = mean(ref.rate))) - } else { - setnames(ref.data, c(ref.obs, ref.pyrs), c('ref.obs','ref.pyrs')) - ref.data <- expr.by.cj(data = ref.data, by.vars = c(adjust), - expr = list(ref.obs = sum(ref.obs), - ref.pyrs= sum(ref.pyrs))) - ref.data[, ref.rate := ref.obs / ref.pyrs ] - } - - # Merge - sir.table <- merge(coh.data, ref.data, by=c(adjust), all.x=TRUE) - sir.table[, expected := ref.rate * coh.personyears] - sir.table <- na2zero(sir.table) - - if ( !is.null(print) | !is.null(spline)){ - sir.table <- sir.table[ ,list(observed = sum(coh.observations), - expected = sum(expected), - pyrs = sum(coh.personyears)), - by = c(unique(c(print, spline)))] - setkeyv(sir.table, c(print, spline)) - } - else { - sir.table <- sir.table[ ,list(observed = sum(coh.observations), - expected = sum(expected), - pyrs = sum(coh.personyears))] - } - return(sir.table) -} - - - - -# Input: sir.table -# Output: list of data.tables and values -sir_est <- function( table, - print = NULL, - adjust = NULL, - EAR = FALSE, - test.type = 'homogeneity', - conf.level = 0.95, - conf.type = 'profile') { - pyrs <- NULL ## APPEASE R CMD CHECK - setDT(table) - - if(!is.numeric(conf.level) | conf.level > 1) { - stop('Confidence level must be a numeric value between 0-1') - } - # function to SIR p-value - chi.p <- function(o, e) { - pchisq( ( (abs(o - e) - 0.5)^2)/e, df=1, lower.tail=FALSE) - } - - # total sir - combined <- data.table(table)[,list(observed = sum(observed), - expected = sum(expected), - pyrs = sum(pyrs))] - combined[ ,':='(sir = observed/expected, - sir.lo = poisson.ci(observed, expected, conf.level=conf.level)[,4], - sir.hi = poisson.ci(observed, expected, conf.level=conf.level)[,5], - p_value = chi.p(observed, expected))] - - # Poisson regression ------------------------------------------------------ - - # write model formula - fa <- a <- NULL - sir.formula <- paste('observed ~ 1') - if(!is.null(print)){ - fa <- rev(print) # fa <- print - - # drop variables with only one value - u <- c(t(table[, lapply(.SD, uniqueN), .SDcols = fa])) - if (length(u[u==1]) > 0){ - message('Variable "', paste(fa[which(u==1)], collapse = '","'),'" (has only one level) removed from model.') - fa <- fa[-which(u==1)] - } - if(length(fa)>0){ - # model formula - a <- paste0('as.factor(',paste( fa, collapse = '):as.factor('),')') - sir.formula <- paste('observed ~ 0 +', a) - } - } - # fit model if possible ----------------------------------------------------- - - fit <- tryCatch(do.call("glm", list(formula = terms(as.formula(sir.formula), keep.order = FALSE), - offset = log(table[,expected]), - data = table, family = poisson(log))), - error=function(f) NULL ) - - if(!is.null(fit)) eg <- expand.grid(fit$xlevels) # for further testing - - - # LRT test (homogeneity or trend) -------------------------------------------- - - test.type <- match.arg(test.type, c('homogeneity','trend')) - - lrt_sig <- NULL - if( sir.formula != 'observed ~ 1' & !is.null(fit) ) { - if (test.type == 'homogeneity') covariates <- a - if (test.type == 'trend') covariates <- paste(print, collapse=' + ') - - fit_full <- tryCatch( - do.call("glm", list(formula = terms(as.formula( paste0('observed ~ 1 + ', a) )), - offset = log(table[,expected]), - data = table, family=poisson(log))), - error=function(f) NULL ) - - fit_null <- tryCatch( - do.call("glm", list(formula = terms(as.formula('observed ~ 1') ), - offset = log(table[,expected]), - data = table, family=poisson(log))), - error=function(f) NULL ) - - if (!is.null(fit_full)){ - lrt <- anova(fit_full, fit_null, test = 'Chisq') - lrt_sig <- lrt[['Pr(>Chi)']][2] - } - } - - # confidence intervals ---------------------------------------------------- - - conf.type <- match.arg(conf.type, c('wald','profile','univariate')) - ci.info <- NULL - ci <- NULL - - if (is.null(fit) & conf.type %in% c('wald','profile')) { - conf.type <- 'univariate' - ci.info <- 'Model fitting failed. Univariate confidence intervals selected.' - if(any(table$expected == 0)) { - ci.info <- paste(ci.info, '(zero values in expected)') - } - } - - if (conf.type == 'profile') { - - confint_glm <- function(object, parm, level = 0.95, trace = FALSE, ...) { - pnames <- names(coef(object)) - if (missing(parm)) { - parm <- seq_along(pnames) - } - else if (is.character(parm)) { - parm <- match(parm, pnames, nomatch = 0L) - } - object <- profile(object, which = parm, alpha = (1 - level)/4, trace = trace) - confint(object, parm = parm, level = level, trace = trace, ...) - } - - ci <- suppressMessages( suppressWarnings( - tryCatch(exp(confint_glm(fit, level=conf.level)), error=function(e) NULL ) - )) - if(!is.null(ci)) { - ci <- as.data.table(ci) - if (is.null(print) | length(fa)==0) ci <- data.table(t(ci)) # transpose if only one row - } else { - conf.type <- 'wald' - ci.info <- 'Could not solve profile-likelihood. Wald confidence intervals selected.' - } - } - - if (conf.type == 'wald') { - ci <- data.table( exp(confint.default(fit)) ) - } - - if(conf.type == 'univariate') { - ci <- data.table(poisson.ci(table$observed, table$expected, conf.level = conf.level))[,.(lower, upper)] - pv <- chi.p(table$observed, table$expected) - } else { - pv <- as.vector(summary(fit)$coef[, "Pr(>|z|)"]) - } - if(!is.null(ci.info)) message(ci.info) - - # collect results ----------------------------------------------------- - - setnames(ci, 1:2, c('sir.lo','sir.hi')) - - table[, ':=' ( sir = observed/expected, - sir.lo = ci[, sir.lo], - sir.hi = ci[, sir.hi], - p_value = round(pv,5))] - - - # Round results ----------------------------------------------------------- - - cols1 <- c('sir','sir.lo','sir.hi','expected','pyrs') - - table[,(cols1) := lapply(.SD, round, digits=4), .SDcols=cols1] - combined[,(cols1) := lapply(.SD, round, digits=4), .SDcols=cols1] - - - # tests ----------------------------------- - - if (table[!is.na(sir) & (sir < sir.lo | sir > sir.hi), .N] > 0) { - warning('There is something wrong with confidence intervals') - } - if (table[!is.na(sir.lo) & !is.na(sir.hi)][sir.lo > sir.hi, .N] > 0) { - warning('CIs might be incorrect') - } - - if(!is.null(fit) & length(fa)>0) { - # pseudo test if the modelled confidence intervals are merged correctly: - t1 <- copy(table)[,lapply(.SD, factor),.SDcols = fa] - if(any(t1 != data.table(eg))) { - message('CIs levels might not match. Contact the package maintainer and use univariate CIs.') - } - } - - # EAR ----------------------------------------------------------------- - if (EAR) { - table[,EAR := round((observed - expected)/pyrs * 1000, 3)] - } - - - results <- list(total = combined, - table = table, - adjusted = adjust, - lrt.test = lrt_sig, - test.type = test.type, - conf.type = conf.type, - ci.info = ci.info) - return(results) -} - - -#' @export -getCall.sir <- function (x, ...) { - attributes(x)$sir.meta$call -} - - -# Input: sir.table -# Output: estimates and sequences for plotting splines -#' @import splines -#' @import data.table -#' @import stats -sir_spline <- function( table, - print = NULL, - adjust = NULL, - spline, - knots = NULL, - reference.points = NULL, - dependent.splines = TRUE){ - knts <- - spline.seq.A <- - spline.seq.B <- - spline.seq.C <- - spline.est.A <- - spline.est.B <- - spline.est.C <- NULL - - if (!is.null(knots) & length(knots) != length(spline) ) { - stop('Arguments spline and knots has to be same length.') - } - - - # Spline functions ------------------------------------------------------- - - # function to get spline seq - spline.seq <- function(data, spline.var=NULL) { - # palauttaa jotaina - if(is.na(spline.var)) { - return(NULL) - } - spline.seq <- seq( min( data[,get(spline.var)] ), - max( data[,get(spline.var)] ), length.out = 100) - return(spline.seq) - } - - # function to search optimal number of knots by AIC - spline.knots <- function(data, knots = NULL, spline.vars = NULL){ - # search optimal number of knots - if( is.null(knots) ) { - knts <- list() - for (jj in 1:length(spline.vars)) { - # reduce data to fit model - data0 <- data[,list(observed=sum(observed), expected = sum(expected)), by = eval(spline.vars[jj])] - data0 <- data0[expected > 0] - spline.fit <- glm(observed ~ 1, offset=log(expected), family=poisson(log), data = data0) - aic0 <- summary(spline.fit)[['aic']] - limit <- 20 - ii <- 2 - while( ii < limit ){ - tmp.knots <- ii - knts[jj] <- list( data0[ ,quantile( rep(get(spline.vars[jj]),observed), probs = seq(0,100,length.out = tmp.knots)/100)] ) - spline.fit <- glm(observed ~ Ns(get(spline.vars[jj]), knots = knts[[jj]]), offset=log(expected), family=poisson(log), data=data0) - aic0 <- c(aic0, summary(spline.fit)[['aic']]) - ii <- ii + 1 - } - tmp.knots <- which(aic0 == min(aic0))[1] - if(tmp.knots == 1) { - message(paste0('Null model better than spline in ', jj)) - tmp.knots <- 2 - } - knts[jj] <- list(data0[ ,quantile( rep(get(spline.vars[jj]),observed), probs = seq(0,100,length.out = tmp.knots)/100)]) - rm(tmp.knots) - } - knots <- unlist(lapply(knts, length)) - } - else { - # knot predefined - if( is.list(knots) ){ - knts <- knots - knots <- unlist(lapply(knots, length)) - } - # knot number predefined - else { - if( any(knots < 2) ) { - message('Min knots number set to 2.') - knots[knots < 2] <- 2 - } - knts <- list() - for(i in 1:length(knots)) { - knts[i] <- list( data[ ,quantile( rep(get(spline.vars[i]), observed), probs = seq(0,100,length.out = knots[i])/100)]) - } - } - } - names(knts) <- spline.vars - return(knts) - } - - # function to estimate 2-3 dim splines in same model - spline.estimates.dep <- function(sir.spline = sir.spline, - spline.seq.A = spline.seq.A, - spline.seq.B = spline.seq.B, - spline.seq.C = spline.seq.C, - reference.points = reference.points, - knts = knts - ){ - - if( all(!is.null(reference.points), (length(reference.points) + 1) != length(spline)) ){ - stop('Parameter reference.points length should be length of spline - 1.') - } - - - form <- 'Ns(get(spline[[1]]), kn=knts[[1]])' - nsA <- Ns( spline.seq.A, knots = knts[[1]]) - if ( length(spline) >= 2) { - form <- paste0(form, ' + Ns(get(spline[[2]]), kn=knts[[2]])') - nsB <- Ns( spline.seq.B, knots = knts[[2]]) - } - if ( length(spline) == 3) { - form <- paste0(form, ' + Ns(get(spline[[3]]), kn=knts[[3]])') - nsC <- Ns( spline.seq.C, knots = knts[[3]]) - } - - form <- paste0('observed ~ ', form) - spline.fit <- do.call("glm", list(formula = as.formula(form), - offset = log(sir.spline[expected > 0,expected]), - family = poisson, - data = sir.spline[expected>0])) - if( any( ci.exp(spline.fit)[,1] == 1) ){ - message("NA's in spline estimates.") - } - - aic <- summary(spline.fit)[['aic']] - - rf.C <- rf.B <- NA - # set assigned reference points or get minimum values - if( !is.null(reference.points) ) { - rf.B <- reference.points[1] - rf.C <- reference.points[2] - } - else { - rf.B <- min( sir.spline[,get(spline[2])] ) - if(!is.na(spline[3])) { - rf.C <- min( sir.spline[,get(spline[3])] ) - } - } - - if( !is.na(rf.B) ) { - B <- Ns( rep(rf.B, 100), knots = knts[[2]]) - if( findInterval(rf.B, range(sir.spline[,get(spline[2])])) != 1 ) { - message("WARNING: reference point 2 doesn't fall into spline variable interval") - } - } - - if( !is.na(rf.C) ){ - C <- Ns( rep(rf.C, 100), knots = knts[[3]]) - if( findInterval(rf.C, range(sir.spline[,get(spline[3])])) != 1) { - message("WARNING: reference point 3 doesn't fall into spline variable interval") - } - } - - # make subset of model parameters - if( !is.null(knts[2]) ) { - sub.B <- which( grepl('spline[[2]]', names(spline.fit$coefficients),fixed = TRUE) ) - } - if( !is.null(knts[3]) ) { - sub.C <- which( grepl('spline[[3]]', names(spline.fit$coefficients),fixed = TRUE) ) - } - if ( length(spline) == 2) { - spline.est.A <- ci.exp(spline.fit, ctr.mat = cbind(1, nsA, nsB)) - spline.est.B <- ci.exp(spline.fit, subset = sub.B, ctr.mat = nsB - B) - spline.est.C <- NULL - } - if ( length(spline) == 3) { - spline.est.A <- ci.exp(spline.fit, ctr.mat = cbind(1, nsA, nsB, nsC)) - spline.est.B <- ci.exp(spline.fit, subset= sub.B, ctr.mat = nsB - B) - spline.est.C <- ci.exp(spline.fit, subset= sub.C, ctr.mat = nsC - C) - } - list(a = spline.est.A, - b = spline.est.B, - c = spline.est.C) - } - - # function to estimate independet splines - spline.estimates.uni <- function(data, spline.var, spline.seq, knots, knum) { - if(is.na(spline.var)) return(NULL) - knots <- knots[[knum]] - data <- data[,list(observed=sum(observed), expected = sum(expected)), by = eval(spline.var)][expected > 0] - spline.uni <- glm(observed ~ Ns(get(spline.var), knots = knots), offset=log(expected), family=poisson(log), data = data) - nsx <- Ns( spline.seq, knots = knots) - spline.est <- ci.exp(spline.uni, ctr.mat = cbind(1, nsx)) - spline.est - } - - - - # Poisson regression Splines ------------------------------------------------- - - sir.spline <- data.table(table) - - # convert spline variables to numeric - temp.fun <- function(x){ - as.numeric(as.character(x)) - } - sir.spline[, (spline) := lapply(.SD, temp.fun), .SDcols = spline] - - - - # set knots - knts <- spline.knots(data=sir.spline, knots = knots, spline.vars = spline) - - # set sequences - spline.seq.A <- spline.seq(data=sir.spline, spline.var=spline[1]) - spline.seq.B <- spline.seq(data=sir.spline, spline.var=spline[2]) - spline.seq.C <- spline.seq(data=sir.spline, spline.var=spline[3]) - - if( length(spline) == 1 ) { - dependent.splines <- FALSE - } - - # convert print to factor - print <- print[1] - - # loop for each level of print: - if( !is.null(print) ) { - prnt.levels <- sir.spline[,unique( get(print) )] - sir.spline[,(print) := factor(get(print))] - } - else { - print <- 'temp' - sir.spline[,temp := 1] - prnt.levels <- 1 - } - - spline.est.A <- NULL - spline.est.B <- NULL - spline.est.C <- NULL - - for(i in prnt.levels){ - if( dependent.splines ) { - out <- spline.estimates.dep(sir.spline = sir.spline[get(print) == i], - spline.seq.A = spline.seq.A, - spline.seq.B = spline.seq.B, - spline.seq.C = spline.seq.C, - reference.points = reference.points, - knts = knts) - est.A <- out[['a']] - est.B <- out[['b']] - est.C <- out[['c']] - } - else{ - est.A <- spline.estimates.uni(data = sir.spline[get(print) == i], spline.var = spline[1], spline.seq = spline.seq.A, knots = knts, knum = 1) - est.B <- spline.estimates.uni(data = sir.spline[get(print) == i], spline.var = spline[2], spline.seq = spline.seq.B, knots = knts, knum = 2) - est.C <- spline.estimates.uni(data = sir.spline[get(print) == i], spline.var = spline[3], spline.seq = spline.seq.C, knots = knts, knum = 3) - } - - add_i <- function(est.x, i){ - if(is.null(est.x)) { - return(NULL) - } - cbind(i, data.frame(est.x)) - } - - - est.A <- add_i(est.A, i) - est.B <- add_i(est.B, i) - est.C <- add_i(est.C, i) - - spline.est.A <- rbind(spline.est.A, est.A) - spline.est.B <- rbind(spline.est.B, est.B) - spline.est.C <- rbind(spline.est.C, est.C) - } - - # get p-value and anova-table - anovas <- NULL - p <- NULL - if(dependent.splines) { - form.a <- 'Ns(get(spline[[1]]), kn=knts[[1]]) + Ns(get(spline[[2]]), kn=knts[[2]])' - form.b <- 'get(print):Ns(get(spline[[1]]), kn=knts[[1]]) + get(print):Ns(get(spline[[2]]), kn=knts[[2]])' - if ( length(spline) == 3) { - form.a <- paste0(form.a, ' + Ns(get(spline[[3]]), kn=knts[[3]])') - form.b <- paste0(form.b, ' + get(print):Ns(get(spline[[3]]), kn=knts[[3]])') - } - - fit.fun <- function( form.string ){ - do.call("glm", list(formula = as.formula( form.string ), - offset = log(sir.spline[expected > 0,expected]), - family = poisson, - data = sir.spline[expected>0])) - } - - fit.1 <- fit.fun( paste0('observed ~ ', form.a) ) - fit.2 <- fit.fun( paste0('observed ~ ', 'get(print)+', form.a)) - fit.3 <- fit.fun( paste0('observed ~ ', form.b)) - fit.4 <- fit.fun( paste0('observed ~ ', 'get(print)+', form.b) ) - - global.p<- anova(fit.4, fit.1, test='LRT') - level.p <- anova(fit.2, fit.1, test='LRT') - #shape.p <- anova(fit.4, fit.3, test='LRT') - - anovas <- list(global.p = global.p, level.p = level.p) - p <- rbind(global.p[['Pr(>Chi)']][2], level.p[['Pr(>Chi)']][2]) # , shape.p, - } - else { - lrt.uni <- function(data=sir.spline, spline.var=spline[1], print=print, knots=knts, knum = 1) { - if (is.na(spline.var)) return (NULL) - data <- data.table(data) - knots <- knots[[knum]] - fit0 <- glm(observed ~ get(print)+Ns(get(spline.var), knots = knots), offset=log(expected), family=poisson(log), data = data[expected>0]) - fit1 <- glm(observed ~ Ns(get(spline.var), knots = knots), offset=log(expected), family=poisson(log), data = data[expected>0]) - fit2 <- glm(observed ~ get(print)*Ns(get(spline.var), knots = knots), offset=log(expected), family=poisson(log), data = data[expected>0]) - anova(fit2,fit1,fit0, test='Chisq') # [['Pr(>Chi)']][2] - } - - var1.p <- lrt.uni(spline.var = spline[1], print=print, knots=knts, knum = 1) - var2.p <- lrt.uni(spline.var = spline[2], print=print, knots=knts, knum = 2) - var3.p <- lrt.uni(spline.var = spline[3], print=print, knots=knts, knum = 3) - - p <- list(spline.a = var1.p[['Pr(>Chi)']][2], - spline.b = var2.p[['Pr(>Chi)']][2], - spline.c = var3.p[['Pr(>Chi)']][2]) - anovas <- list(spline.a = var1.p, spline.b = var2.p, spline.c = var3.p) - } - - output <- list( spline.est.A = spline.est.A, - spline.est.B = spline.est.B, - spline.est.C = spline.est.C, - spline.seq.A = spline.seq.A, - spline.seq.B = spline.seq.B, - spline.seq.C = spline.seq.C, - adjust = adjust, - print = print, - spline = spline, - anovas = anovas, - knots = knts, - spline.dependent = dependent.splines, - p.values = p) - output -} - -# input data and argument list. replaces print in upper environment with name a vector. -data_list <- function( data, arg.list, env ) { - if(missing(env)){ - arg.list <- substitute(arg.list) - env <- parent.frame() - } - d <- data.table(data) - - l <- eval(arg.list, envir = d, enclos = parent.frame()) - - if( is.list( l ) ) { - n <- intersect(names(l), names(d)) - if(length(n)>0){ - d[,(n) := NULL] - } - # if(is.null(names(l))) { - # v <- 1:length(l) - # setnames(l, v, paste0('V', v)) - # } - l <- as.data.table(l) - l <- data.table(l) - assign('print', colnames(l), envir = env) # set names to parent environment - if( ncol(d) > 0) { - l <- data.table(d, l) - } - return(l) - } else { - return(data) - } -} - -#' @export -coef.sir <- function(object, ...) { - factors <- attr(object, 'sir.meta')$print - - q <- paste("paste(",paste(factors,collapse=","),", sep = ':')") - q <- parse(text=q) - n <- object[,eval(q)] - - res <- object$sir - attr(res, 'names') <- n - - res -} - - - - -#' @export -confint.sir <- function(object, parm, level = 0.95, conf.type = 'profile', - test.type = 'homogeneity', ...) { - - meta <- attr(object, 'sir.meta') - object <- copy(object) - object <- sir_est(table = object, - print = meta$print, - adjust = NULL, - conf.type = conf.type, - test.type = test.type, - conf.level = level, - EAR = FALSE) - object <- object$table - q <- paste("paste(",paste(meta$print,collapse=","),", sep = ':')") - q <- parse(text=q) - n <- object[,eval(q)] - - res <- cbind(object$sir.lo, object$sir.hi) - - rownames(res) <- n - colnames(res) <- paste( c( (1-level)/2*100, (1 - (1-level)/2)*100), '%') - - res -} - - -#' @title Calculate SMR -#' @author Matti Rantanen -#' @description Calculate Standardized Mortality Ratios (SMRs) using -#' a single data set that includes -#' observed and expected cases and additionally person-years. -#' -#' @details These functions are intended to calculate SMRs from a single data set -#' that includes both observed and expected number of cases. For example utilizing the -#' argument \code{pop.haz} of the \code{\link{lexpand}}. -#' -#' \code{sir_lex} automatically exports the transition \code{fromXtoY} using the first -#' state in \code{lex.Str} as \code{0} and all other as \code{1}. No missing values -#' is allowed in observed, pop.haz or person-years. -#' -#' @param x Data set e.g. \code{aggre} or \code{Lexis} object -#' (see: \code{\link{lexpand}}) -#' @param obs Variable name of the observed cases in the data set -#' @param exp Variable name or expression for expected cases -#' @param pyrs Variable name for person-years (optional) -#' @param print Variables or expression to stratify the results -#' @param test.type Test for equal SIRs. Test available are 'homogeneity' and 'trend' -#' @param conf.level Level of type-I error in confidence intervals, default 0.05 is 95\% CI -#' @param conf.type select confidence interval type: (default=) `profile`, `wald`, `univariate` -#' @param subset a logical vector for subsetting data -#' -#' @seealso \code{\link{lexpand}} -#' \href{../doc/sir.html}{A SIR calculation vignette} -#' @family sir functions -#' -#' @return A sir object -#' -#' @examples -#' -#' \dontrun{ -#' BL <- list(fot = 0:5, per = c("2003-01-01","2008-01-01", "2013-01-01")) -#' -#' ## Aggregated data -#' x1 <- lexpand(sire, breaks = BL, status = status != 0, -#' birth = bi_date, entry = dg_date, exit = ex_date, -#' pophaz=popmort, -#' aggre=list(sex, period = per, surv.int = fot)) -#' sir_ag(x1, print = 'period') -#' -#' -#' # no aggreate or breaks -#' x2 <- lexpand(sire, status = status != 0, -#' birth = bi_date, entry = dg_date, exit = ex_date, -#' pophaz=popmort) -#' sir_lex(x2, breaks = BL, print = 'per') -#' } -#' -#' @import data.table -#' @import stats -#' @export -sir_exp <- function(x, obs, exp, pyrs=NULL, print = NULL, - conf.type = 'profile', test.type = 'homogeneity', - conf.level = 0.95, subset = NULL) { - - # subsetting - subset <- substitute(subset) - subset <- evalLogicalSubset(data = x, substiset = subset) - x <- x[subset,] - - # evalPopArg - obs <- substitute(obs) - c.obs <- evalPopArg(data = x, arg = obs) - obs <- names(c.obs) - - - print <- substitute(print) - c.pri <- evalPopArg(data = x, arg = print) - print <- names(c.pri) - - exp <- substitute(exp) - c.exp <- evalPopArg(data = x, arg = exp) - exp <- names(c.exp) - - pyrs <- substitute(pyrs) - c.pyr <- evalPopArg(data = x, arg = pyrs) - if(is.null(c.pyr)) c.pyr <- data.table(pyrs=0) - pyrs <- names(c.pyr) - - # collect data - x <- cbind(c.obs, c.pyr, c.exp) - if(any(is.na(x))) stop('Missing values in expected cases.') - if(!is.null(print)) x<- cbind(x, c.pri) - - express <- paste0('list(observed = sum(', obs, '), expected = sum(',exp,'), pyrs = sum(', pyrs,'))') - # aggregate - es <- parse(text = express) - y <- x[, eval(es), keyby = print] # keyby is must - - results <- sir_est( table = y, - print = print, - adjust = NULL, - conf.type = conf.type, - test.type = test.type, - conf.level = conf.level, - EAR = FALSE) - - #setDT(data) - if (!return_DT()) { - for (i in 1:2) { - if (!is.null(results[[i]])) { - setDFpe(results[[i]]) - } - } - } - - data <- copy(results[[2]]) - setattr(data, name = 'sir.meta', value = list(adjust = NULL, - print = print, - call = match.call(), - lrt.test= results$'lrt.test', - conf.type = results$'conf.type', - conf.level = conf.level, - lrt.test.type = results$'test.type', - pooled.sir = results[[1]])) - setattr(data, "class", c("sir", "data.table", "data.frame")) - return(data) -} - - - -#' Calculate SMRs from a split Lexis object -#' -#' @description \code{sir_lex} solves SMR from a \code{\link{Lexis}} object -#' calculated with \code{lexpand}. -#' -#' @param breaks a named list to split age group (age), period (per) or follow-up (fot). -#' @param ... pass arguments to \code{sir_exp} -#' -#' -#' @describeIn sir_exp -#' -#' @export - -sir_lex <- function(x, print = NULL, breaks = NULL, ... ) { - - ## R CMD CHECK appeasement - lex.dur <- NULL - - if(!inherits(x, 'Lexis')) { - stop('x has to be a Lexis object (see lexpand or Lexis)') - } - if(!"pop.haz" %in% names(x)) { - stop("Variable pop.haz not found in the data.") - } - - - # reformat date breaks - if(!is.null(breaks)) { - breaks <- lapply(breaks, function(x) { - if(is.character(x)) c(cal.yr(as.Date(x))) - else x - }) - } - - print <- substitute(print) - # copy to retain the attributes - x <- copy(x) - - # guess the first value - first_value <- lapply(c("lex.Cst", "lex.Xst"), function(var) { - if (is.factor(x[[var]])) levels(x[[var]]) else sort(unique(x[[var]])) - }) - first_value <- unique(unlist(first_value))[1] - - col <- x$lex.Xst - set(x, j = "lex.Cst", value = 0L) - set(x, j = "lex.Xst", value = ifelse(col == first_value, 0L, 1L)) - - if(!is.null(breaks)) { - x <- splitMulti(x, breaks = breaks) - } - - a <- copy(attr(x, "time.scales")) - a <- a[!vapply(get_breaks(x), is.null, logical(1))] - x[, d.exp := pop.haz*lex.dur] - - TF <- environment() - - if(any(is.na(x[,d.exp]))) stop('Missing values in either pop.haz or lex.dur.') - x <- aggre(x, by = TF$a, sum.values = 'd.exp') - if(!'from0to1' %in% names(x)) { - stop('Could not find any transitions between states in lexis') - } - x <- sir_exp(x = x, obs = 'from0to1', print = print, exp = 'd.exp', pyrs = 'pyrs', ...) - # override the match.call from sir_exp - attr(x, 'sir.meta')$call <- match.call() - return(x) -} - - -#' SMR method for an \code{aggre} object. -#' -#' @description \code{sir_ag} solves SMR from a \code{\link{aggre}} object -#' calculated using \code{\link{lexpand}}. -#' -#' @describeIn sir_exp -#' -#' @export - -sir_ag <- function(x, obs = 'from0to1', print = attr(x, 'aggre.meta')$by, exp = 'd.exp', pyrs = 'pyrs', ... ) { - - if(!inherits(x, 'aggre')) { - stop('x should be an aggre object (see lexpand or sir_lex)') - } - obs <- substitute(obs) - print <- substitute(print) - - x <- copy(x) - x <- sir_exp(x = x, obs = obs, print = print, exp = 'd.exp', pyrs = 'pyrs', ...) # original - attr(x, 'sir.meta')$call <- match.call() # override the call from sir_exp - x -} - - - -globalVariables(c('observed','expected','p_adj','p_value','temp','coh.observations','coh.personyears', - 'd.exp', 'lower', 'pop.haz', 'sir.hi','sir.lo','upper')) - +#' @title Calculate SIR or SMR +#' @author Matti Rantanen, Joonas Miettinen +#' @description Poisson modelled standardised incidence or mortality ratios (SIRs / SMRs) i.e. +#' indirect method for calculating standardised rates. SIR is a ratio of observed and expected cases. +#' Expected cases are derived by multiplying the strata-specific population rate with the +#' corresponding person-years of the cohort. +#' +#' @details \code{sir} is a comprehensive tool for modelling SIRs/SMRs with flexible +#' options to adjust and print SIRs, test homogeneity and utilize +#' multi-state data. The cohort data and the variable names for observation +#' counts and person-years are required. +#' The reference data is optional, since the cohort data +#' can be stratified (\code{print}) and compared to total. +#' +#' +#' \strong{Adjust and print} +#' +#' A SIR can be adjusted or standardised using the covariates found in both \code{coh.data} and \code{ref.data}. +#' Variable to adjust are given in \code{adjust}. +#' Variable names needs to match in both \code{coh.data} and \code{ref.data}. +#' Typical variables to adjust by are gender, age group and calendar period. +#' +#' \code{print} is used to stratify the SIR output. In other words, the variables +#' assigned to \code{print} are the covariates of the Poisson model. +#' Variable levels are treated as categorical. +#' Variables can be assigned in both \code{print} and \code{adjust}. +#' This means the output it adjusted and printed by these variables. +#' +#' \code{print} can also be a list of expressions. This enables changing variable +#' names or transforming variables with functions such as \code{cut} and \code{round}. +#' For example, the existing variables \code{agegroup} and \code{year} could be +#' transformed to new levels using \code{cut} by +#' +#' \code{print = list( age.category = cut(agegroup, breaks = c(0,50,75,100)), +#' year.cat = cut(year, seq(1950,2010,20)))} +#' +#' +#' \strong{ref.rate or ref.obs & ref.pyrs} +#' +#' The population rate variable can be given to the \code{ref.rate} parameter. +#' That is, when using e.g. the \code{popmort} or a comparable data file, one may +#' supply \code{ref.rate} instead of \code{ref.obs} and \code{ref.pyrs}, which +#' will be ignored if \code{ref.rate} is supplied. +#' +#' +#' Note that if all the stratifying variables in +#' \code{ref.data} are not listed in \code{adjust}, +#' or when the categories are otherwise combined, +#' the (unweighted) mean of rates is used for computing expected cases. +#' This might incur a small bias in comparison to when exact numbers of observations +#' and person-years are available. +#' +#' +#' +#' \strong{mstate} +#' +#' E.g. using \code{lexpand} it's possible to compute counts for several outcomes +#' so that the population at risk is same for each +#' outcome such as a certain kind of cancer. +#' The transition counts are in wide data format, +#' and the relevant columns can be supplied to \code{sir} +#' in a vector via the \code{coh.obs} argument. +#' The name of the corresponding new column in \code{ref.data} is given in +#' \code{mstate}. It's recommended to include the \code{mstate} variable in \code{adjust}, +#' so the corresponding information should also be available in \code{ref.data}. +#' More examples in sir-vignette. +#' +#' This approach is analogous to where SIRs are calculated separately their +#' own function calls. +#' +#' +#' \strong{Other parameters} +#' +#' \code{univariate} confidence intervals are calculated using exact +#' Poisson intervals (\code{poisson.ci}). The options \code{profile} and \code{wald} are +#' is based on a Poisson regression model: profile-likelihood confidence intervals +#' or Wald's normal-approximation. P-value is Poisson model based \code{conf.type} +#' or calculated using the method described by Breslow and Day. Function automatically +#' switches to another \code{conf.type} if calculation is not possible with a message. +#' Usually model fit fails if there is print stratum with zero expected values. +#' +#' +#' The LRT p-value tests the levels of \code{print}. The test can be either +#' \code{"homogeneity"}, a likelihood ratio test where the model variables defined in +#' \code{print} (factor) is compared to the constant model. +#' Option \code{"trend"} tests if the linear trend of the continuous variable in +#' \code{print} is significant (using model comparison). +#' +#' +#' \strong{EAR: Excess Absolute Risk} +#' +#' Excess Absolute Risk is a simple way to quantify the absolute difference between cohort risk and +#' population risk. +#' Make sure that the person-years are calculated accordingly before using EAR. (when using mstate) +#' +#' Formula for EAR: +#' \deqn{EAR = \frac{observed - expected}{person years} \times 1000.}{EAR = (obs - exp)/pyrs * 1000.} +#' +#' \strong{Data format} +#' +#' The data should be given in tabulated format. That is the number of observations +#' and person-years are represented for each stratum. +#' Note that also individual data is allowed as long as each observations, +#' person-years, and print and adjust variables are presented in columns. +#' The extra variables and levels are reduced automatically before estimating SIRs. +#' Example of data format: +#' +#' \tabular{rrrrr}{ +#' sex \tab age \tab period \tab obs \tab pyrs \cr +#' 0 \tab 1 \tab 2010 \tab 0 \tab 390 \cr +#' 0 \tab 2 \tab 2010 \tab 5 \tab 385 \cr +#' 1 \tab 1 \tab 2010 \tab 3 \tab 308 \cr +#' 1 \tab 2 \tab 2010 \tab 12 \tab 315 +#' } +#' +#' +#' @param coh.data aggregated cohort data, see e.g. \code{\link{lexpand}} +#' @param coh.pyrs variable name for person years in cohort data; quoted or unquoted +#' @param coh.obs variable name for observed cases; quoted or unquoted. A vector when using \code{mstata}. +#' @param ref.data population data. Can be left NULL if \code{coh.data} is stratified in \code{print}. +#' @param ref.rate population rate variable (cases/person-years). Overwrites arguments +#' \code{ref.pyrs} and \code{ref.obs}; quoted or unquoted +#' @param ref.pyrs variable name for person-years in population data; quoted or unquoted +#' @param ref.obs variable name for observed cases; quoted or unquoted +#' @param subset logical condition to select data from \code{coh.data} before any computations +#' @param adjust variable names for adjusting without stratifying output; quoted vector or unquoted list +#' @param print variable names to stratify results; quoted vector or unquoted named list with functions +#' @param mstate set column names for cause specific observations; quoted or unquoted. Relevant only +#' when \code{coh.obs} length is two or more. See details. +#' @param test.type Test for equal SIRs. Test available are 'homogeneity' and 'trend'. +#' @param conf.type Confidence interval type: 'profile'(=default), 'wald' or 'univariate'. +#' @param conf.level Level of type-I error in confidence intervals, default 0.05 is 95\% CI. +#' @param EAR logical; TRUE calculates Excess Absolute Risks for univariate SIRs. +#' (see details) + +#' +#' @examples +#' data(popmort) +#' data(sire) +#' c <- lexpand( sire, status = status, birth = bi_date, exit = ex_date, entry = dg_date, +#' breaks = list(per = 1950:2013, age = 1:100, fot = c(0,10,20,Inf)), +#' aggre = list(fot, agegroup = age, year = per, sex) ) +#' ## SMR due other causes: status = 2 +#' se <- sir( coh.data = c, coh.obs = 'from0to2', coh.pyrs = 'pyrs', +#' ref.data = popmort, ref.rate = 'haz', +#' adjust = c('agegroup', 'year', 'sex'), print = 'fot') +#' se +#' ## for examples see: vignette('sir') +#' +#' +#' @seealso \code{\link{lexpand}} +#' \href{../doc/sir.html}{A SIR calculation vignette} +#' @family sir functions +#' @family main functions +#' +#' @return A sir-object that is a \code{data.table} with meta information in the attributes. +#' +#' @export +#' +#' @import data.table +#' @import stats + + + + +sir <- function( coh.data, + coh.obs, + coh.pyrs, + ref.data = NULL, + ref.obs = NULL, + ref.pyrs = NULL, ref.rate = NULL, + subset = NULL, + print = NULL, + adjust = NULL, + mstate = NULL, + test.type = 'homogeneity', + conf.type = 'profile', + conf.level = 0.95, + EAR = FALSE){ + + coh.data <- data.table(coh.data) + + ## subsetting--------------------------------------------------------------- + ## no copy taken of data! + subset <- substitute(subset) + subset <- evalLogicalSubset(data = coh.data, substiset = subset) + coh.data <- coh.data[subset,] + + + # print list -------------------------------------------------------------- + + # env1 <- environment() # set environment where to assign new print + # coh.data <- data_list(data = coh.data, arg.list = substitute(print), env = env1) + + mstate <- as.character(substitute(mstate)) + if(length(mstate) == 0) { + mstate <- NULL + } + if(!is.null(mstate)) { + coh.data[,(mstate) := 0L] + } + + # evalPopArg + coh.obs <- substitute(coh.obs) + c.obs <- evalPopArg(data = coh.data, arg = coh.obs) + coh.obs <- names(c.obs) + + coh.pyrs <- substitute(coh.pyrs) + c.pyr <- evalPopArg(data = coh.data, arg = coh.pyrs) + coh.pyrs <- names(c.pyr) + + print <- substitute(print) + c.pri <- evalPopArg(data = coh.data, arg = print) + print <- names(c.pri) + + adjust <- substitute(adjust) + c.adj <- evalPopArg(data = coh.data, arg = adjust) + adjust <- names(c.adj) + + # collect data + coh.data <- cbind(c.obs, c.pyr) + if(!is.null(print)) coh.data <- cbind(coh.data, c.pri) + if(!is.null(adjust)) coh.data <- cbind(coh.data, c.adj) + + if( !is.null(ref.data) ){ + ref.obs <- as.character(substitute(ref.obs)) + ref.pyrs <- as.character(substitute(ref.pyrs)) + ref.rate <- as.character(substitute(ref.rate)) + + if (length(ref.obs) == 0) ref.obs <- NULL + if (length(ref.pyrs) == 0) ref.pyrs <- NULL + if (length(ref.rate) == 0) ref.rate <- NULL + } + + + # print(coh.data) + + st <- sir_table( coh.data = coh.data, + coh.obs = coh.obs, + coh.pyrs = coh.pyrs, + ref.data = ref.data, + ref.obs = ref.obs, + ref.pyrs = ref.pyrs, + ref.rate = ref.rate, + print = print, + adjust = adjust, + mstate = mstate) + + results <- sir_est( table = st, + print = print, + adjust = adjust, + conf.type = conf.type, + test.type = test.type, + conf.level = conf.level, + EAR = EAR) + + ## final touch --------------------------------------------------------------- + + + #setDT(data) + if (!return_DT()) { + for (i in 1:3) { + if (!is.null(results[[i]])) { + setDFpe(results[[i]]) + } + } + } + + data <- copy(results[[2]]) + setattr(data, name = 'sir.meta', value = list(adjust = adjust, + print = print, + call = match.call(), + lrt.test= results$'lrt.test', + conf.type = results$'conf.type', + conf.level = conf.level, + lrt.test.type = results$'test.type', + pooled.sir = results[[1]])) + setattr(data, "class", c("sir", "data.table", "data.frame")) + return(data) +} + + +#' @title Estimate splines for SIR or SMR +#' @author Matti Rantanen, Joonas Miettinen +#' +#' @description Splines for standardised incidence or mortality ratio. A useful +#' tool to e.g. check whether a constant SIR can be assumed for all calendar periods, +#' age groups or follow-up intervals. Splines can be fitted for these time dimensions +#' separately or in the same model. +#' +#' @param coh.data cohort data with observations and at risk time variables +#' @param coh.pyrs variable name for person-years in cohort data +#' @param coh.obs variable name for observed cases +#' @param ref.data aggregated population data +#' @param ref.rate population rate observed/expected. This overwrites the parameters +#' \code{ref.pyrs} and \code{ref.obs}. +#' @param ref.pyrs variable name for person-years in population data +#' @param ref.obs variable name for observed cases +#' @param subset logical condition to subset \code{coh.data} before any computations +#' @param adjust variable names for adjusting the expected cases +#' @param print variable names for which to estimate SIRs/SMRs and +#' associated splines separately +#' @param mstate set column names for cause specific observations. Relevant only +#' when coh.obs length is two or more. See help for \code{sir}. +#' @param spline variable name(s) for the splines +#' @param knots number knots (vector), pre-defined knots (list of vectors) or for optimal number of knots left NULL +#' @param dependent.splines logical; if TRUE, all splines are fitted in same model. +#' @param reference.points fixed reference values for rate ratios. If left \code{NULL} +#' the smallest value is the reference point (where SIR = 1). +#' Ignored if \code{dependent.splines = FALSE} +#' +#' +#' @details +#' +#' See \code{\link{sir}} for help on SIR/SMR estimation in general; usage of splines +#' is discussed below. +#' +#' \strong{The spline variables} +#' +#' The model can include one, two or three splines variables. +#' Variables can be included in the same model selecting \code{dependent.splines = TRUE} +#' and SIR ratios are calculated (first one is the SIR, others SIR ratios). +#' Reference points vector can be set via \code{reference.points} +#' where first element of the vector is the reference point for first ratio. +#' +#' Variable(s) to fit splines are given as a vector in argument \code{spline}. +#' Order will affect the results. +#' +#' +#' \strong{dependent.splines} +#' +#' By default dependent.splines is FALSE and all splines are fitted in separate models. +#' If TRUE, the first variable in \code{spline} is a function of a SIR and other(s) are ratios. +#' +#' \strong{knots} +#' +#' There are three options to set knots to splines: +#' +#' Set the number of knots for each spline variable with a \strong{vector}. +#' The knots are automatically placed to the quantiles of observed cases in cohort data. +#' The first and last knots are always the maximum and minimum values, so knot +#' value needs to be at least two. +#' +#' Predefined knot places can be set with a \strong{list} of vectors. +#' The vector for each spline in the list specifies the knot places. The lowest +#' and the largest values are the boundary knots and these should be checked beforehand. +#' +#' If \code{knots} is left \strong{NULL}, the model searches the optimal number +#' of knots by model AIC by fitting models iteratively from 2 to 15 knots and +#' the one with smallest AIC is selected. +#' If \code{dependent.splines = TRUE}, the number of knots is searched by fitting each spline +#' variable separately. +#' +#' +#' \strong{print} +#' +#' Splines can be stratified by the levels of variable given in \code{print}. If +#' \code{print} is a vector, only the first variable is accounted for. The knots +#' are placed globally for all levels of \code{print}. This also ensures that the likelihood +#' ratio test is valid. +#' Splines are also fitted independently for each level of \code{print}. +#' This allows for searching interactions, e.g. by fitting spline for period +#' (\code{splines='period'}) for each age group (\code{print = 'agegroup'}). +#' +#' +#' \strong{p-values} +#' +#' The output p-value is a test of whether the splines are equal (homogenous) +#' at different levels of \code{print}. +#' The test is based on the likelihood ratio test, where the full model +#' includes \code{print} and is +#' compared to a null model without it. +#' When \code{(dependent.splines = TRUE)} the p-value returned is a global p-value. +#' Otherwise the p-value is spline-specific. +#' +#' +#' @return A list of data.frames and vectors. +#' Three spline estimates are named as \code{spline.est.A/B/C} and the corresponding values +#' in \code{spline.seq.A/B/C} for manual plotting +#' +#' +#' @seealso \code{\link{splitMulti}} +#' \href{../doc/sir.html}{A SIR calculation vignette} +#' @family sir functions +#' @family main functions +#' +#' @export sirspline +#' @import data.table +#' @import splines +#' @import stats +#' +#' @examples \dontrun{ +#' ## for examples see: vignette('sir') +#' } + +sirspline <- function( coh.data, + coh.obs, + coh.pyrs, + ref.data = NULL, + ref.obs = NULL, + ref.pyrs = NULL, + ref.rate = NULL, + subset = NULL, + print = NULL, + adjust = NULL, + mstate = NULL, + spline, + knots = NULL, + reference.points = NULL, + dependent.splines = TRUE){ + + coh.data <- data.table(coh.data) + + ## subsetting----------------------------------------------------------------- + ## no copy taken of data! + subset <- substitute(subset) + subset <- evalLogicalSubset(data = coh.data, substiset = subset) + coh.data <- coh.data[subset,] + + # print list -------------------------------------------------------------- + + env1 <- environment() + coh.data <- data_list(data = coh.data, arg.list = substitute(print), env = env1) + + mstate <- as.character(substitute(mstate)) + if(length(mstate) == 0) { + mstate <- NULL + } + if(!is.null(mstate)) { + coh.data[,(mstate) := 0L] + } + + # evalPopArg + + spline <- substitute(spline) + c.spl <- evalPopArg(data = coh.data, arg = spline) + spline <- names(c.spl) + + coh.obs <- substitute(coh.obs) + c.obs <- evalPopArg(data = coh.data, arg = coh.obs) + coh.obs <- names(c.obs) + + coh.pyrs <- substitute(coh.pyrs) + c.pyr <- evalPopArg(data = coh.data, arg = coh.pyrs) + coh.pyrs <- names(c.pyr) + + print <- substitute(print) + c.pri <- evalPopArg(data = coh.data, arg = print) + print <- names(c.pri) + + adjust <- substitute(adjust) + c.adj <- evalPopArg(data = coh.data, arg = adjust) + adjust <- names(c.adj) + + # collect data + coh.data <- cbind(c.obs, c.pyr, c.spl) + if(!is.null(print)) { + coh.data <- cbind(coh.data, c.pri[, print[!print %in% spline], with=FALSE]) + } + if(!is.null(adjust)) { + coh.data <- cbind(coh.data, c.adj[, adjust[!adjust %in% spline], with=FALSE]) + } + + if( !is.null(ref.data) ){ + ref.obs <- as.character(substitute(ref.obs)) + ref.pyrs <- as.character(substitute(ref.pyrs)) + ref.rate <- as.character(substitute(ref.rate)) + + if (length(ref.obs) == 0) ref.obs <- NULL + if (length(ref.pyrs) == 0) ref.pyrs <- NULL + if (length(ref.rate) == 0) ref.rate <- NULL + } + + st <- sir_table( coh.data = coh.data, + coh.obs = coh.obs, + coh.pyrs = coh.pyrs, + ref.data = ref.data, + ref.obs = ref.obs, + ref.pyrs = ref.pyrs, ref.rate = ref.rate, + print = print, + adjust = adjust, + mstate = mstate, + spline = spline) + + results <- sir_spline( table = st, + print = print, + adjust = adjust, + spline = spline, + knots = knots, + reference.points = reference.points, + dependent.splines = dependent.splines) + + setclass(results, c('sirspline', 'pe', class(results))) + return(results) +} + + + + + + +# Input: two data.table:s +# output: one data.table including rates +#' @import stats +#' @import data.table +sir_table <- function( coh.data, + coh.obs, + coh.pyrs, + ref.data = NULL, + ref.obs = NULL, + ref.pyrs = NULL, + ref.rate = NULL, + print = NULL, + adjust = NULL, + spline = NULL, + mstate = NULL) { + + + # initial checks ------------------------------------------------- + + if(is.null(ref.data)) { + if(is.null(print)){ + stop('Both ref.data and print cannot be NULL.') + } + ref.data <- data.table(coh.data) + ref.obs <- coh.obs + ref.pyrs <- coh.pyrs + } + + coh.data <- data.table(coh.data) + ref.data <- data.table(ref.data) + + vl <- unique( c(coh.pyrs, coh.obs, adjust, print) ) + if( !is.null(mstate) ) { + vl <- vl[which( vl != mstate )] + } + all_names_present(coh.data, vl ) + + if ( !is.null(ref.pyrs) & !is.null(ref.obs) ) { + all_names_present(ref.data, c(ref.pyrs, ref.obs, adjust)) + } + + # Melt lexpand data ------------------------------------------------------- + + if( length(coh.obs) > 1 ) { + if( is.null(mstate) ){ + stop('coh.obs length is > 1. Set variable name for mstate.') + } + if( !mstate %in% names(ref.data) ){ + warning('mstate variable name does not match names in ref.data.') + } + + aggre <- unique(c(adjust, print, spline, coh.pyrs)) + aggre <- aggre[which(aggre != mstate)] + + coh.data <- melt( data = coh.data, id.vars = aggre, measure.vars = coh.obs, + value.name = 'coh.observations', + variable.name = mstate, variable.factor = FALSE) + coh.obs <- 'coh.observations' + + # parse Y name form string 'formXtoY' + q <- quote( + robust_values(substr(get(mstate), + start = regexpr( pattern = 'to', text = get(mstate) ) + 2, + stop = nchar(x = get(mstate) ))) + ) + coh.data[,(mstate) := eval(q) ] + + if( !(mstate %in% adjust)) { + warning('Consider including mstate variable also in adjust. See help(sir) for details.') + } + } + + # prepare data steps, reduce dimensions ----------------------------------- + + setnames(coh.data, c(coh.obs, coh.pyrs), c('coh.observations','coh.personyears')) + + + coh.data <- expr.by.cj(data = coh.data, + by.vars = unique( sort(c(adjust, print, spline)) ), + expr = list(coh.observations = sum(coh.observations), + coh.personyears = sum(coh.personyears))) + #coh.data <- na2zero(coh.data) + #coh.data <- na.omit(coh.data) + + coh.data[is.na(coh.observations), coh.observations := 0] + coh.data[is.na(coh.personyears), coh.personyears := 0] + coh.data <- na.omit(coh.data) + + # rates + if( !is.null(ref.rate) ){ + setnames(ref.data, ref.rate, 'ref.rate') + ref.data <- expr.by.cj(data = ref.data, by.vars = c(adjust), + expr = list(ref.rate = mean(ref.rate))) + } else { + setnames(ref.data, c(ref.obs, ref.pyrs), c('ref.obs','ref.pyrs')) + ref.data <- expr.by.cj(data = ref.data, by.vars = c(adjust), + expr = list(ref.obs = sum(ref.obs), + ref.pyrs= sum(ref.pyrs))) + ref.data[, ref.rate := ref.obs / ref.pyrs ] + } + + # Merge + sir.table <- merge(coh.data, ref.data, by=c(adjust), all.x=TRUE) + sir.table[, expected := ref.rate * coh.personyears] + sir.table <- na2zero(sir.table) + + if ( !is.null(print) | !is.null(spline)){ + sir.table <- sir.table[ ,list(observed = sum(coh.observations), + expected = sum(expected), + pyrs = sum(coh.personyears)), + by = c(unique(c(print, spline)))] + setkeyv(sir.table, c(print, spline)) + } + else { + sir.table <- sir.table[ ,list(observed = sum(coh.observations), + expected = sum(expected), + pyrs = sum(coh.personyears))] + } + return(sir.table) +} + + + + +# Input: sir.table +# Output: list of data.tables and values +sir_est <- function( table, + print = NULL, + adjust = NULL, + EAR = FALSE, + test.type = 'homogeneity', + conf.level = 0.95, + conf.type = 'profile') { + pyrs <- NULL ## APPEASE R CMD CHECK + setDT(table) + + if(!is.numeric(conf.level) | conf.level > 1) { + stop('Confidence level must be a numeric value between 0-1') + } + # function to SIR p-value + chi.p <- function(o, e) { + pchisq( ( (abs(o - e) - 0.5)^2)/e, df=1, lower.tail=FALSE) + } + + # total sir + combined <- data.table(table)[,list(observed = sum(observed), + expected = sum(expected), + pyrs = sum(pyrs))] + combined[ ,':='(sir = observed/expected, + sir.lo = poisson.ci(observed, expected, conf.level=conf.level)[,4], + sir.hi = poisson.ci(observed, expected, conf.level=conf.level)[,5], + p_value = chi.p(observed, expected))] + + # Poisson regression ------------------------------------------------------ + + # write model formula + fa <- a <- NULL + sir.formula <- paste('observed ~ 1') + if(!is.null(print)){ + fa <- rev(print) # fa <- print + + # drop variables with only one value + u <- c(t(table[, lapply(.SD, uniqueN), .SDcols = fa])) + if (length(u[u==1]) > 0){ + message('Variable "', paste(fa[which(u==1)], collapse = '","'),'" (has only one level) removed from model.') + fa <- fa[-which(u==1)] + } + if(length(fa)>0){ + # model formula + a <- paste0('as.factor(',paste( fa, collapse = '):as.factor('),')') + sir.formula <- paste('observed ~ 0 +', a) + } + } + # fit model if possible ----------------------------------------------------- + + fit <- tryCatch(do.call("glm", list(formula = terms(as.formula(sir.formula), keep.order = FALSE), + offset = log(table[,expected]), + data = table, family = poisson(log))), + error=function(f) NULL ) + + if(!is.null(fit)) eg <- expand.grid(fit$xlevels) # for further testing + + + # LRT test (homogeneity or trend) -------------------------------------------- + + test.type <- match.arg(test.type, c('homogeneity','trend')) + + lrt_sig <- NULL + if( sir.formula != 'observed ~ 1' & !is.null(fit) ) { + if (test.type == 'homogeneity') covariates <- a + if (test.type == 'trend') covariates <- paste(print, collapse=' + ') + + fit_full <- tryCatch( + do.call("glm", list(formula = terms(as.formula( paste0('observed ~ 1 + ', a) )), + offset = log(table[,expected]), + data = table, family=poisson(log))), + error=function(f) NULL ) + + fit_null <- tryCatch( + do.call("glm", list(formula = terms(as.formula('observed ~ 1') ), + offset = log(table[,expected]), + data = table, family=poisson(log))), + error=function(f) NULL ) + + if (!is.null(fit_full)){ + lrt <- anova(fit_full, fit_null, test = 'Chisq') + lrt_sig <- lrt[['Pr(>Chi)']][2] + } + } + + # confidence intervals ---------------------------------------------------- + + conf.type <- match.arg(conf.type, c('wald','profile','univariate')) + ci.info <- NULL + ci <- NULL + + if (is.null(fit) & conf.type %in% c('wald','profile')) { + conf.type <- 'univariate' + ci.info <- 'Model fitting failed. Univariate confidence intervals selected.' + if(any(table$expected == 0)) { + ci.info <- paste(ci.info, '(zero values in expected)') + } + } + + if (conf.type == 'profile') { + + confint_glm <- function(object, parm, level = 0.95, trace = FALSE, ...) { + pnames <- names(coef(object)) + if (missing(parm)) { + parm <- seq_along(pnames) + } + else if (is.character(parm)) { + parm <- match(parm, pnames, nomatch = 0L) + } + object <- profile(object, which = parm, alpha = (1 - level)/4, trace = trace) + confint(object, parm = parm, level = level, trace = trace, ...) + } + + ci <- suppressMessages( suppressWarnings( + tryCatch(exp(confint_glm(fit, level=conf.level)), error=function(e) NULL ) + )) + if(!is.null(ci)) { + ci <- as.data.table(ci) + if (is.null(print) | length(fa)==0) ci <- data.table(t(ci)) # transpose if only one row + } else { + conf.type <- 'wald' + ci.info <- 'Could not solve profile-likelihood. Wald confidence intervals selected.' + } + } + + if (conf.type == 'wald') { + ci <- data.table( exp(confint.default(fit)) ) + } + + if(conf.type == 'univariate') { + ci <- data.table(poisson.ci(table$observed, table$expected, conf.level = conf.level))[,.(lower, upper)] + pv <- chi.p(table$observed, table$expected) + } else { + pv <- as.vector(summary(fit)$coef[, "Pr(>|z|)"]) + } + if(!is.null(ci.info)) message(ci.info) + + # collect results ----------------------------------------------------- + + setnames(ci, 1:2, c('sir.lo','sir.hi')) + + table[, ':=' ( sir = observed/expected, + sir.lo = ci[, sir.lo], + sir.hi = ci[, sir.hi], + p_value = round(pv,5))] + + + # Round results ----------------------------------------------------------- + + cols1 <- c('sir','sir.lo','sir.hi','expected','pyrs') + + table[,(cols1) := lapply(.SD, round, digits=4), .SDcols=cols1] + combined[,(cols1) := lapply(.SD, round, digits=4), .SDcols=cols1] + + + # tests ----------------------------------- + + if (table[!is.na(sir) & (sir < sir.lo | sir > sir.hi), .N] > 0) { + warning('There is something wrong with confidence intervals') + } + if (table[!is.na(sir.lo) & !is.na(sir.hi)][sir.lo > sir.hi, .N] > 0) { + warning('CIs might be incorrect') + } + + if(!is.null(fit) & length(fa)>0) { + # pseudo test if the modelled confidence intervals are merged correctly: + t1 <- copy(table)[,lapply(.SD, factor),.SDcols = fa] + if(any(t1 != data.table(eg))) { + message('CIs levels might not match. Contact the package maintainer and use univariate CIs.') + } + } + + # EAR ----------------------------------------------------------------- + if (EAR) { + table[,EAR := round((observed - expected)/pyrs * 1000, 3)] + } + + + results <- list(total = combined, + table = table, + adjusted = adjust, + lrt.test = lrt_sig, + test.type = test.type, + conf.type = conf.type, + ci.info = ci.info) + return(results) +} + + +#' @export +getCall.sir <- function (x, ...) { + attributes(x)$sir.meta$call +} + + +# Input: sir.table +# Output: estimates and sequences for plotting splines +#' @import splines +#' @import data.table +#' @import stats +sir_spline <- function( table, + print = NULL, + adjust = NULL, + spline, + knots = NULL, + reference.points = NULL, + dependent.splines = TRUE){ + knts <- + spline.seq.A <- + spline.seq.B <- + spline.seq.C <- + spline.est.A <- + spline.est.B <- + spline.est.C <- NULL + + if (!is.null(knots) & length(knots) != length(spline) ) { + stop('Arguments spline and knots has to be same length.') + } + + + # Spline functions ------------------------------------------------------- + + # function to get spline seq + spline.seq <- function(data, spline.var=NULL) { + # palauttaa jotaina + if(is.na(spline.var)) { + return(NULL) + } + spline.seq <- seq( min( data[,get(spline.var)] ), + max( data[,get(spline.var)] ), length.out = 100) + return(spline.seq) + } + + # function to search optimal number of knots by AIC + spline.knots <- function(data, knots = NULL, spline.vars = NULL){ + # search optimal number of knots + if( is.null(knots) ) { + knts <- list() + for (jj in 1:length(spline.vars)) { + # reduce data to fit model + data0 <- data[,list(observed=sum(observed), expected = sum(expected)), by = eval(spline.vars[jj])] + data0 <- data0[expected > 0] + spline.fit <- glm(observed ~ 1, offset=log(expected), family=poisson(log), data = data0) + aic0 <- summary(spline.fit)[['aic']] + limit <- 20 + ii <- 2 + while( ii < limit ){ + tmp.knots <- ii + knts[jj] <- list( data0[ ,quantile( rep(get(spline.vars[jj]),observed), probs = seq(0,100,length.out = tmp.knots)/100)] ) + spline.fit <- glm(observed ~ Ns(get(spline.vars[jj]), knots = knts[[jj]]), offset=log(expected), family=poisson(log), data=data0) + aic0 <- c(aic0, summary(spline.fit)[['aic']]) + ii <- ii + 1 + } + tmp.knots <- which(aic0 == min(aic0))[1] + if(tmp.knots == 1) { + message(paste0('Null model better than spline in ', jj)) + tmp.knots <- 2 + } + knts[jj] <- list(data0[ ,quantile( rep(get(spline.vars[jj]),observed), probs = seq(0,100,length.out = tmp.knots)/100)]) + rm(tmp.knots) + } + knots <- unlist(lapply(knts, length)) + } + else { + # knot predefined + if( is.list(knots) ){ + knts <- knots + knots <- unlist(lapply(knots, length)) + } + # knot number predefined + else { + if( any(knots < 2) ) { + message('Min knots number set to 2.') + knots[knots < 2] <- 2 + } + knts <- list() + for(i in 1:length(knots)) { + knts[i] <- list( data[ ,quantile( rep(get(spline.vars[i]), observed), probs = seq(0,100,length.out = knots[i])/100)]) + } + } + } + names(knts) <- spline.vars + return(knts) + } + + # function to estimate 2-3 dim splines in same model + spline.estimates.dep <- function(sir.spline = sir.spline, + spline.seq.A = spline.seq.A, + spline.seq.B = spline.seq.B, + spline.seq.C = spline.seq.C, + reference.points = reference.points, + knts = knts + ){ + + if( all(!is.null(reference.points), (length(reference.points) + 1) != length(spline)) ){ + stop('Parameter reference.points length should be length of spline - 1.') + } + + + form <- 'Ns(get(spline[[1]]), kn=knts[[1]])' + nsA <- Ns( spline.seq.A, knots = knts[[1]]) + if ( length(spline) >= 2) { + form <- paste0(form, ' + Ns(get(spline[[2]]), kn=knts[[2]])') + nsB <- Ns( spline.seq.B, knots = knts[[2]]) + } + if ( length(spline) == 3) { + form <- paste0(form, ' + Ns(get(spline[[3]]), kn=knts[[3]])') + nsC <- Ns( spline.seq.C, knots = knts[[3]]) + } + + form <- paste0('observed ~ ', form) + spline.fit <- do.call("glm", list(formula = as.formula(form), + offset = log(sir.spline[expected > 0,expected]), + family = poisson, + data = sir.spline[expected>0])) + if( any( ci.exp(spline.fit)[,1] == 1) ){ + message("NA's in spline estimates.") + } + + aic <- summary(spline.fit)[['aic']] + + rf.C <- rf.B <- NA + # set assigned reference points or get minimum values + if( !is.null(reference.points) ) { + rf.B <- reference.points[1] + rf.C <- reference.points[2] + } + else { + rf.B <- min( sir.spline[,get(spline[2])] ) + if(!is.na(spline[3])) { + rf.C <- min( sir.spline[,get(spline[3])] ) + } + } + + if( !is.na(rf.B) ) { + B <- Ns( rep(rf.B, 100), knots = knts[[2]]) + if( findInterval(rf.B, range(sir.spline[,get(spline[2])])) != 1 ) { + message("WARNING: reference point 2 doesn't fall into spline variable interval") + } + } + + if( !is.na(rf.C) ){ + C <- Ns( rep(rf.C, 100), knots = knts[[3]]) + if( findInterval(rf.C, range(sir.spline[,get(spline[3])])) != 1) { + message("WARNING: reference point 3 doesn't fall into spline variable interval") + } + } + + # make subset of model parameters + if( !is.null(knts[2]) ) { + sub.B <- which( grepl('spline[[2]]', names(spline.fit$coefficients),fixed = TRUE) ) + } + if( !is.null(knts[3]) ) { + sub.C <- which( grepl('spline[[3]]', names(spline.fit$coefficients),fixed = TRUE) ) + } + if ( length(spline) == 2) { + spline.est.A <- ci.exp(spline.fit, ctr.mat = cbind(1, nsA, nsB)) + spline.est.B <- ci.exp(spline.fit, subset = sub.B, ctr.mat = nsB - B) + spline.est.C <- NULL + } + if ( length(spline) == 3) { + spline.est.A <- ci.exp(spline.fit, ctr.mat = cbind(1, nsA, nsB, nsC)) + spline.est.B <- ci.exp(spline.fit, subset= sub.B, ctr.mat = nsB - B) + spline.est.C <- ci.exp(spline.fit, subset= sub.C, ctr.mat = nsC - C) + } + list(a = spline.est.A, + b = spline.est.B, + c = spline.est.C) + } + + # function to estimate independet splines + spline.estimates.uni <- function(data, spline.var, spline.seq, knots, knum) { + if(is.na(spline.var)) return(NULL) + knots <- knots[[knum]] + data <- data[,list(observed=sum(observed), expected = sum(expected)), by = eval(spline.var)][expected > 0] + spline.uni <- glm(observed ~ Ns(get(spline.var), knots = knots), offset=log(expected), family=poisson(log), data = data) + nsx <- Ns( spline.seq, knots = knots) + spline.est <- ci.exp(spline.uni, ctr.mat = cbind(1, nsx)) + spline.est + } + + + + # Poisson regression Splines ------------------------------------------------- + + sir.spline <- data.table(table) + + # convert spline variables to numeric + temp.fun <- function(x){ + as.numeric(as.character(x)) + } + sir.spline[, (spline) := lapply(.SD, temp.fun), .SDcols = spline] + + + + # set knots + knts <- spline.knots(data=sir.spline, knots = knots, spline.vars = spline) + + # set sequences + spline.seq.A <- spline.seq(data=sir.spline, spline.var=spline[1]) + spline.seq.B <- spline.seq(data=sir.spline, spline.var=spline[2]) + spline.seq.C <- spline.seq(data=sir.spline, spline.var=spline[3]) + + if( length(spline) == 1 ) { + dependent.splines <- FALSE + } + + # convert print to factor + print <- print[1] + + # loop for each level of print: + if( !is.null(print) ) { + prnt.levels <- sir.spline[,unique( get(print) )] + sir.spline[,(print) := factor(get(print))] + } + else { + print <- 'temp' + sir.spline[,temp := 1] + prnt.levels <- 1 + } + + spline.est.A <- NULL + spline.est.B <- NULL + spline.est.C <- NULL + + for(i in prnt.levels){ + if( dependent.splines ) { + out <- spline.estimates.dep(sir.spline = sir.spline[get(print) == i], + spline.seq.A = spline.seq.A, + spline.seq.B = spline.seq.B, + spline.seq.C = spline.seq.C, + reference.points = reference.points, + knts = knts) + est.A <- out[['a']] + est.B <- out[['b']] + est.C <- out[['c']] + } + else{ + est.A <- spline.estimates.uni(data = sir.spline[get(print) == i], spline.var = spline[1], spline.seq = spline.seq.A, knots = knts, knum = 1) + est.B <- spline.estimates.uni(data = sir.spline[get(print) == i], spline.var = spline[2], spline.seq = spline.seq.B, knots = knts, knum = 2) + est.C <- spline.estimates.uni(data = sir.spline[get(print) == i], spline.var = spline[3], spline.seq = spline.seq.C, knots = knts, knum = 3) + } + + add_i <- function(est.x, i){ + if(is.null(est.x)) { + return(NULL) + } + cbind(i, data.frame(est.x)) + } + + + est.A <- add_i(est.A, i) + est.B <- add_i(est.B, i) + est.C <- add_i(est.C, i) + + spline.est.A <- rbind(spline.est.A, est.A) + spline.est.B <- rbind(spline.est.B, est.B) + spline.est.C <- rbind(spline.est.C, est.C) + } + + # get p-value and anova-table + anovas <- NULL + p <- NULL + if(dependent.splines) { + form.a <- 'Ns(get(spline[[1]]), kn=knts[[1]]) + Ns(get(spline[[2]]), kn=knts[[2]])' + form.b <- 'get(print):Ns(get(spline[[1]]), kn=knts[[1]]) + get(print):Ns(get(spline[[2]]), kn=knts[[2]])' + if ( length(spline) == 3) { + form.a <- paste0(form.a, ' + Ns(get(spline[[3]]), kn=knts[[3]])') + form.b <- paste0(form.b, ' + get(print):Ns(get(spline[[3]]), kn=knts[[3]])') + } + + fit.fun <- function( form.string ){ + do.call("glm", list(formula = as.formula( form.string ), + offset = log(sir.spline[expected > 0,expected]), + family = poisson, + data = sir.spline[expected>0])) + } + + fit.1 <- fit.fun( paste0('observed ~ ', form.a) ) + fit.2 <- fit.fun( paste0('observed ~ ', 'get(print)+', form.a)) + fit.3 <- fit.fun( paste0('observed ~ ', form.b)) + fit.4 <- fit.fun( paste0('observed ~ ', 'get(print)+', form.b) ) + + global.p<- anova(fit.4, fit.1, test='LRT') + level.p <- anova(fit.2, fit.1, test='LRT') + #shape.p <- anova(fit.4, fit.3, test='LRT') + + anovas <- list(global.p = global.p, level.p = level.p) + p <- rbind(global.p[['Pr(>Chi)']][2], level.p[['Pr(>Chi)']][2]) # , shape.p, + } + else { + lrt.uni <- function(data=sir.spline, spline.var=spline[1], print=print, knots=knts, knum = 1) { + if (is.na(spline.var)) return (NULL) + data <- data.table(data) + knots <- knots[[knum]] + fit0 <- glm(observed ~ get(print)+Ns(get(spline.var), knots = knots), offset=log(expected), family=poisson(log), data = data[expected>0]) + fit1 <- glm(observed ~ Ns(get(spline.var), knots = knots), offset=log(expected), family=poisson(log), data = data[expected>0]) + fit2 <- glm(observed ~ get(print)*Ns(get(spline.var), knots = knots), offset=log(expected), family=poisson(log), data = data[expected>0]) + anova(fit2,fit1,fit0, test='Chisq') # [['Pr(>Chi)']][2] + } + + var1.p <- lrt.uni(spline.var = spline[1], print=print, knots=knts, knum = 1) + var2.p <- lrt.uni(spline.var = spline[2], print=print, knots=knts, knum = 2) + var3.p <- lrt.uni(spline.var = spline[3], print=print, knots=knts, knum = 3) + + p <- list(spline.a = var1.p[['Pr(>Chi)']][2], + spline.b = var2.p[['Pr(>Chi)']][2], + spline.c = var3.p[['Pr(>Chi)']][2]) + anovas <- list(spline.a = var1.p, spline.b = var2.p, spline.c = var3.p) + } + + output <- list( spline.est.A = spline.est.A, + spline.est.B = spline.est.B, + spline.est.C = spline.est.C, + spline.seq.A = spline.seq.A, + spline.seq.B = spline.seq.B, + spline.seq.C = spline.seq.C, + adjust = adjust, + print = print, + spline = spline, + anovas = anovas, + knots = knts, + spline.dependent = dependent.splines, + p.values = p) + output +} + +# input data and argument list. replaces print in upper environment with name a vector. +data_list <- function( data, arg.list, env ) { + if(missing(env)){ + arg.list <- substitute(arg.list) + env <- parent.frame() + } + d <- data.table(data) + + l <- eval(arg.list, envir = d, enclos = parent.frame()) + + if( is.list( l ) ) { + n <- intersect(names(l), names(d)) + if(length(n)>0){ + d[,(n) := NULL] + } + # if(is.null(names(l))) { + # v <- 1:length(l) + # setnames(l, v, paste0('V', v)) + # } + l <- as.data.table(l) + l <- data.table(l) + assign('print', colnames(l), envir = env) # set names to parent environment + if( ncol(d) > 0) { + l <- data.table(d, l) + } + return(l) + } else { + return(data) + } +} + +#' @export +coef.sir <- function(object, ...) { + factors <- attr(object, 'sir.meta')$print + + q <- paste("paste(",paste(factors,collapse=","),", sep = ':')") + q <- parse(text=q) + n <- object[,eval(q)] + + res <- object$sir + attr(res, 'names') <- n + + res +} + + + + +#' @export +confint.sir <- function(object, parm, level = 0.95, conf.type = 'profile', + test.type = 'homogeneity', ...) { + + meta <- attr(object, 'sir.meta') + object <- copy(object) + object <- sir_est(table = object, + print = meta$print, + adjust = NULL, + conf.type = conf.type, + test.type = test.type, + conf.level = level, + EAR = FALSE) + object <- object$table + q <- paste("paste(",paste(meta$print,collapse=","),", sep = ':')") + q <- parse(text=q) + n <- object[,eval(q)] + + res <- cbind(object$sir.lo, object$sir.hi) + + rownames(res) <- n + colnames(res) <- paste( c( (1-level)/2*100, (1 - (1-level)/2)*100), '%') + + res +} + + +#' @title Calculate SMR +#' @author Matti Rantanen +#' @description Calculate Standardized Mortality Ratios (SMRs) using +#' a single data set that includes +#' observed and expected cases and additionally person-years. +#' +#' @details These functions are intended to calculate SMRs from a single data set +#' that includes both observed and expected number of cases. For example utilizing the +#' argument \code{pop.haz} of the \code{\link{lexpand}}. +#' +#' \code{sir_lex} automatically exports the transition \code{fromXtoY} using the first +#' state in \code{lex.Str} as \code{0} and all other as \code{1}. No missing values +#' is allowed in observed, pop.haz or person-years. +#' +#' @param x Data set e.g. \code{aggre} or \code{Lexis} object +#' (see: \code{\link{lexpand}}) +#' @param obs Variable name of the observed cases in the data set +#' @param exp Variable name or expression for expected cases +#' @param pyrs Variable name for person-years (optional) +#' @param print Variables or expression to stratify the results +#' @param test.type Test for equal SIRs. Test available are 'homogeneity' and 'trend' +#' @param conf.level Level of type-I error in confidence intervals, default 0.05 is 95\% CI +#' @param conf.type select confidence interval type: (default=) `profile`, `wald`, `univariate` +#' @param subset a logical vector for subsetting data +#' +#' @seealso \code{\link{lexpand}} +#' \href{../doc/sir.html}{A SIR calculation vignette} +#' @family sir functions +#' +#' @return A sir object +#' +#' @examples +#' +#' \dontrun{ +#' BL <- list(fot = 0:5, per = c("2003-01-01","2008-01-01", "2013-01-01")) +#' +#' ## Aggregated data +#' x1 <- lexpand(sire, breaks = BL, status = status != 0, +#' birth = bi_date, entry = dg_date, exit = ex_date, +#' pophaz=popmort, +#' aggre=list(sex, period = per, surv.int = fot)) +#' sir_ag(x1, print = 'period') +#' +#' +#' # no aggreate or breaks +#' x2 <- lexpand(sire, status = status != 0, +#' birth = bi_date, entry = dg_date, exit = ex_date, +#' pophaz=popmort) +#' sir_lex(x2, breaks = BL, print = 'per') +#' } +#' +#' @import data.table +#' @import stats +#' @export +sir_exp <- function(x, obs, exp, pyrs=NULL, print = NULL, + conf.type = 'profile', test.type = 'homogeneity', + conf.level = 0.95, subset = NULL) { + + # subsetting + subset <- substitute(subset) + subset <- evalLogicalSubset(data = x, substiset = subset) + x <- x[subset,] + + # evalPopArg + obs <- substitute(obs) + c.obs <- evalPopArg(data = x, arg = obs) + obs <- names(c.obs) + + + print <- substitute(print) + c.pri <- evalPopArg(data = x, arg = print) + print <- names(c.pri) + + exp <- substitute(exp) + c.exp <- evalPopArg(data = x, arg = exp) + exp <- names(c.exp) + + pyrs <- substitute(pyrs) + c.pyr <- evalPopArg(data = x, arg = pyrs) + if(is.null(c.pyr)) c.pyr <- data.table(pyrs=0) + pyrs <- names(c.pyr) + + # collect data + x <- cbind(c.obs, c.pyr, c.exp) + if(any(is.na(x))) stop('Missing values in expected cases.') + if(!is.null(print)) x<- cbind(x, c.pri) + + express <- paste0('list(observed = sum(', obs, '), expected = sum(',exp,'), pyrs = sum(', pyrs,'))') + # aggregate + es <- parse(text = express) + y <- x[, eval(es), keyby = print] # keyby is must + + results <- sir_est( table = y, + print = print, + adjust = NULL, + conf.type = conf.type, + test.type = test.type, + conf.level = conf.level, + EAR = FALSE) + + #setDT(data) + if (!return_DT()) { + for (i in 1:2) { + if (!is.null(results[[i]])) { + setDFpe(results[[i]]) + } + } + } + + data <- copy(results[[2]]) + setattr(data, name = 'sir.meta', value = list(adjust = NULL, + print = print, + call = match.call(), + lrt.test= results$'lrt.test', + conf.type = results$'conf.type', + conf.level = conf.level, + lrt.test.type = results$'test.type', + pooled.sir = results[[1]])) + setattr(data, "class", c("sir", "data.table", "data.frame")) + return(data) +} + + + +#' Calculate SMRs from a split Lexis object +#' +#' @description \code{sir_lex} solves SMR from a \code{\link{Lexis}} object +#' calculated with \code{lexpand}. +#' +#' @param breaks a named list to split age group (age), period (per) or follow-up (fot). +#' @param ... pass arguments to \code{sir_exp} +#' +#' +#' @describeIn sir_exp +#' +#' @export + +sir_lex <- function(x, print = NULL, breaks = NULL, ... ) { + + ## R CMD CHECK appeasement + lex.dur <- NULL + + if(!inherits(x, 'Lexis')) { + stop('x has to be a Lexis object (see lexpand or Lexis)') + } + if(!"pop.haz" %in% names(x)) { + stop("Variable pop.haz not found in the data.") + } + + + # reformat date breaks + if(!is.null(breaks)) { + breaks <- lapply(breaks, function(x) { + if(is.character(x)) c(cal.yr(as.Date(x))) + else x + }) + } + + print <- substitute(print) + # copy to retain the attributes + x <- copy(x) + + # guess the first value + first_value <- lapply(c("lex.Cst", "lex.Xst"), function(var) { + if (is.factor(x[[var]])) levels(x[[var]]) else sort(unique(x[[var]])) + }) + first_value <- unique(unlist(first_value))[1] + + col <- x$lex.Xst + set(x, j = "lex.Cst", value = 0L) + set(x, j = "lex.Xst", value = ifelse(col == first_value, 0L, 1L)) + + if(!is.null(breaks)) { + x <- splitMulti(x, breaks = breaks) + } + + a <- copy(attr(x, "time.scales")) + a <- a[!vapply(get_breaks(x), is.null, logical(1))] + x[, d.exp := pop.haz*lex.dur] + + TF <- environment() + + if(any(is.na(x[,d.exp]))) stop('Missing values in either pop.haz or lex.dur.') + x <- aggre(x, by = TF$a, sum.values = 'd.exp') + if(!'from0to1' %in% names(x)) { + stop('Could not find any transitions between states in lexis') + } + x <- sir_exp(x = x, obs = 'from0to1', print = print, exp = 'd.exp', pyrs = 'pyrs', ...) + # override the match.call from sir_exp + attr(x, 'sir.meta')$call <- match.call() + return(x) +} + + +#' SMR method for an \code{aggre} object. +#' +#' @description \code{sir_ag} solves SMR from a \code{\link{aggre}} object +#' calculated using \code{\link{lexpand}}. +#' +#' @describeIn sir_exp +#' +#' @export + +sir_ag <- function(x, obs = 'from0to1', print = attr(x, 'aggre.meta')$by, exp = 'd.exp', pyrs = 'pyrs', ... ) { + + if(!inherits(x, 'aggre')) { + stop('x should be an aggre object (see lexpand or sir_lex)') + } + obs <- substitute(obs) + print <- substitute(print) + + x <- copy(x) + x <- sir_exp(x = x, obs = obs, print = print, exp = 'd.exp', pyrs = 'pyrs', ...) # original + attr(x, 'sir.meta')$call <- match.call() # override the call from sir_exp + x +} + + + +globalVariables(c('observed','expected','p_adj','p_value','temp','coh.observations','coh.personyears', + 'd.exp', 'lower', 'pop.haz', 'sir.hi','sir.lo','upper')) + diff --git a/R/sir_utils.R b/R/sir_utils.R index c10a41a..24ed1f4 100644 --- a/R/sir_utils.R +++ b/R/sir_utils.R @@ -1,113 +1,113 @@ -#' @title Confidence intervals for the ratio of two SIRs/SMRs -#' @author Matti Rantanen -#' @description Calculate ratio of two SIRs/SMRs and the confidence intervals of the ratio. -#' -#' @details Function works with pooled sir-objects i.e. the \code{print} argument in \code{sir} is ignored. -#' Also \code{x} and \code{y} can be a vector of two where first index is the -#' observed cases and second is expected cases (see examples). -#' Note that the ratio of two SIR's is only applicable when the age distributions are similar -#' in both populations. -#' -#' \strong{Formula} -#' -#' The observed number of first sir \code{O1} is considered as a Binomial variable with sample -#' size of \code{O1+O2}. The confidence intervals for Binomial proportion \code{A} -#' is solved using \code{exact} or \code{asymptotic} -#' method. Now the CI for ratio \code{O1/O2} is \code{B = A/(1 - A)}. And further the CI for SIR/SMR -#' is B*E2/E1. (Ederer and Mantel) -#' -#' @param x a sir-object or a vector of two; observed and expected cases. -#' @param y a sir-object or a vector of two; observed and expected cases. -#' @param conf.level the type-I error in confidence intervals, default 0.95 for 95\% CI. -#' @param type How the binomial confidence intervals are calculated (default:) \code{exact} or \code{asymptotic}. -#' @param alternative The null-hypothesis test: (default:) \code{two.sided}, \code{less}, \code{greater} -#' @param digits number of digits in the output -#' -#' @note -#' Parameter \code{alternative} is always \code{two.sided} when parameter -#' \code{type} is set to \code{asymptotic}. -#' -#' @examples -#' ## Ratio for sir-object and the same values given manually: -#' -#' -#' ## create example dataset -#' dt1 <- data.frame(obs = rep(c(5,7), 10), -#' pyrs = rep(c(250,300,350,400), 5), -#' var = 1:20) -#' Ref <- data.frame(obs = rep(c(50,70,80,100), 5), -#' pyrs = rep(c(2500,3000,3500,4000), 5), -#' var = 1:20) -#' ## sir using the function -#' s1 <- sir(coh.data = dt1, coh.obs = obs, coh.pyrs = pyrs, -#' ref.data = Ref, ref.obs = obs, ref.pyrs = pyrs, -#' adjust = var) -#' -#' ## Ratio is simply 1: -#' sir_ratio(s1, c(120, 150)) -#' -#' @seealso \code{\link{sir}} -#' \href{../doc/sir.html}{A SIR calculation vignette} -#' -#' @references Statistics with Confidence: Confidence Intervals and Statistical Guidelines, Douglas Altman -#' -#' @family sir functions -#' -#' @return A vector length of three: sir_ratio, and lower and upper confidence intervals. -#' -#' @export sir_ratio -#' -#' @import data.table -#' @import stats - - -sir_ratio <- function(x, y, digits = 3, alternative = 'two.sided', - conf.level = 0.95, type = 'exact') { - # prepare input values: x - # Tests are located in test_sir script. - if(inherits(x = x, what = 'sir')){ - O1 <- sum(x$observed) - E1 <- sum(x$expected) - } - else if(is.vector(x) && length(x) == 2) { - O1 <- x[1] - E1 <- x[2] - } - else{ - stop('Input x is not correct: x is neighter a vector of 2 nor sir-object') - } - # prepare y: - if(inherits(y,'sir')){ - O2 <- sum(y$observed) - E2 <- sum(y$expected) - } - else if(is.vector(y) && length(y) == 2) { - O2 <- y[1] - E2 <- y[2] - } - else{ - stop('Input y is not correct: y is neighter a vector of 2 nor sir-object') - } - - type <- match.arg(type, c('asymptotic', 'exact'), several.ok = FALSE) - alternative <- match.arg(alternative, c('two.sided','less', 'greater'), several.ok = FALSE) - # conf.level - - p <- O1/(O1+O2) - if(type == 'asymptotic') { - alpha <- (1 - conf.level)/2 - Ex <- p + c(-qnorm(1-alpha),qnorm(1-alpha)) * sqrt((1/(O1+O2))*p*(1-p)) - if( alternative != 'two.sided') { - message('Test changed to two.sided when asymptotic.') - alternative <- 'two.sided' - } - } - if(type == 'exact') { - Ex <- binom.test(c(O1,O2), p = 0.5, alternative = alternative, conf.level = conf.level)$conf.int - } - B = Ex/(1-Ex) - - res <- round(c(sir_ratio = (O1/E1)/(O2/E2), lower=(B*(E2/E1))[1], upper = (B*(E2/E1))[2]), digits = digits) - return(res) -} - +#' @title Confidence intervals for the ratio of two SIRs/SMRs +#' @author Matti Rantanen +#' @description Calculate ratio of two SIRs/SMRs and the confidence intervals of the ratio. +#' +#' @details Function works with pooled sir-objects i.e. the \code{print} argument in \code{sir} is ignored. +#' Also \code{x} and \code{y} can be a vector of two where first index is the +#' observed cases and second is expected cases (see examples). +#' Note that the ratio of two SIR's is only applicable when the age distributions are similar +#' in both populations. +#' +#' \strong{Formula} +#' +#' The observed number of first sir \code{O1} is considered as a Binomial variable with sample +#' size of \code{O1+O2}. The confidence intervals for Binomial proportion \code{A} +#' is solved using \code{exact} or \code{asymptotic} +#' method. Now the CI for ratio \code{O1/O2} is \code{B = A/(1 - A)}. And further the CI for SIR/SMR +#' is B*E2/E1. (Ederer and Mantel) +#' +#' @param x a sir-object or a vector of two; observed and expected cases. +#' @param y a sir-object or a vector of two; observed and expected cases. +#' @param conf.level the type-I error in confidence intervals, default 0.95 for 95\% CI. +#' @param type How the binomial confidence intervals are calculated (default:) \code{exact} or \code{asymptotic}. +#' @param alternative The null-hypothesis test: (default:) \code{two.sided}, \code{less}, \code{greater} +#' @param digits number of digits in the output +#' +#' @note +#' Parameter \code{alternative} is always \code{two.sided} when parameter +#' \code{type} is set to \code{asymptotic}. +#' +#' @examples +#' ## Ratio for sir-object and the same values given manually: +#' +#' +#' ## create example dataset +#' dt1 <- data.frame(obs = rep(c(5,7), 10), +#' pyrs = rep(c(250,300,350,400), 5), +#' var = 1:20) +#' Ref <- data.frame(obs = rep(c(50,70,80,100), 5), +#' pyrs = rep(c(2500,3000,3500,4000), 5), +#' var = 1:20) +#' ## sir using the function +#' s1 <- sir(coh.data = dt1, coh.obs = obs, coh.pyrs = pyrs, +#' ref.data = Ref, ref.obs = obs, ref.pyrs = pyrs, +#' adjust = var) +#' +#' ## Ratio is simply 1: +#' sir_ratio(s1, c(120, 150)) +#' +#' @seealso \code{\link{sir}} +#' \href{../doc/sir.html}{A SIR calculation vignette} +#' +#' @references Statistics with Confidence: Confidence Intervals and Statistical Guidelines, Douglas Altman +#' +#' @family sir functions +#' +#' @return A vector length of three: sir_ratio, and lower and upper confidence intervals. +#' +#' @export sir_ratio +#' +#' @import data.table +#' @import stats + + +sir_ratio <- function(x, y, digits = 3, alternative = 'two.sided', + conf.level = 0.95, type = 'exact') { + # prepare input values: x + # Tests are located in test_sir script. + if(inherits(x = x, what = 'sir')){ + O1 <- sum(x$observed) + E1 <- sum(x$expected) + } + else if(is.vector(x) && length(x) == 2) { + O1 <- x[1] + E1 <- x[2] + } + else{ + stop('Input x is not correct: x is neighter a vector of 2 nor sir-object') + } + # prepare y: + if(inherits(y,'sir')){ + O2 <- sum(y$observed) + E2 <- sum(y$expected) + } + else if(is.vector(y) && length(y) == 2) { + O2 <- y[1] + E2 <- y[2] + } + else{ + stop('Input y is not correct: y is neighter a vector of 2 nor sir-object') + } + + type <- match.arg(type, c('asymptotic', 'exact'), several.ok = FALSE) + alternative <- match.arg(alternative, c('two.sided','less', 'greater'), several.ok = FALSE) + # conf.level + + p <- O1/(O1+O2) + if(type == 'asymptotic') { + alpha <- (1 - conf.level)/2 + Ex <- p + c(-qnorm(1-alpha),qnorm(1-alpha)) * sqrt((1/(O1+O2))*p*(1-p)) + if( alternative != 'two.sided') { + message('Test changed to two.sided when asymptotic.') + alternative <- 'two.sided' + } + } + if(type == 'exact') { + Ex <- binom.test(c(O1,O2), p = 0.5, alternative = alternative, conf.level = conf.level)$conf.int + } + B = Ex/(1-Ex) + + res <- round(c(sir_ratio = (O1/E1)/(O2/E2), lower=(B*(E2/E1))[1], upper = (B*(E2/E1))[2]), digits = digits) + return(res) +} + diff --git a/R/splitLexisDT.R b/R/splitLexisDT.R index 08cef4e..b75cb79 100644 --- a/R/splitLexisDT.R +++ b/R/splitLexisDT.R @@ -1,301 +1,301 @@ -#' @title Split case-level observations -#' @author Joonas Miettinen -#' @description Split a \code{Lexis} object along one time scale -#' (as \code{\link[Epi]{splitLexis}}) with speed -#' @param lex a Lexis object, split or not -#' @param breaks a vector of \code{[a,b)} breaks to split \code{data} by -#' @param timeScale a character string; name of the time scale to split by -#' @param merge logical; if \code{TRUE}, retains all variables -#' from the original data - i.e. original variables are -#' repeated for all the rows by original subject -#' @param drop logical; if \code{TRUE}, drops all resulting rows -#' after expansion that reside outside the time window -#' defined by the given breaks -#' -#' -#' @details -#' -#' \code{splitLexisDT} is in essence a \pkg{data.table} version of -#' \code{splitLexis} or \code{survSplit} for splitting along a single -#' time scale. It requires a Lexis object as input, which may have already -#' been split along some time scale. -#' -#' Unlike \code{splitLexis}, \code{splitLexisDT} drops observed time outside -#' the roof and floor of \code{breaks} by default - with \code{drop = FALSE} -#' the functions have identical behaviour. -#' -#' The \code{Lexis} time scale variables can be of any arbitrary -#' format, e.g. \code{Date}, -#' fractional years (see \code{\link[Epi]{cal.yr}}) and \code{\link{get.yrs}}, -#' or other. However, using \code{date} variables (from package \pkg{date}) -#' are not recommended, as \code{date} variables are always stored as integers, -#' whereas \code{Date} variables (see \code{?as.Date}) are typically stored -#' in double ("numeric") format. This allows for breaking days into fractions -#' as well, when using e.g. hypothetical years of 365.25 days. -#' -#' @return -#' A \code{data.table} or \code{data.frame} -#' (depending on \code{options("popEpi.datatable")}; see \code{?popEpi}) -#' object expanded to accommodate split observations. -#' -#' @export -#' @family splitting functions -#' @examples -#' library(Epi) -#' data("sire", package = "popEpi") -#' x <- Lexis(data=sire[1000:1100, ], -#' entry = list(fot=0, per=get.yrs(dg_date), age=dg_age), -#' exit=list(per=get.yrs(ex_date)), exit.status=status) -#' BL <- list(fot=seq(0, 5, by = 3/12), per=c(2008, 2013)) -#' -#' x2 <- splitMulti(x, breaks = BL, drop = FALSE) -#' -#' x3 <- splitLexisDT(x, breaks = BL$fot, timeScale = "fot", drop = FALSE) -#' x3 <- splitLexisDT(x3, breaks = BL$per, timeScale = "per", drop = FALSE) -#' -#' x4 <- splitLexis(x, breaks = BL$fot, time.scale = "fot") -#' x4 <- splitLexis(x4, breaks = BL$per, time.scale = "per") -#' ## all produce identical results -#' -#' ## using Date variables -#' x <- Lexis(data=sire[1000:1100, ], -#' entry = list(fot=0, per=dg_date, age=dg_date-bi_date), -#' exit=list(per=ex_date), exit.status=status) -#' BL <- list(fot = 0:5*365.25, per = as.Date(c("2008-01-01", "2013-01-01"))) -#' -#' x2 <- splitMulti(x, breaks = BL, drop = FALSE) -#' -#' x3 <- splitLexisDT(x, breaks = BL$fot, timeScale = "fot", drop = FALSE) -#' x3 <- splitLexisDT(x3, breaks = BL$per, timeScale = "per", drop = FALSE) -#' -#' ## splitLexis may not work when using Dates -splitLexisDT <- function(lex, breaks, timeScale, merge = TRUE, drop = TRUE) { - - do_split <- TRUE - - tol <- .Machine$double.eps^0.5 - checkLexisData(lex, check.breaks = FALSE) - - attr_list <- copy(attributes(lex)[c("time.scales", "breaks", "time.since")]) - allScales <- attr_list[["time.scales"]] - allBreaks <- attr_list[["breaks"]] - - if (!timeScale %in% allScales) { - stop("timeScale '", timeScale,"' not among following existing time scales: ", - paste0("'", allScales, "'", collapse = ", ")) - } - - ## lexVars: if !merge, will drop all but these (NOTE: checkLexisData - ## check for existence of these) - lexVars <- c("lex.id", "lex.multi", allScales, "lex.dur", "lex.Cst", "lex.Xst") - lexVars <- intersect(lexVars, names(lex)) - othVars <- setdiff(names(lex), lexVars) - - ## basic checks on breaks - if (drop && length(breaks) == 1L) { - stop("Length of breaks vector is one, but argument 'drop' is TRUE. ", - "Cannot do dropping with only one break. Either supply at least ", - "two breaks or set drop = FALSE.") - } - if (length(breaks) == 0L) { - stop("No breaks supplied (length of breaks is zero).") - } - - ## remove any existing breaks already split by; - ## NOTE: setdiff would break Date format breaks! - orig_breaks <- copy(breaks) - if (length(allBreaks[[timeScale]])) { - ## because any test like (x %in% NULL) results in FALSE. - breaks <- breaks[!breaks %in% allBreaks[[timeScale]]] - } - - breaks <- matchBreakTypes(lex, breaks, timeScale, modify.lex = FALSE) - - if (length(breaks) == 0L || (length(orig_breaks) == 2L && drop)) { - ## former means no additional splitting to do. (we still crop & drop - ## if argument drop = TRUE) - ## latter means we only need to crop & drop. - do_split <- FALSE - breaks <- orig_breaks - } - - breaks <- sort(breaks) - if (!drop) breaks <- protectFromDrop(breaks) - - BL <- list(breaks) - setattr(BL, "names", timeScale) - checkBreaksList(x = lex, breaks = BL) - - - ## use subset lex if dropping for efficiency - orig_lex <- lex - if (drop) { - keepVars <- if (merge) NULL else lexVars ## NULL: all vars - lex <- subsetDTorDF(lex, select = keepVars) - rm(keepVars) - lex <- data.table(lex) - - setattr(lex, "class", c("Lexis", "data.table", "data.frame")) - - lex <- intelliCrop(lex, breaks = BL, allScales = allScales, - cropStatuses = TRUE, tol = tol) - lex <- intelliDrop(lex, breaks = BL, dropNegDur = TRUE, - check = FALSE, tol = tol) - } - - if (!do_split) { - - l <- if (!drop) copy(lex) else lex - - } else { - - ## currently cannot handle NA values in split time scale; will add them in - ## the end - ts_is_na <- is.na(lex[[timeScale]]) - ts_any_na <- any(ts_is_na) - if (ts_any_na) { - warning("NA values in the time scale you are splitting along ('", - timeScale,"'). Results may deviate from that produced by ", - "splitLexis from package Epi. For safety you may want to split ", - "using only the data with no NA values and combine the the split", - " data with the NA-valued data using rbind.") - lex_na <- lex[ts_is_na, ] - lex <- lex[!ts_is_na, ] - } - - - ## will use this due to step below (and laziness) - ts_values <- lex[[timeScale]] - ## Date objects are based on doubles and therefore keep the most information - if (inherits(ts_values, c("IDate", "date", "dates"))) ts_values <- as.Date(ts_values) - - N_expand <- length(breaks) - N_subjects <- nrow(lex) - - ## use tmp id to ensure correct status rolling ----------------------------- - id_dt <- data.table( - tmp_id_values = 1:nrow(lex), - orig_id_values = lex[["lex.id"]] - ) - on.exit(set(lex, j = "lex.id", value = id_dt[["orig_id_values"]])) - set(lex, j = "lex.id", value = id_dt[["tmp_id_values"]]) - - ## quick data expansion ------------------------------------------------------ - - l <- vector(mode = "list", length = N_expand) - l[[1]] <- data.table(lex) - - if (!merge) setcolsnull(l[[1]], keep = lexVars, soft = FALSE) - - tmpID <- makeTempVarName(data = l[[1]], pre = "TEMP_SPLITTING_ID") - tmpIE <- makeTempVarName(data = l[[1]], pre = "TEMP_SPLIT_INT_END") - - set(l[[1]], j = tmpID, value = 1:nrow(l[[1]])) - if (N_expand > 1L) { - for (k in 2:(N_expand)) { - l[[k]] <- l[[1]] - } - } - - l <- rbindlist(l) - - ## time scale value determination -------------------------------------------- - set(l, j = tmpIE, value = rep(breaks, each = N_subjects)) - set(l, j = tmpIE, value = pmin(l[[tmpIE]], l[[timeScale]] + l$lex.dur) ) - set(l, j = timeScale, value = c( - ts_values, - pmax(ts_values, rep(breaks[-length(breaks)], each = N_subjects)) - )) - - set(l, j = "lex.dur", value = l[[tmpIE]] - l[[timeScale]] ) - - ## other time scale values --------------------------------------------------- - otherScales <- setdiff(allScales, timeScale) - if (length(otherScales) > 0) { - ## change in timeScale - ts_delta <- l[[timeScale]] - ts_values - for (k in otherScales) { - set(l, j = k, value = lex[[k]] + ts_delta) - } - } - - ## dropping ---------------------------------------------------------------- - ## drops very very small intervals as well as dur <= 0 - has_zero_dur <- l[["lex.dur"]] < tol - if (any(has_zero_dur)) { - l <- l[!has_zero_dur] - } - - - - ## roll states ------------------------------------------------------------- - # this avoids duplicate deaths, etc., where appropriate. - setkeyv(l, c("lex.id", timeScale)) - lex_id <- mget_cols(c("lex.Cst", "lex.Xst", "lex.id"), data = lex) - setattr(lex_id, "time.scales", allScales) - roll_lexis_status_inplace( - unsplit.data = lex_id, split.data = l, id.var = "lex.id" - ) - rm("lex_id") - - - set(l, j = c(tmpIE, tmpID), value = NULL) - - ## revert to original IDs -------------------------------------------------- - set(l, j = "lex.id", value = { - id_dt[ - i = list(tmp_id_values = l$lex.id), - j = .SD, - on = "tmp_id_values", - .SDcols = "orig_id_values" - ] - }) - - if (ts_any_na) { - l <- rbind(l, lex_na) - setkeyv(l, c("lex.id", timeScale)) - } - - } - - ## harmonize statuses -------------------------------------------------------- - harmonizeStatuses(x = l, C = "lex.Cst", X = "lex.Xst") - - - ## ensure time scales and lex.dur have same (ish) class as before ------------ - for (k in c(allScales, "lex.dur")) { - - if (inherits(orig_lex[[k]], "difftime") && !inherits(l[[k]], "difftime")){ - setattr(l[[k]], "class", "difftime") - setattr(l[[k]], "units", attr(orig_lex[[k]], "units")) - } else if (is.numeric(orig_lex[[k]]) && inherits(l[[k]], "difftime")) { - set(l, j = k, value = as.numeric(l[[k]])) - } - - } - - ## harmonize time scales ----------------------------------------------------- - ## numeric time scales are forced to the lowest common denominator: - ## difftime -> integer -> double (though difftime is not numeric class) - harmonizeNumericTimeScales(l, times = c(allScales, "lex.dur")) - - - ## final touch & attributes -------------------------------------------------- - setcolorder(l, neworder = intersect(c(lexVars, othVars), names(l))) #merge=T/F - if (!drop) breaks <- unprotectFromDrop(breaks) - allBreaks[[timeScale]] <- sort(unique(c(allBreaks[[timeScale]], breaks))) - - allBreaks <- lapply(allScales, function(scale_nm) { - allBreaks[[scale_nm]] ## intentionally NULL if not there - }) - names(allBreaks) <- allScales - - setattr(l, "breaks", allBreaks) - setattr(l, "time.scales", allScales) - setattr(l, "time.since", attr_list[["time.since"]]) - setattr(l, "class", c("Lexis","data.table","data.frame")) - if (!return_DT()) setDFpe(l) - - l[] -} - +#' @title Split case-level observations +#' @author Joonas Miettinen +#' @description Split a \code{Lexis} object along one time scale +#' (as \code{\link[Epi]{splitLexis}}) with speed +#' @param lex a Lexis object, split or not +#' @param breaks a vector of \code{[a,b)} breaks to split \code{data} by +#' @param timeScale a character string; name of the time scale to split by +#' @param merge logical; if \code{TRUE}, retains all variables +#' from the original data - i.e. original variables are +#' repeated for all the rows by original subject +#' @param drop logical; if \code{TRUE}, drops all resulting rows +#' after expansion that reside outside the time window +#' defined by the given breaks +#' +#' +#' @details +#' +#' \code{splitLexisDT} is in essence a \pkg{data.table} version of +#' \code{splitLexis} or \code{survSplit} for splitting along a single +#' time scale. It requires a Lexis object as input, which may have already +#' been split along some time scale. +#' +#' Unlike \code{splitLexis}, \code{splitLexisDT} drops observed time outside +#' the roof and floor of \code{breaks} by default - with \code{drop = FALSE} +#' the functions have identical behaviour. +#' +#' The \code{Lexis} time scale variables can be of any arbitrary +#' format, e.g. \code{Date}, +#' fractional years (see \code{\link[Epi]{cal.yr}}) and \code{\link{get.yrs}}, +#' or other. However, using \code{date} variables (from package \pkg{date}) +#' are not recommended, as \code{date} variables are always stored as integers, +#' whereas \code{Date} variables (see \code{?as.Date}) are typically stored +#' in double ("numeric") format. This allows for breaking days into fractions +#' as well, when using e.g. hypothetical years of 365.25 days. +#' +#' @return +#' A \code{data.table} or \code{data.frame} +#' (depending on \code{options("popEpi.datatable")}; see \code{?popEpi}) +#' object expanded to accommodate split observations. +#' +#' @export +#' @family splitting functions +#' @examples +#' library(Epi) +#' data("sire", package = "popEpi") +#' x <- Lexis(data=sire[1000:1100, ], +#' entry = list(fot=0, per=get.yrs(dg_date), age=dg_age), +#' exit=list(per=get.yrs(ex_date)), exit.status=status) +#' BL <- list(fot=seq(0, 5, by = 3/12), per=c(2008, 2013)) +#' +#' x2 <- splitMulti(x, breaks = BL, drop = FALSE) +#' +#' x3 <- splitLexisDT(x, breaks = BL$fot, timeScale = "fot", drop = FALSE) +#' x3 <- splitLexisDT(x3, breaks = BL$per, timeScale = "per", drop = FALSE) +#' +#' x4 <- splitLexis(x, breaks = BL$fot, time.scale = "fot") +#' x4 <- splitLexis(x4, breaks = BL$per, time.scale = "per") +#' ## all produce identical results +#' +#' ## using Date variables +#' x <- Lexis(data=sire[1000:1100, ], +#' entry = list(fot=0, per=dg_date, age=dg_date-bi_date), +#' exit=list(per=ex_date), exit.status=status) +#' BL <- list(fot = 0:5*365.25, per = as.Date(c("2008-01-01", "2013-01-01"))) +#' +#' x2 <- splitMulti(x, breaks = BL, drop = FALSE) +#' +#' x3 <- splitLexisDT(x, breaks = BL$fot, timeScale = "fot", drop = FALSE) +#' x3 <- splitLexisDT(x3, breaks = BL$per, timeScale = "per", drop = FALSE) +#' +#' ## splitLexis may not work when using Dates +splitLexisDT <- function(lex, breaks, timeScale, merge = TRUE, drop = TRUE) { + + do_split <- TRUE + + tol <- .Machine$double.eps^0.5 + checkLexisData(lex, check.breaks = FALSE) + + attr_list <- copy(attributes(lex)[c("time.scales", "breaks", "time.since")]) + allScales <- attr_list[["time.scales"]] + allBreaks <- attr_list[["breaks"]] + + if (!timeScale %in% allScales) { + stop("timeScale '", timeScale,"' not among following existing time scales: ", + paste0("'", allScales, "'", collapse = ", ")) + } + + ## lexVars: if !merge, will drop all but these (NOTE: checkLexisData + ## check for existence of these) + lexVars <- c("lex.id", "lex.multi", allScales, "lex.dur", "lex.Cst", "lex.Xst") + lexVars <- intersect(lexVars, names(lex)) + othVars <- setdiff(names(lex), lexVars) + + ## basic checks on breaks + if (drop && length(breaks) == 1L) { + stop("Length of breaks vector is one, but argument 'drop' is TRUE. ", + "Cannot do dropping with only one break. Either supply at least ", + "two breaks or set drop = FALSE.") + } + if (length(breaks) == 0L) { + stop("No breaks supplied (length of breaks is zero).") + } + + ## remove any existing breaks already split by; + ## NOTE: setdiff would break Date format breaks! + orig_breaks <- copy(breaks) + if (length(allBreaks[[timeScale]])) { + ## because any test like (x %in% NULL) results in FALSE. + breaks <- breaks[!breaks %in% allBreaks[[timeScale]]] + } + + breaks <- matchBreakTypes(lex, breaks, timeScale, modify.lex = FALSE) + + if (length(breaks) == 0L || (length(orig_breaks) == 2L && drop)) { + ## former means no additional splitting to do. (we still crop & drop + ## if argument drop = TRUE) + ## latter means we only need to crop & drop. + do_split <- FALSE + breaks <- orig_breaks + } + + breaks <- sort(breaks) + if (!drop) breaks <- protectFromDrop(breaks) + + BL <- list(breaks) + setattr(BL, "names", timeScale) + checkBreaksList(x = lex, breaks = BL) + + + ## use subset lex if dropping for efficiency + orig_lex <- lex + if (drop) { + keepVars <- if (merge) NULL else lexVars ## NULL: all vars + lex <- subsetDTorDF(lex, select = keepVars) + rm(keepVars) + lex <- data.table(lex) + + setattr(lex, "class", c("Lexis", "data.table", "data.frame")) + + lex <- intelliCrop(lex, breaks = BL, allScales = allScales, + cropStatuses = TRUE, tol = tol) + lex <- intelliDrop(lex, breaks = BL, dropNegDur = TRUE, + check = FALSE, tol = tol) + } + + if (!do_split) { + + l <- if (!drop) copy(lex) else lex + + } else { + + ## currently cannot handle NA values in split time scale; will add them in + ## the end + ts_is_na <- is.na(lex[[timeScale]]) + ts_any_na <- any(ts_is_na) + if (ts_any_na) { + warning("NA values in the time scale you are splitting along ('", + timeScale,"'). Results may deviate from that produced by ", + "splitLexis from package Epi. For safety you may want to split ", + "using only the data with no NA values and combine the the split", + " data with the NA-valued data using rbind.") + lex_na <- lex[ts_is_na, ] + lex <- lex[!ts_is_na, ] + } + + + ## will use this due to step below (and laziness) + ts_values <- lex[[timeScale]] + ## Date objects are based on doubles and therefore keep the most information + if (inherits(ts_values, c("IDate", "date", "dates"))) ts_values <- as.Date(ts_values) + + N_expand <- length(breaks) + N_subjects <- nrow(lex) + + ## use tmp id to ensure correct status rolling ----------------------------- + id_dt <- data.table( + tmp_id_values = 1:nrow(lex), + orig_id_values = lex[["lex.id"]] + ) + on.exit(set(lex, j = "lex.id", value = id_dt[["orig_id_values"]])) + set(lex, j = "lex.id", value = id_dt[["tmp_id_values"]]) + + ## quick data expansion ------------------------------------------------------ + + l <- vector(mode = "list", length = N_expand) + l[[1]] <- data.table(lex) + + if (!merge) setcolsnull(l[[1]], keep = lexVars, soft = FALSE) + + tmpID <- makeTempVarName(data = l[[1]], pre = "TEMP_SPLITTING_ID") + tmpIE <- makeTempVarName(data = l[[1]], pre = "TEMP_SPLIT_INT_END") + + set(l[[1]], j = tmpID, value = 1:nrow(l[[1]])) + if (N_expand > 1L) { + for (k in 2:(N_expand)) { + l[[k]] <- l[[1]] + } + } + + l <- rbindlist(l) + + ## time scale value determination -------------------------------------------- + set(l, j = tmpIE, value = rep(breaks, each = N_subjects)) + set(l, j = tmpIE, value = pmin(l[[tmpIE]], l[[timeScale]] + l$lex.dur) ) + set(l, j = timeScale, value = c( + ts_values, + pmax(ts_values, rep(breaks[-length(breaks)], each = N_subjects)) + )) + + set(l, j = "lex.dur", value = l[[tmpIE]] - l[[timeScale]] ) + + ## other time scale values --------------------------------------------------- + otherScales <- setdiff(allScales, timeScale) + if (length(otherScales) > 0) { + ## change in timeScale + ts_delta <- l[[timeScale]] - ts_values + for (k in otherScales) { + set(l, j = k, value = lex[[k]] + ts_delta) + } + } + + ## dropping ---------------------------------------------------------------- + ## drops very very small intervals as well as dur <= 0 + has_zero_dur <- l[["lex.dur"]] < tol + if (any(has_zero_dur)) { + l <- l[!has_zero_dur] + } + + + + ## roll states ------------------------------------------------------------- + # this avoids duplicate deaths, etc., where appropriate. + setkeyv(l, c("lex.id", timeScale)) + lex_id <- mget_cols(c("lex.Cst", "lex.Xst", "lex.id"), data = lex) + setattr(lex_id, "time.scales", allScales) + roll_lexis_status_inplace( + unsplit.data = lex_id, split.data = l, id.var = "lex.id" + ) + rm("lex_id") + + + set(l, j = c(tmpIE, tmpID), value = NULL) + + ## revert to original IDs -------------------------------------------------- + set(l, j = "lex.id", value = { + id_dt[ + i = list(tmp_id_values = l$lex.id), + j = .SD, + on = "tmp_id_values", + .SDcols = "orig_id_values" + ] + }) + + if (ts_any_na) { + l <- rbind(l, lex_na) + setkeyv(l, c("lex.id", timeScale)) + } + + } + + ## harmonize statuses -------------------------------------------------------- + harmonizeStatuses(x = l, C = "lex.Cst", X = "lex.Xst") + + + ## ensure time scales and lex.dur have same (ish) class as before ------------ + for (k in c(allScales, "lex.dur")) { + + if (inherits(orig_lex[[k]], "difftime") && !inherits(l[[k]], "difftime")){ + setattr(l[[k]], "class", "difftime") + setattr(l[[k]], "units", attr(orig_lex[[k]], "units")) + } else if (is.numeric(orig_lex[[k]]) && inherits(l[[k]], "difftime")) { + set(l, j = k, value = as.numeric(l[[k]])) + } + + } + + ## harmonize time scales ----------------------------------------------------- + ## numeric time scales are forced to the lowest common denominator: + ## difftime -> integer -> double (though difftime is not numeric class) + harmonizeNumericTimeScales(l, times = c(allScales, "lex.dur")) + + + ## final touch & attributes -------------------------------------------------- + setcolorder(l, neworder = intersect(c(lexVars, othVars), names(l))) #merge=T/F + if (!drop) breaks <- unprotectFromDrop(breaks) + allBreaks[[timeScale]] <- sort(unique(c(allBreaks[[timeScale]], breaks))) + + allBreaks <- lapply(allScales, function(scale_nm) { + allBreaks[[scale_nm]] ## intentionally NULL if not there + }) + names(allBreaks) <- allScales + + setattr(l, "breaks", allBreaks) + setattr(l, "time.scales", allScales) + setattr(l, "time.since", attr_list[["time.since"]]) + setattr(l, "class", c("Lexis","data.table","data.frame")) + if (!return_DT()) setDFpe(l) + + l[] +} + diff --git a/R/splitMulti.R b/R/splitMulti.R index 9ae1b48..d716bb7 100644 --- a/R/splitMulti.R +++ b/R/splitMulti.R @@ -1,269 +1,269 @@ -#' @title Split case-level observations -#' @author Joonas Miettinen -#' @description Split a \code{Lexis} object along multiple time scales -#' with speed and ease -#' @param data a Lexis object with event cases as rows -#' @param breaks a list of named numeric vectors of breaks; see Details and Examples -#' @param ... alternate way of supplying breaks as named vectors; -#' e.g. \code{fot = 0:5} instead of \code{breaks = list(fot = 0:5)}; -#' if \code{breaks} is not \code{NULL}, \code{breaks} is used and any breaks -#' passed through \code{...} are NOT used -#' @param drop logical; if \code{TRUE}, drops all resulting rows -#' after expansion that reside outside the time window -#' defined by the given breaks -#' @param merge logical; if \code{TRUE}, retains all variables -#' from the original data - i.e. original variables are -#' repeated for all the rows by original subject -#' @param verbose logical; if \code{TRUE}, the function is chatty -#' and returns some messages along the way -#' -#' -#' @details -#' -#' \code{splitMulti} is in essence a \pkg{data.table} version of -#' \code{splitLexis} or \code{survSplit} for splitting along multiple -#' time scales. -#' It requires a Lexis object as input. -#' -#' The \code{breaks} must be a list of named vectors of the appropriate type. -#' The breaks are fully explicit and -#' left-inclusive and right exclusive, e.g. \code{fot=c(0,5)} -#' forces the data to only include time between -#' \code{[0,5)} for each original row (unless \code{drop = FALSE}). -#' Use \code{Inf} or \code{-Inf} for open-ended intervals, -#' e.g. \code{per=c(1990,1995,Inf)} creates the intervals -#' \code{[1990,1995), [1995, Inf)}. -#' -#' Instead of specifying \code{breaks}, one may make use of the \code{...} -#' argument to pass breaks: e.g. -#' -#' \code{splitMulti(x, breaks = list(fot = 0:5))} -#' -#' is equivalent to -#' -#' \code{splitMulti(x, fot = 0:5)}. -#' -#' Multiple breaks can be supplied in the same manner. However, if both -#' \code{breaks} and \code{...} are used, only the breaks in \code{breaks} -#' are utilized within the function. -#' -#' The \code{Lexis} time scale variables can be of any arbitrary -#' format, e.g. \code{Date}, -#' fractional years (see \code{\link[Epi]{cal.yr}}) and \code{\link{get.yrs}}, -#' or other. However, using \code{date} variables (from package \pkg{date}) -#' are not recommended, as \code{date} variables are always stored as integers, -#' whereas \code{Date} variables (see \code{?as.Date}) are typically stored -#' in double ("numeric") format. This allows for breaking days into fractions -#' as well, when using e.g. hypothetical years of 365.25 days. -#' -#' @return -#' A \code{data.table} or \code{data.frame} -#' (depending on \code{options("popEpi.datatable")}; see \code{?popEpi}) -#' object expanded to accommodate split observations. -#' -#' @examples -#' #### let's prepare data for computing period method survivals -#' #### in case there are problems with dates, we first -#' #### convert to fractional years. -#' \dontrun{ -#' library(Epi) -#' data("sire", package = "popEpi") -#' x <- Lexis(data=sire, entry = list(fot=0, per=get.yrs(dg_date), age=dg_age), -#' exit=list(per=get.yrs(ex_date)), exit.status=status) -#' x2 <- splitMulti(x, breaks = list(fot=seq(0, 5, by = 3/12), per=c(2008, 2013))) -#' # equivalently: -#' x2 <- splitMulti(x, fot=seq(0, 5, by = 3/12), per=c(2008, 2013)) -#' -#' ## using dates; note: breaks must be expressed as dates or days! -#' x <- Lexis(data=sire, entry = list(fot=0, per=dg_date, age=dg_date-bi_date), -#' exit=list(per=ex_date), exit.status=status) -#' BL <- list(fot = seq(0, 5, by = 3/12)*365.242199, -#' per = as.Date(paste0(c(1980:2014),"-01-01")), -#' age = c(0,45,85,Inf)*365.242199) -#' x2 <- splitMulti(x, breaks = BL, verbose=TRUE) -#' -#' -#' ## multistate example (healty - sick - dead) -#' sire2 <- data.frame(sire) -#' -#' set.seed(1L) -#' not_sick <- sample.int(nrow(sire2), 6000L, replace = FALSE) -#' sire2[not_sick, ]$dg_date <- NA -#' sire2[!is.na(sire2$dg_date) & sire2$status == 0, ]$status <- -1 -#' -#' sire2$status[sire2$status==2] <- 1 -#' sire2$status <- factor(sire2$status, levels = c(0, -1, 1), -#' labels = c("healthy", "sick", "dead")) -#' -#' xm <- Lexis(data=sire2, entry = list(fot=0, per=get.yrs(bi_date), age=0), -#' exit=list(per=get.yrs(ex_date)), exit.status=status) -#' xm2 <- cutLexis(xm, cut = get.yrs(xm$dg_date), -#' timescale = "per", -#' new.state = "sick") -#' xm2[xm2$lex.id == 6L, ] -#' -#' xm2 <- splitMulti(xm2, breaks = list(fot = seq(0,150,25))) -#' xm2[xm2$lex.id == 6L, ] -#' } -#' -#' @import data.table -#' @import Epi -#' -#' @export -#' @family splitting functions -#' @seealso -#' \code{\link[Epi]{splitLexis}}, \code{\link[Epi]{Lexis}}, -#' \code{\link[survival]{survSplit}} -#' -splitMulti <- function(data, - breaks = NULL, - ..., - drop=TRUE, - merge=TRUE, - verbose=FALSE) { - - lex.id <- lex.dur <- NULL ## APPEASE R CMD CHECK - - ## basic checks -------------------------------------------------------------- - if (verbose) {stime <- proc.time()} - - breaks <- splitMultiPreCheck(data = data, breaks = breaks, ...) - - ## collect necessary data ---------------------------------------------------- - attr_list <- copy(attributes(data)[c("time.scales", "breaks", "time.since")]) - allScales <- attr_list$time.scales - splitScales <- names(breaks) - - keep_nms <- if (merge) names(data) else { - intersect( - names(data), - c("lex.id", "lex.Cst", "lex.Xst", allScales) - ) - } - # this is not a copy! - dt <- mget_cols(keep_nms, data = data) - forceLexisDT(dt, breaks = attr(data, "breaks"), allScales = allScales, - key = FALSE) - - ## check if even need to do splitting ---------------------------------------- - - oldBreaks <- copy(attr(data, "breaks")) - tryCatch(checkBreaksList(data, oldBreaks), error = function(e) { - stop("Error in splitMulti: \n", - "Old breaks existing in Lexis data did not pass testing. Error ", - "message from test: \n", e, call. = FALSE) - }) - - ## only do split if all breaks are NOT in the breaks that the data - ## has already been split by. - do_split <- TRUE - do_split <- !all_breaks_in(breaks, oldBreaks, x = data) - - if (!do_split) { - l <- setDT(copy(dt)) - setkeyv(l, c("lex.id", allScales[1])) - } else { - - ## temp IDS ---------------------------------------------------------------- - # used to ensure correct splitting and lex status rolling - - id_dt <- data.table( - orig_id_values = dt$lex.id, - temp_id_values = 1:nrow(dt), - key = "temp_id_values" - ) - - on.exit(set(dt, j = "lex.id", value = id_dt[["orig_id_values"]])) - set(dt, j = "lex.id", value = id_dt[["temp_id_values"]]) - - l <- vector(mode = "list", length = length(splitScales)) - setattr(l, "names", splitScales) - for (v in splitScales) { - l[[v]] <- splitLexisDT(dt, breaks = breaks[[v]], - merge = merge, drop = FALSE, timeScale = v) - breaks[[v]] <- attr(l[[v]], "breaks")[[v]] - } - l <- rbindlist(l) - - s1 <- allScales[1] - setkeyv(l, c("lex.id", s1)) - - if (length(splitScales) > 1L) { - ## roll time scale values, re-compute interval lengths (lex.dur) --------- - - tmp_ie <- makeTempVarName(names = names(l), pre = "TEMP_INT_END_") - l[, (tmp_ie) := shift(.SD, n = 1, type = "lead"), - .SDcols = s1, by = "lex.id"] - is_last_row <- is.na(l[[tmp_ie]]) - - l[is_last_row, (tmp_ie) := lex.dur + .SD, .SDcols = s1] - - set(l, j = "lex.dur", value = l[[tmp_ie]] - l[[s1]]) - set(l, j = tmp_ie, value = NULL) - } - - has_zero_dur <- l[["lex.dur"]] < .Machine$double.eps^0.5 - if (any(has_zero_dur)) { - l <- l[!has_zero_dur, ] - } - - ## ensure statuses are as expected ----------------------------------------- - - - setkeyv(l, c("lex.id", s1)) - roll_lexis_status_inplace( - unsplit.data = dt, split.data = l, id.var = "lex.id" - ) - - ## dt$lex.id from temporary values to original values ---------------------- - # merge in correct IDs also to split data - on.exit() - set(dt, j = "lex.id", value = id_dt$lex.id) - - - tmpID <- makeTempVarName(names = names(l), pre = "TEMP_SPLITMULTI_ID_") - setnames(l, old = "lex.id", new = tmpID) - set(l, j = "lex.id", value = {id_dt[ - i = .(l[[tmpID]]), - j = .SD, - on = "temp_id_values", - .SDcols = "orig_id_values" - ]}) - set(l, j = tmpID, value = NULL) - rm("id_dt") - - } - - if (drop) l <- intelliDrop(l, breaks = breaks, dropNegDur = FALSE) - - if (nrow(l) == 0) { - warning("no data left after dropping; check breaks?") - } - - order <- c("lex.id", "lex.multi", allScales, "lex.dur", "lex.Cst", "lex.Xst") - order <- c(order, setdiff(names(l), order)) - order <- intersect(order, names(l)) - setcolorder(l, order) - - if (verbose) cat("time taken by splitting process: ", timetaken(stime), "\n") - - - breaks <- lapply(allScales, function(scale_nm) { - ## allowed to NULL also - br <- c(breaks[[scale_nm]], oldBreaks[[scale_nm]]) - if (is.null(br)) return(br) - sort(unique(br)) - }) - names(breaks) <- allScales - - setattr(l, "time.scales", allScales) - setattr(l, "time.since", attr_list[["time.since"]]) - setattr(l, "breaks", breaks) - setattr(l, "class", c("Lexis","data.table","data.frame")) - if (!return_DT()) setDFpe(l) - - l[] - -} - -globalVariables(".") +#' @title Split case-level observations +#' @author Joonas Miettinen +#' @description Split a \code{Lexis} object along multiple time scales +#' with speed and ease +#' @param data a Lexis object with event cases as rows +#' @param breaks a list of named numeric vectors of breaks; see Details and Examples +#' @param ... alternate way of supplying breaks as named vectors; +#' e.g. \code{fot = 0:5} instead of \code{breaks = list(fot = 0:5)}; +#' if \code{breaks} is not \code{NULL}, \code{breaks} is used and any breaks +#' passed through \code{...} are NOT used +#' @param drop logical; if \code{TRUE}, drops all resulting rows +#' after expansion that reside outside the time window +#' defined by the given breaks +#' @param merge logical; if \code{TRUE}, retains all variables +#' from the original data - i.e. original variables are +#' repeated for all the rows by original subject +#' @param verbose logical; if \code{TRUE}, the function is chatty +#' and returns some messages along the way +#' +#' +#' @details +#' +#' \code{splitMulti} is in essence a \pkg{data.table} version of +#' \code{splitLexis} or \code{survSplit} for splitting along multiple +#' time scales. +#' It requires a Lexis object as input. +#' +#' The \code{breaks} must be a list of named vectors of the appropriate type. +#' The breaks are fully explicit and +#' left-inclusive and right exclusive, e.g. \code{fot=c(0,5)} +#' forces the data to only include time between +#' \code{[0,5)} for each original row (unless \code{drop = FALSE}). +#' Use \code{Inf} or \code{-Inf} for open-ended intervals, +#' e.g. \code{per=c(1990,1995,Inf)} creates the intervals +#' \code{[1990,1995), [1995, Inf)}. +#' +#' Instead of specifying \code{breaks}, one may make use of the \code{...} +#' argument to pass breaks: e.g. +#' +#' \code{splitMulti(x, breaks = list(fot = 0:5))} +#' +#' is equivalent to +#' +#' \code{splitMulti(x, fot = 0:5)}. +#' +#' Multiple breaks can be supplied in the same manner. However, if both +#' \code{breaks} and \code{...} are used, only the breaks in \code{breaks} +#' are utilized within the function. +#' +#' The \code{Lexis} time scale variables can be of any arbitrary +#' format, e.g. \code{Date}, +#' fractional years (see \code{\link[Epi]{cal.yr}}) and \code{\link{get.yrs}}, +#' or other. However, using \code{date} variables (from package \pkg{date}) +#' are not recommended, as \code{date} variables are always stored as integers, +#' whereas \code{Date} variables (see \code{?as.Date}) are typically stored +#' in double ("numeric") format. This allows for breaking days into fractions +#' as well, when using e.g. hypothetical years of 365.25 days. +#' +#' @return +#' A \code{data.table} or \code{data.frame} +#' (depending on \code{options("popEpi.datatable")}; see \code{?popEpi}) +#' object expanded to accommodate split observations. +#' +#' @examples +#' #### let's prepare data for computing period method survivals +#' #### in case there are problems with dates, we first +#' #### convert to fractional years. +#' \dontrun{ +#' library(Epi) +#' data("sire", package = "popEpi") +#' x <- Lexis(data=sire, entry = list(fot=0, per=get.yrs(dg_date), age=dg_age), +#' exit=list(per=get.yrs(ex_date)), exit.status=status) +#' x2 <- splitMulti(x, breaks = list(fot=seq(0, 5, by = 3/12), per=c(2008, 2013))) +#' # equivalently: +#' x2 <- splitMulti(x, fot=seq(0, 5, by = 3/12), per=c(2008, 2013)) +#' +#' ## using dates; note: breaks must be expressed as dates or days! +#' x <- Lexis(data=sire, entry = list(fot=0, per=dg_date, age=dg_date-bi_date), +#' exit=list(per=ex_date), exit.status=status) +#' BL <- list(fot = seq(0, 5, by = 3/12)*365.242199, +#' per = as.Date(paste0(c(1980:2014),"-01-01")), +#' age = c(0,45,85,Inf)*365.242199) +#' x2 <- splitMulti(x, breaks = BL, verbose=TRUE) +#' +#' +#' ## multistate example (healty - sick - dead) +#' sire2 <- data.frame(sire) +#' +#' set.seed(1L) +#' not_sick <- sample.int(nrow(sire2), 6000L, replace = FALSE) +#' sire2[not_sick, ]$dg_date <- NA +#' sire2[!is.na(sire2$dg_date) & sire2$status == 0, ]$status <- -1 +#' +#' sire2$status[sire2$status==2] <- 1 +#' sire2$status <- factor(sire2$status, levels = c(0, -1, 1), +#' labels = c("healthy", "sick", "dead")) +#' +#' xm <- Lexis(data=sire2, entry = list(fot=0, per=get.yrs(bi_date), age=0), +#' exit=list(per=get.yrs(ex_date)), exit.status=status) +#' xm2 <- cutLexis(xm, cut = get.yrs(xm$dg_date), +#' timescale = "per", +#' new.state = "sick") +#' xm2[xm2$lex.id == 6L, ] +#' +#' xm2 <- splitMulti(xm2, breaks = list(fot = seq(0,150,25))) +#' xm2[xm2$lex.id == 6L, ] +#' } +#' +#' @import data.table +#' @import Epi +#' +#' @export +#' @family splitting functions +#' @seealso +#' \code{\link[Epi]{splitLexis}}, \code{\link[Epi]{Lexis}}, +#' \code{\link[survival]{survSplit}} +#' +splitMulti <- function(data, + breaks = NULL, + ..., + drop=TRUE, + merge=TRUE, + verbose=FALSE) { + + lex.id <- lex.dur <- NULL ## APPEASE R CMD CHECK + + ## basic checks -------------------------------------------------------------- + if (verbose) {stime <- proc.time()} + + breaks <- splitMultiPreCheck(data = data, breaks = breaks, ...) + + ## collect necessary data ---------------------------------------------------- + attr_list <- copy(attributes(data)[c("time.scales", "breaks", "time.since")]) + allScales <- attr_list$time.scales + splitScales <- names(breaks) + + keep_nms <- if (merge) names(data) else { + intersect( + names(data), + c("lex.id", "lex.Cst", "lex.Xst", allScales) + ) + } + # this is not a copy! + dt <- mget_cols(keep_nms, data = data) + forceLexisDT(dt, breaks = attr(data, "breaks"), allScales = allScales, + key = FALSE) + + ## check if even need to do splitting ---------------------------------------- + + oldBreaks <- copy(attr(data, "breaks")) + tryCatch(checkBreaksList(data, oldBreaks), error = function(e) { + stop("Error in splitMulti: \n", + "Old breaks existing in Lexis data did not pass testing. Error ", + "message from test: \n", e, call. = FALSE) + }) + + ## only do split if all breaks are NOT in the breaks that the data + ## has already been split by. + do_split <- TRUE + do_split <- !all_breaks_in(breaks, oldBreaks, x = data) + + if (!do_split) { + l <- setDT(copy(dt)) + setkeyv(l, c("lex.id", allScales[1])) + } else { + + ## temp IDS ---------------------------------------------------------------- + # used to ensure correct splitting and lex status rolling + + id_dt <- data.table( + orig_id_values = dt$lex.id, + temp_id_values = 1:nrow(dt), + key = "temp_id_values" + ) + + on.exit(set(dt, j = "lex.id", value = id_dt[["orig_id_values"]])) + set(dt, j = "lex.id", value = id_dt[["temp_id_values"]]) + + l <- vector(mode = "list", length = length(splitScales)) + setattr(l, "names", splitScales) + for (v in splitScales) { + l[[v]] <- splitLexisDT(dt, breaks = breaks[[v]], + merge = merge, drop = FALSE, timeScale = v) + breaks[[v]] <- attr(l[[v]], "breaks")[[v]] + } + l <- rbindlist(l) + + s1 <- allScales[1] + setkeyv(l, c("lex.id", s1)) + + if (length(splitScales) > 1L) { + ## roll time scale values, re-compute interval lengths (lex.dur) --------- + + tmp_ie <- makeTempVarName(names = names(l), pre = "TEMP_INT_END_") + l[, (tmp_ie) := shift(.SD, n = 1, type = "lead"), + .SDcols = s1, by = "lex.id"] + is_last_row <- is.na(l[[tmp_ie]]) + + l[is_last_row, (tmp_ie) := lex.dur + .SD, .SDcols = s1] + + set(l, j = "lex.dur", value = l[[tmp_ie]] - l[[s1]]) + set(l, j = tmp_ie, value = NULL) + } + + has_zero_dur <- l[["lex.dur"]] < .Machine$double.eps^0.5 + if (any(has_zero_dur)) { + l <- l[!has_zero_dur, ] + } + + ## ensure statuses are as expected ----------------------------------------- + + + setkeyv(l, c("lex.id", s1)) + roll_lexis_status_inplace( + unsplit.data = dt, split.data = l, id.var = "lex.id" + ) + + ## dt$lex.id from temporary values to original values ---------------------- + # merge in correct IDs also to split data + on.exit() + set(dt, j = "lex.id", value = id_dt$lex.id) + + + tmpID <- makeTempVarName(names = names(l), pre = "TEMP_SPLITMULTI_ID_") + setnames(l, old = "lex.id", new = tmpID) + set(l, j = "lex.id", value = {id_dt[ + i = .(l[[tmpID]]), + j = .SD, + on = "temp_id_values", + .SDcols = "orig_id_values" + ]}) + set(l, j = tmpID, value = NULL) + rm("id_dt") + + } + + if (drop) l <- intelliDrop(l, breaks = breaks, dropNegDur = FALSE) + + if (nrow(l) == 0) { + warning("no data left after dropping; check breaks?") + } + + order <- c("lex.id", "lex.multi", allScales, "lex.dur", "lex.Cst", "lex.Xst") + order <- c(order, setdiff(names(l), order)) + order <- intersect(order, names(l)) + setcolorder(l, order) + + if (verbose) cat("time taken by splitting process: ", timetaken(stime), "\n") + + + breaks <- lapply(allScales, function(scale_nm) { + ## allowed to NULL also + br <- c(breaks[[scale_nm]], oldBreaks[[scale_nm]]) + if (is.null(br)) return(br) + sort(unique(br)) + }) + names(breaks) <- allScales + + setattr(l, "time.scales", allScales) + setattr(l, "time.since", attr_list[["time.since"]]) + setattr(l, "breaks", breaks) + setattr(l, "class", c("Lexis","data.table","data.frame")) + if (!return_DT()) setDFpe(l) + + l[] + +} + +globalVariables(".") diff --git a/R/splitting_utility_functions.R b/R/splitting_utility_functions.R index e0272b8..9eb1247 100644 --- a/R/splitting_utility_functions.R +++ b/R/splitting_utility_functions.R @@ -1,1426 +1,1426 @@ -all_breaks_in <- function(bl1, bl2, x = NULL) { - ## INTENTION: return TRUE/FALSE depending on whether bl1 is a subset of bl2; - ## this means that each element in bl1 exists in bl2, and that those elements - ## are each subsets of the corresponding elements in bl2. - ## this is handy to check whether the some Lexis data has already - ## been split using the breaks in bl1. - ## NOTE: use checkBreakList() on each list separately before this. - - if (!is.list(bl1) || !is.list(bl2)) { - stop("Arguments bl1 and bl2 must be lists of breaks as supplied to e.g. ", - "splitMulti.") - } - - if (inherits(x, "Lexis")) { - checkLexisData(x) - checkBreaksList(x, bl1) - checkBreaksList(x, bl2) - } - - ce <- intersect(names(bl1), names(bl2)) - if (length(ce) != length(bl1)) return(FALSE) - - test <- mapply(function(l1, l2) { - all(l1 %in% l2) - }, l1 = bl1, l2 = bl2[ce], SIMPLIFY = FALSE) - - all(unlist(test)) -} - - -checkBreaksList <- function(x, breaks = list(fot = 0:5)) { - if (is.null(breaks)) stop("breaks is NULL") - if (!is.list(breaks)) stop("breaks needs to be a list") - if (!is.data.frame(x)) stop("x needs to be a data.frame") - timeScales <- names(breaks) - if (length(breaks) == 0L) stop("length of breaks list is zero") - if (length(timeScales) != length(breaks)) stop("breaks needs to be a fully named list") - - bad_scales <- setdiff(timeScales, names(x)) - if (length(bad_scales) > 0) { - stop("at least one breaks list name wasn't a variable in data; bad names: ", - paste0("'", bad_scales, "'", collapse = ", ")) - } - lens <- lapply(breaks, function(el) if (is.null(el)) -1 else length(el)) - badLens <- names(lens[unlist(lens) == 0L]) - if (length(badLens)) { - badLens <- paste0("'", badLens, "'", collapse = ", ") - stop("Elements in breaks list for the following time scales were of ", - "length zero but not NULL: ", badLens, ". Breaks list may only ", - "contain elements of length > 0 or elements that are NULL.") - } - invisible(NULL) -} - -checkPophaz <- function(lex, ph, haz.name = "haz") { - ## INTENTION: checks a Lexis data set against the pophaz data set for - ## consistency (e.g. existing variables to merge by) - - if (!is.data.frame(ph)) { - stop("Data set containing population/expected hazards must be a data.frame", - " (or a data.table, which is also a data.frame).") - } - - if (!haz.name %in% names(ph)) { - stop("Data set containing population/expected hazards does not contain a ", - "column named 'haz'. Make sure the name is exactly that (", - "case sensitive).") - } - - if (haz.name %in% names(lex)) { - stop("Lexis data set already contains a column named 'haz', which is a ", - "reserved name for the population hazard variable to be merged. ", - "Please rename/delete 'haz' from/in your Lexis data first.") - } - - if (!is.data.frame(ph)) { - stop("Data set of expected/population hazards must be a data.frame.") - } - - bn <- setdiff(names(ph), haz.name) - - if (length(bn) == 0L) { - stop("No variables in expected/population hazards data set to use in merge ", - "with Lexis data. Ensure that the pop. haz. data set containts some ", - "variables to merge by (e.g. sex, calendar year, and age group)") - } - if (!all(bn %in% names(lex))) { - badbn <- paste0("'", setdiff(bn, names(lex)), "'", collapse = ", ") - stop("Lexis data set did not have following variable(s) that were in ", - "the expected/population hazards data set: ", badbn,". ", - "Ensure you have supplied the right data and that the names of the ", - "intended variables match.") - } - - mergeVars <- setdiff(names(ph), haz.name) - dup <- any(duplicated(as.data.table(ph), by = mergeVars)) - if (dup) { - stop("Supplied data set of population/expected hzards has duplicated rows ", - "by the variables ", paste0("'",mergeVars, "'", collapse = ", "), - " which prevents correct usage of the data set. Please ensure no rows", - " area duplicated in the data set before proceeding. Tip: use e.g. ", - "duplicated(PH, by = c('V1', 'V2')) to check for duplicatedness in ", - "your data set (here named PH) by the variables V1 and V2." - ) - } - - invisible() -} - - - - - -intelliCrop <- function( - x, - breaks = list(fot = 0:5), - allScales = NULL, - cropStatuses = FALSE, - tol = .Machine$double.eps^0.5 -) { - - ## appease R CMD CHECK - lex.dur <- lex.Xst <- lex.Cst <- NULL - - checkBreaksList(x = x, breaks = breaks) - breaks[unlist(lapply(breaks, length)) == 0L] <- NULL - if (!is.data.table(x)) stop("x needs to be a data.table") - - cropScales <- names(breaks) - - all_names_present(x, c("lex.dur", allScales)) - - if (cropStatuses) { - origEnd <- x$lex.dur + x[[allScales[1L]]] - } - - - deltas <- mapply(function(b, y) pmax(min(b), y) - y, SIMPLIFY = FALSE, - b = breaks, y = mget_cols(cropScales, x)) - ## below: baseline (zero value without assigning zero of bad class) - deltas <- c(deltas, list(x[[cropScales[1]]][1L] - x[[cropScales[1]]][1L])) - deltas <- do.call(pmax, deltas) - - set(x, j = allScales, value = mget_cols(allScales, x) + deltas) - set(x, j = "lex.dur", value = x[["lex.dur"]] - deltas) - - durs <- mapply(function(b, y) max(b) - y, SIMPLIFY = FALSE, - b = breaks, y = mget_cols(cropScales, x)) - durs$lex.dur <- x$lex.dur - durs <- do.call(pmin, durs) - ## now have max durs by row, i.e. up to roof of breaks at most, - ## or to ((original lex.dur) - (deltas)) if that is smaller. - ## (being cropped or exiting before roof of breaks) - - set(x, j = "lex.dur", value = durs) - - if (cropStatuses) { - harmonizeStatuses(x, C = "lex.Cst", X = "lex.Xst") - wh_was_cropped <- which(x[["lex.dur"]] + x[[allScales[1L]]] + tol < origEnd) - set(x, i = wh_was_cropped, j = "lex.Xst", - value = x[["lex.Cst"]][wh_was_cropped]) - } - - invisible(x) -} - - - - - -harmonizeStatuses <- function(x, C = "lex.Cst", X = "lex.Xst") { - - clC <- class(x[[C]]) - clX <- class(x[[X]]) - tyC <- typeof(x[[C]]) - tyX <- typeof(x[[X]]) - cl <- c(clC, clX) - - if (tyC != tyX && clC != clX) { - if (is.numeric(x[[C]]) && is.numeric(x[[X]])) { - harmonizeNumeric(x = x, v1="lex.Cst", v2="lex.Xst") - - } else if (is.factor(x[[C]]) || is.factor(x[[X]])) { - if (!is.factor(x[[C]])) set(x, j = C, value = as.factor(x[[C]])) - if (!is.factor(x[[X]])) set(x, j = X, value = as.factor(x[[X]])) - - } - } - - if (any(cl == "factor")) { - harmonizeFactors(x = x, v1="lex.Cst", v2="lex.Xst") - } - -} - -harmonizeNumericTimeScales <- function(x, times = NULL) { - ## INTENTION: given a Lexis data set with some time scales, ensure - ## that the classes of the time scales comply to the lowest denominator, - ## e.g. "double" and "integer" -> both "double" - - if (is.null(times)) { - times <- c(attr(x, "time.scales"), "lex.dur") - } - - msg <- paste0("Expected working data to have time scales %%VARS%%, but it ", - "didn't. This is an internal error: If you see this, complain ", - "to the package maintainer.") - all_names_present(x, times, msg = msg) - xt <- lapply(times, function(ch) x[[ch]]) - names(xt) <- times - - harmoClasses <- c("numeric", "integer", "difftime") - cl <- lapply(xt, class) - wh <- unlist(lapply(cl, function(ch) { - any(ch %in% harmoClasses) - })) - ha <- times[wh] - hacl <- unique(unlist(cl[wh])) - - if (length(ha) > 1L) { - ## more than one class present and need to use common lowest denom - newMode <- as.double - - if (all(ha %in% c("integer", "difftime"))) { - ## all numeric times are integers or difftimes - newMode <- as.integer - } - for (var in ha) { - ## modify in place - set(x, j = var, value = newMode(x[[var]])) - } - - - } - invisible(NULL) -} - - - - - -harmonizeNumeric <- function(x, v1="lex.Cst", v2="lex.Xst") { - ## assumes v1, v2 are numeric variable names in x - - if (!is.numeric(x[[v1]]) || !is.numeric(x[[v2]])) { - print(class(x[[v1]])) - print(class(x[[v2]])) - stop("v1 and/or v2 is/are not of class numeric") - } - - if (!is.integer(x[[v1]])) set(x, j = v1, value = try2int(x[[v1]])) - if (!is.integer(x[[v2]])) set(x, j = v2, value = try2int(x[[v2]])) - - if (typeof(x[[v1]]) != typeof(x[[v2]])) { - - if (is.double(x[[v1]])) set(x, j = v1, value = as.double(x[[v1]])) - if (is.double(x[[v2]])) set(x, j = v2, value = as.double(x[[v2]])) - - } - -} - - - - - -harmonizeFactors <- function(x, v1="lex.Cst", v2="lex.Xst") { - ## assumes v1, v2 are factor names in x - - if (!is.factor(x[[v1]]) || !is.factor(x[[v2]])) { - stop("v1 and/or v2 is/are not of class factor") - } - - glab1 <- union(levels(x[[v1]]), levels(x[[v2]])) - glab2 <- union(levels(x[[v2]]), levels(x[[v1]])) - - - - setattr(x[[v1]], "levels", glab1) - setattr(x[[v2]], "levels", glab2) - -} - - - - - -intelliDrop <- function(x, breaks = list(fot = 0:5), dropNegDur = TRUE, check = FALSE, tol = .Machine$double.eps^0.5, subset = NULL) { - - if (!is.data.table(x)) { - stop("x needs to be a data.table; if you see this message, complain ", - "to the package maintainer") - } - checkBreaksList(x = x, breaks = breaks) - breaks[unlist(lapply(breaks, length)) < 2] <- NULL - timeScales <- names(breaks) - - if (check) { - checkLexisData(x) - } - - ra <- lapply(breaks, range) - ra <- lapply(ra, diff) - ts <- names(sort(unlist(ra))) ## shortest first - mi <- lapply(breaks, min) - ma <- lapply(breaks, max) - - substi <- substitute(subset) - subset <- evalLogicalSubset(x, substiset = substi) - - if (dropNegDur) subset[subset] <- subset[subset] & x$lex.dur[subset] > 0L - - ## figure out latest exit and first entry; don't need to test for dropping - ## if e.g. all left follow-up before the max in breaks - max_end <- lapply(ts, function(ch) min(x[[ch]] + x$lex.dur)) - min_start <- lapply(ts, function(ch) max(x[[ch]])) - names(max_end) <- names(min_start) <- ts - - for (k in ts) { - mik <- mi[[k]] - mak <- ma[[k]] - - if (max_end[[k]] < mak + tol) { - tmpSD <- x[subset, .SD, .SDcols = c(k, "lex.dur")] - tmpSD <- setDT(lapply(tmpSD, as.numeric)) - subset[subset] <- rowSums(tmpSD) <= mak + tol - } - if (min_start[[k]] + tol > mik) { - subset[subset] <- x[subset,][[k]] > mik - tol - } - - if (all(!subset)) { - stop("Dropped all remaining rows from data when subsetting by the ", - "Lexis time scale '", k, "'. Range of values in data: ", - paste0(round(range(x[[k]]),4), collapse = "-"), ". Min/Max breaks ", - "(used to subset data): ", mik, "/", mak, ".") - } - - } - - - x[subset, ] -} - - -matchBreakTypes <- function(lex, breaks, timeScale, modify.lex = FALSE) { - if (is.character(breaks)) { - breaks <- as.IDate(breaks) - } - clb <- class(breaks) - clb <- clb[length(clb)] - cts <- class(lex[[timeScale]]) - cts <- cts[length(cts)] - - if (clb != cts) { - if (is.Date(breaks) && !is.Date(lex[[timeScale]])) { - breaks <- try2int(as.double(breaks)) - } else if (is.integer(breaks) && is.double(lex[[timeScale]])) { - breaks <- as.double(breaks) - } else if (is.double(breaks) && is.integer(lex[[timeScale]])) { - breaks <- try2int(breaks) - } - - } - - if (modify.lex && clb != cts) { - if (!is.Date(breaks) && is.Date(lex[[timeScale]])) { - - if (clb == "double") { - set(lex, j = timeScale, value = as.double(lex[[timeScale]])) - } else { - set(lex, j = timeScale, value = as.integer(lex[[timeScale]])) - } - - } else if (is.double(breaks) && is.integer(lex[[timeScale]])) { - set(lex, j = timeScale, value = as.double(lex[[timeScale]])) - } - - } - breaks -} - -protectFromDrop <- function(breaks, lower = FALSE) { - old_breaks <- copy(breaks) - if (length(breaks) == 0L) { - stop("Length of breaks to 'protect' from dropping is zero.") - } - if (is.Date(breaks)) { - breaks <- c(breaks, max(breaks) + 1e4L) - if (lower) breaks <- c(min(breaks) - 1e4L, breaks) - - } else if (is.integer(breaks)) { - breaks <- c(breaks, 1e6L) - if (lower) breaks <- c(-1e6L, breaks) - - } else if (is.double(breaks)) { - breaks <- c(breaks, Inf) - if (lower) breaks <- c(-Inf, breaks) - - } else { - stop("breaks were not Date, integer or double") - } - setattr(breaks, "unprotected", old_breaks) - breaks -} - -unprotectFromDrop <- function(breaks) { - up <- attr(breaks, "unprotected") - if (is.null(up) || length(up) == 0L) { - stop("Could not 'unprotect' breaks from dropping as the required ", - "attribute was not found. If you see this it is most likely ", - "an internal error and you should complain to the pkg maintainer.") - } - up -} - - - - -setLexisDT <- function(data, entry, exit, entry.status, exit.status, id = NULL, select = NULL) { - - ## appease R CMD CHECK - lex.Cst <- lex.Xst <- NULL - - if (!is.data.table(data)) stop("not a data.table") - if (inherits(data, "Lexis")) stop("already a Lexis object") - - if (!is.null(select) && !is.character(select)) stop("select was not a character vector of names") - - entry <- substitute(entry) - exit <- substitute(exit) - entry <- eval(entry, envir = data, enclos = parent.frame()) - exit <- eval(exit, envir = data, enclos = parent.frame()) - enNames <- names(entry) - exNames <- names(exit) - - timeScales <- union(enNames, exNames) - if (any(timeScales %in% names(data))) stop("at least one named time scales already present in data; original names mandatory") - enNeeded <- setdiff(timeScales, enNames) - enPresent <- setdiff(timeScales, enNeeded) - durVar <- intersect(enNames, exNames) - if (length(durVar) > 1) stop("you have more than 1 time scales in both entry and exit; only one mandatory") - - enVars <- paste0(enNames, "_en") - exVars <- paste0(exNames, "_ex") - setattr(entry, "names", enVars) - setattr(exit, "names", exVars) - - l <- as.data.table(c(entry, exit)) - rm(entry, exit) - - ## duration - exV <- paste0(durVar, "_ex") - enV <- paste0(durVar, "_en") - set(l, j = "lex.dur", value = l[[exV]] - l[[enV]]) - rm(exV, enV) - - ## time scale starting points - if (length(enNeeded) > 0) { - for (ts in enNeeded) { - exV <- paste0(ts, "_ex") - set(l, j = ts, value = l[[exV]] - l$lex.dur) - } - } - setnames(l, paste0(enPresent, "_en"), enPresent) - - # no longer need time scale end points - for (k in exVars) { - set(l, j = k, value = NULL) - } - - ## status definition - data[, lex.Cst := entry.status] - data[, lex.Xst := exit.status] - - harmonizeStatuses(data, C = "lex.Cst", X = "lex.Xst") - - ## all time scales etc. into data - data[, names(l) := l] - - - id <- substitute(id) - id <- eval(id, envir = data, enclos = parent.frame()) - if (!is.null(id)) set(data, j = "lex.id", value = id) - rm(id) - - if (!is.null(select)) { - - delVars <- setdiff(names(data), c(names(l), select)) - if (length(delVars) > 0) { - l[, (delVars) := NULL] - } - } - - rm(l) - lexVars <- c("lex.id", timeScales, "lex.dur", "lex.Cst", "lex.Xst") - setcolorder(data, c(lexVars, setdiff(names(data), lexVars))) - - setattr(data, "time.scales", timeScales) - setattr(data, "time.since", rep("", times = length(timeScales))) - setattr(data, "class", c("Lexis", "data.table", "data.frame")) - - -} - -checkLexisData <- function(lex, check.breaks = FALSE) { - ## INTENTION: checks Lexis attributes - ## OUTPUT: nothing - - if (is.null(lex) || nrow(lex) == 0) stop("Data is NULL or has zero rows") - if (!inherits(lex, "Lexis")) stop("Data not a Lexis object") - allScales <- attr(lex, "time.scales") - if (length(allScales) == 0) stop("no time scales appear to be defined; is data a Lexis object?") - - badScales <- setdiff(allScales, names(lex)) - if (length(badScales) > 0) { - badScales <- paste0("'", badScales, "'", collapse = ", ") - stop("Following time scales found in data's attributes but not present in data: ", badScales) - } - - lexVars <- c("lex.dur", "lex.id", "lex.Cst", "lex.Xst") - blv <- setdiff(lexVars, names(lex)) - if (length(blv) > 0) { - blv <- paste0("'", blv, "'", collapse = ", ") - stop("Following Lexis variables not found in data: ", blv) - } - - if (check.breaks) { - BL <- attr(lex, "breaks") - if (is.null(BL)) stop("No breaks list in data attributes") - checkBreaksList(lex, breaks = BL) - } - - invisible() -} - - -splitMultiPreCheck <- function(data = NULL, breaks = NULL, ...) { - - ## INTENTION: checks for discrepancies between data and breaks, etc. - ## OUTPUT: cleaned-up list of breaks - checkLexisData(data) - allScales <- attr(data, "time.scales") - - if (!is.null(breaks) && !is.list(breaks)) stop("breaks must be a list; see examples in ?splitMulti") - if (is.null(breaks)) { - breaks <- list(...) - breaks <- breaks[intersect(names(breaks), allScales)] - } - - if (length(breaks) == 0) stop("no breaks defined!") - - splitScales <- names(breaks) - ## NULL breaks imply not used - for (k in splitScales) { - if (length(breaks[[k]]) == 0) { - breaks[k] <- NULL - } - } - - checkBreaksList(x = data, breaks = breaks) - - splitScales <- names(breaks) - - if (!all(splitScales %in% allScales)) { - stop("breaks must be a list with at least one named vector corresponding to used time scales \n - e.g. breaks = list(fot = 0:5)") - } - - if (!all(splitScales %in% names(data))) { - stop("At least one vector name in breaks list is not a variable name in the data") - } - breaks -} - -forceLexisDT <- function(x, breaks = NULL, allScales = NULL, key = TRUE) { - setattr(x, "class", c("Lexis", "data.table", "data.frame")) - setattr(x, "breaks", breaks) - setattr(x, "time.scales", allScales) - # alloc.col(x) - if (key) setkeyv(x, c("lex.id", names(breaks)[1L])) - invisible(x) -} - - -doCutLexisDT <- function(lex, cut = dg_date, timeScale = "per", by = "lex.id", n = 1L) { - - checkLexisData(lex, check.breaks = FALSE) - - x <- unique(lex, by = by) - cut <- evalq(cut, envir = lex, enclos = parent.frame(n = n + 1L)) - delta <- cut - x[[timeScale]] - - allScales <- attr(lex, "time.scales") - - setDT(x) - for (v in allScales) { - set(x, j = v, value = x[[v]] + delta) - } - - set(x, j = "lex.dur", value = 0) - - tmp <- list() - tmp$isCut <- makeTempVarName(lex, pre = "isCut_") - - set(x, j = tmp$isCut, value = 1L) - on.exit(setcolsnull(x, unlist(tmp))) - - x <- rbindlist(list(lex, x), use.names = TRUE, fill = TRUE) - x[1:nrow(lex), (tmp$isCut) := 0L] - - ## NOTE: new cut row being the first or last row - ## implies it resides outside old observations - ## OR it is equal to lowest/highest value - setkeyv(x, c(by, allScales)) - setkeyv(x, by) - x <- x[!((duplicated(x, by = key(x)) | duplicated(x, by = key(x), fromLast = TRUE)) & x[[tmp$isCut]] == 0L)] - stop("not ready") -} - -# data <- data.table(birth = 2000:2000, entry=2002:2003, -# exit=2011:2012, event=c(2010,2011), -# status=1:0) -# -# lex <- lexpand(data = data, birth = birth, entry = entry, -# exit = exit, event = event, -# id = 1L, entry.status = 99L, -# status = status, overlapping = T) - -lexpile <- function(lex, by = "lex.id", subset = NULL) { - ## PURPOSE: given several rows per id in a Lexis object, - ## collate data into form where - ## - no subject has any overlapping time lines - ## - lex.Cst and lex.Xst are logical, i.e. 0 -> 1, 1 -> 1, 1 -> 2 - ## this should be made to work with both split and unsplit Lexis data. - - data <- NULL # R CMD CHECK appeasement - - checkLexisData(lex, check.breaks = FALSE) - - allScales <- attr(lex, "time.scales") - sc <- allScales[1L] - - all_names_present(lex, by) - - if (is.character(lex$lex.Cst) || is.character(lex$lex.Xst)) { - stop("This function requires lex.Cst and lex.Xst to be integer, double (i.e. numeric) or factor variables to determine the order of possible statuses!") - } - - ## need to take copy eventually ---------------------------------------------- - attrs <- attributes(lex) - subset <- evalLogicalSubset(data, substitute(subset)) - x <- lex[subset,] - forceLexisDT(x, breaks = attrs$breaks, allScales = attrs$time.scales) - alloc.col(x) - - - ## ensure status harmony ----------------------------------------------------- - harmonizeStatuses(x = x, X = "lex.Xst", C = "lex.Cst") - exStat <- if (is.factor(x$lex.Xst)) levels(x$lex.Xst) else sort(unique(x$lex.Xst)) - enStat <- if (is.factor(x$lex.Cst)) levels(x$lex.Cst) else sort(unique(x$lex.Cst)) - allStat <- c(setdiff(enStat, exStat), exStat) ## enStat & exStat equal if factors used - - ## avoiding side effects ----------------------------------------------------- - oldKey <- key(lex) - tmp <- list() - tmp$order<- makeTempVarName(x, pre = "order_") - - on.exit({ - if (length(oldKey) > 0) setkeyv(x, oldKey) else - setorderv(x, tmp$order) - }, add = TRUE) - - on.exit({ - setcolsnull(x, unlist(tmp$order), soft = TRUE) - }, add = TRUE) - - x[, c(tmp$order) := 1:.N] - - ## check for need for lexpiling ---------------------------------------------- - setkeyv(x, by) - if (sum(duplicated(x, by = key(x))) == 0L) return(lex) - - - ## figure out what statuses are used ----------------------------------------- - - tmp$ev <- makeTempVarName(x, pre = "event_") - x[, c(tmp$ev) := detectEvents(x, breaks = attrs$breaks, by = by)] - - tmp$scEnds <- paste0(allScales, "_end") - tmp$scEnds <- makeTempVarName(lex, pre = tmp$scEnds) - x[, c(tmp$scEnds) := lapply(.SD, function(x) x + lex$lex.dur), .SDcols = allScales] - - ## NOTE: rows for a given subject ending in simultaneously with at least - ## one being a transition will not be allowed. - setkeyv(x, c(by, tmp$scEnds[1L])) - - whDup <- duplicated(x, fromLast = FALSE, by = key(x)) | duplicated(x, fromLast = TRUE, by = key(x)) - dupTest <- x[whDup, 1L %in% unique(.SD), .SDcols = tmp$ev] - rm(whDup) - - if (dupTest) stop("At least one subject had at least two simultaneous events.") - ## NOTE: if interval ends AND status are the very same for M rows, - ## then the M rows are necessarily nested with one or more covering - ## the whole time line. Only need to keep the one. - setkeyv(x, c(tmp$scEnds, "lex.Cst", "lex.Xst")) - setorderv(x, c(allScales,tmp$scEnds, "lex.Cst", "lex.Xst")) - - x <- unique(x, by = key(x)) - stop("unfinished") - -} - -contractLexis <- function(x, breaks, drop = TRUE) { - stop("This doesnt do anything yet") - ## INTENTION: given a Lexis object and breaks, - ## ensures data is split by the breaks and contracts the split rows - ## so that the data is split at the level of the supplied breaks. - ## e.g. with x split by fot = seq(0, 5, 1/12) and with supplying - ## breaks = list(fot = 0:5), rows within 0-1 are collated into one row etc. - - ## PROBLEM: a subject may have e.g. rows spaning 0.5 - 1 which are requested - ## to be contracted to one row spanning 0-1. - - -} - - - - -#' @title Prepare Exposure Data for Aggregation -#' @description \code{prepExpo} uses a \code{Lexis} object of periods of exposure -#' to fill gaps between the periods and overall entry and exit times without -#' accumulating exposure time in periods of no exposure, and splits the -#' result if requested. -#' @param lex a \code{\link[Epi]{Lexis}} object with ONLY periods of exposure -#' as rows; one or multiple rows per subject allowed -#' @param freezeScales a character vector naming \code{Lexis} time scales of exposure -#' which should be frozen in periods where no exposure occurs (in the gap -#' time periods) -#' @param cutScale the \code{Lexis} time scale along which the subject-specific -#' ultimate entry and exit times are specified -#' @param entry an expression; the time of entry to follow-up which may be earlier, at, or after -#' the first time of exposure in \code{freezeScales}; evaluated separately -#' for each unique combination of \code{by}, so e.g. with -#' \code{entry = min(Var1)} and \code{by = "lex.id"} it -#' sets the \code{lex.id}-specific minima of \code{Var1} to be the original times -#' of entry for each \code{lex.id} -#' @param exit the same as \code{entry} but for the ultimate exit time per unique -#' combination of \code{by} -#' @param by a character vector indicating variable names in \code{lex}, -#' the unique combinations of which identify separate subjects for which -#' to fill gaps in the records from \code{entry} to \code{exit}; -#' for novices of \code{{\link{data.table}}}, this is passed to a -#' \code{data.table}'s \code{by} argument. -#' @param breaks a named list of breaks; -#' e.g. \code{list(work = 0:20,per = 1995:2015)}; passed on to -#' \code{\link{splitMulti}} so see that function's help for more details -#' @param freezeDummy a character string; specifies the name for a dummy variable -#' that this function will create and add to output which -#' identifies rows where the \code{freezeScales} are frozen and where not -#' (\code{0} implies not frozen, \code{1} implies frozen); -#' if \code{NULL}, no dummy is created -#' @param subset a logical condition to subset data by before computations; -#' e.g. \code{subset = sex == "male"} -#' @param verbose logical; if \code{TRUE}, the function is chatty and returns -#' some messages and timings during its run. -#' @param ... additional arguments passed on to \code{\link{splitMulti}} -#' @details -#' -#' \code{prepExpo} is a convenience function for the purpose of eventually aggregating -#' person-time and events in categories of not only normally progressing -#' \code{Lexis} time scales but also some time scales which should not -#' progress sometimes. For example a person may work at a production facility -#' only intermittently, meaning exposure time (to work-related substances -#' for example) should not progress outside of periods of work. This allows for -#' e.g. a correct aggregation of person-time and events by categories of cumulative -#' time of exposure. -#' -#' Given a \code{Lexis} object containing rows (time lines) -#' where a subject is exposed to something (and NO periods without exposure), -#' fills any gaps between exposure periods for each unique combination of \code{by} -#' and the subject-specific "ultimate" \code{entry} and \code{exit} times, -#' "freezes" the cumulative exposure times in periods of no exposure, -#' and splits data using \code{breaks} passed to \code{\link{splitMulti}} -#' if requested. Results in a (split) \code{Lexis} object where \code{freezeScales} -#' do not progress in time periods where no exposure was recorded in \code{lex}. -#' -#' This function assumes that \code{entry} and \code{exit} arguments are the -#' same for each row within a unique combination of variables named in \code{by}. -#' E.g. with \code{by = "lex.id"} only each \code{lex.id} has a unique value -#' for \code{entry} and \code{exit} at most. -#' -#' The supplied \code{breaks} split the data using \code{splitMulti}, with -#' the exception that breaks supplied concerning any frozen time scales -#' ONLY split the rows where the time scales are not frozen. E.g. -#' with \code{freezeScales = "work"}, -#' \code{breaks = list(work = 0:10, cal = 1995:2010)} splits all rows over -#' \code{"cal"} but only non-frozen rows over \code{"work"}. -#' -#' Only supports frozen time scales that advance and freeze contemporaneously: -#' e.g. it would not currently be possible to take into account the cumulative -#' time working at a facility and the cumulative time doing a single task -#' at the facility, if the two are not exactly the same. On the other hand -#' one might use the same time scale for different exposure types, supply them -#' as separate rows, and identify the different exposures using a dummy variable. -#' @return -#' -#' Returns a \code{Lexis} object that has been split if \code{breaks} is specified. -#' The resulting time is also a \code{data.table} if -#' \code{options("popEpi.datatable") == TRUE} (see: \code{?popEpi}) -#' -#' @import data.table -#' @export -prepExpo <- function(lex, freezeScales = "work", cutScale = "per", entry = min(get(cutScale)), - exit = max(get(cutScale)), by = "lex.id", breaks = NULL, freezeDummy = NULL, subset = NULL, - verbose = FALSE, ...) { - ## R CMD CHECK appeasement - lex.dur <- NULL - - if (verbose) allTime <- proc.time() - - ## check breaks & data ------------------------------------------------------- - breaks <- evalq(breaks) - dumBreaks <- structure(list(c(-Inf, Inf)), names = cutScale, internal_prepExpo_dummy = TRUE) - if (is.null(breaks)) breaks <- dumBreaks - breaks <- splitMultiPreCheck(data = lex, breaks = breaks) - if (!is.null(attr(breaks, "internal_prepExpo_dummy"))) breaks <- NULL - checkLexisData(lex) - oldBreaks <- attr(lex, "breaks") - if (!is.null(breaks)) checkBreaksList(lex, breaks) - checkBreaksList(lex, oldBreaks) - - - ## data ---------------------------------------------------------------------- - - subset <- evalLogicalSubset(data = lex, substitute(subset)) - x <- if (!all(subset)) evalq(lex)[subset, ] else copy(evalq(lex)) - - setDT(x) - - allScales <- attr(lex, "time.scales") - linkScales <- setdiff(allScales, freezeScales) - othScales <- setdiff(linkScales, cutScale) - - setkeyv(x, c(by, cutScale)) - - l <- list() ## will hold temp var names; this avoids collisions with names of vars in x - l$cutScale <- cutScale - l$freezeScales <- freezeScales - l$liquidScales <- setdiff(allScales, freezeScales) - l$by <- by - rm(cutScale, freezeScales, by) - - ## args ---------------------------------------------------------------------- - if (verbose) argTime <- proc.time() - tol <- .Machine$double.eps^0.75 - - if (is.character(freezeDummy) && freezeDummy %in% names(lex)) stop("Variable named in freezeDummy already exists in data; freezeDummy is inteded for creating a new dummy for identifying the rows where freezeScales are frozen. Please supply an original variable name to freezeDummy") - - if (!is.character(l$by)) stop("by must be given as a vector of character strings naming columns in lex") - all_names_present(lex, l$by) - - enSub <- substitute(entry) - exSub <- substitute(exit) - - PF <- parent.frame(1L) - l$en <- makeTempVarName(x, pre = "entry_") - l$ex <- makeTempVarName(x, pre = "exit_") - x[, c(l$ex, l$en) := list(eval(exSub, envir = .SD, enclos = PF), - eval(enSub, envir = .SD, enclos = PF)), by = c(l$by)] - - ## tests disabled for now... - # testTime <- proc.time() - # - # test <- x[, .N, by = list(r = get(l$cutScale) + lex.dur > get(l$ex) - tol)] - # if(test[r == TRUE, .N] > 0) stop("exit must currently be higher than or equal to the maximum of cutScale (on subject basis defined using by); you may use breaks instead to limit the data") - # - # test <- x[, .N, by = list(r = get(l$cutScale) + tol < get(l$en))] - # if(test[r == TRUE, .N] > 0) stop("entry must currently be lower than or equal to the minimum of cutScale (on subject basis defined using by); you may use breaks instead to limit the data") - # if (verbose) cat("Finished checking entry and exit. Time taken: ", timetaken(argTime), "\n") - if (verbose) cat("Finished evaluating entry and exit and checking args. Time taken: ", timetaken(argTime), "\n") - - ## create rows to fill gaps -------------------------------------------------- - if (verbose) fillTime <- proc.time() - x2 <- copy(x) - x2[, (l$freezeScales) := NA] - x2 <- rbind(x2, unique(x2, by = c(l$by), fromLast = TRUE)) - - l$delta <- makeTempVarName(x2, pre = "delta_") - x2[, (l$delta) := c(get(l$en)[1], get(l$cutScale)[-c(1,.N)], max(get(l$cutScale)+lex.dur)) - get(l$cutScale), by = c(l$by)] - x2[, c(linkScales) := lapply(mget(linkScales), function(x) x + get(l$delta)), by = c(l$by)] - - setcolsnull(x2, l$delta) - - l$order <- makeTempVarName(x, pre = "order_") - x[, (l$order) := (1:.N)*2, by = c(l$by)] - x2[, (l$order) := (1:.N)*2-1, by = c(l$by)] - - x <- rbindlist(list(x, x2)) - rm(x2) - setkeyv(x, c(l$by, l$cutScale, l$order)) - setkeyv(x, c(l$by)) - set(x, j = l$order, value = as.integer(x[[l$order]])) - x[, (l$order) := 1:.N, by = c(l$by)] - - if (verbose) cat("Finished expanding data to accommodate filling gaps. Time taken: ", timetaken(fillTime), "\n") - ## handle time scale values -------------------------------------------------- - if (verbose) valueTime <- proc.time() - - l$CSE <- makeTempVarName(x, pre = paste0(l$cutScale, "_end_")) - l$LCS <- makeTempVarName(x, pre = paste0("lead1_",l$cutScale, "_")) - x[, (l$CSE) := lex.dur + get(l$cutScale)] - x[, (l$LCS) := shift(get(l$cutScale), n = 1L, type = c("lead"), fill = NA), by = c(l$by)] - - - x[!duplicated(x, fromLast = TRUE, by = key(x)), c(l$LCS, l$CSE) := get(l$ex)] - x[, (l$CSE) := pmin(get(l$LCS), get(l$CSE))] - x[, (l$cutScale) := sort(c(get(l$en)[1L],shift(get(l$CSE), n = 1L, type = "lag", fill = NA)[-1])), by = c(l$by)] - x[, lex.dur := get(l$CSE) - get(l$cutScale)] - - ## bring up other than frozen and cut scales to bear ------------------------- - x[, (othScales) := lapply(mget(othScales), function(x) {min(x) + c(0, cumsum(lex.dur)[-.N])}), by = c(l$by)] - - - ## frozen scales should make sense cumulatively ------------------------------ - ## indicates frozenness: 0 = not frozen, 1 = frozen - l$frz <- makeTempVarName(x, pre = "frozen_") - x[, (l$frz) := 0L] - frozens <- x[,is.na(get(l$freezeScales[1]))] - x[frozens, (l$frz) := 1L] - - - ## alternate method: just use lex.durs and only cumulate in non-frozen rows - x[, (l$freezeScales) := lapply(mget(l$freezeScales), function(x) { - x <- max(0, min(x-lex.dur, na.rm=TRUE)) - x <- x + c(0, as.double(cumsum(as.integer(!get(l$frz))*lex.dur))[-.N]) - }), by = c(l$by)] - - x <- x[lex.dur > .Machine$double.eps^0.5, ] - - if (verbose) cat("Finished computing correct values for time scales. Time taken: ", timetaken(valueTime), "\n") - - ## splitting separately ------------------------------------------------------ - if (!is.null(breaks)) { - if (verbose) splitTime <- proc.time() - x_frozen <- x[get(l$frz) == 1L,] - x <- x[get(l$frz) == 0L] - forceLexisDT(x, allScales = allScales, breaks = oldBreaks) - - ## NOTE: since we only split by the frozen time scales by pretending - ## they are NOT Lexis time scales (temporarily), and since one should - ## pass the appropriate breaks info of pre-existing breaks to splitMulti, - ## choose only breaks for non-frozen time scales to include in x_frozen's - ## attributes here. - ## (e.g. when work history is no longer accumulating) - frzBreaks <- breaks[l$liquidScales] - oldFrzBreaks <- oldBreaks[l$liquidScales] - emptyFrzBreaks <- vector("list", length = length(l$liquidScales)) - names(emptyFrzBreaks) <- l$liquidScales - if (length(oldFrzBreaks)) { - emptyFrzBreaks[names(oldFrzBreaks)] <- oldFrzBreaks - } - forceLexisDT(x_frozen, allScales = l$liquidScales, breaks = emptyFrzBreaks) - - if (length(frzBreaks) > 0) { - ## do (also) split for all time scales where also the frozen - ## time scales are split. This is allowed for times where the - ## frozen time scales have not been frozen - ## (e.g. work history is accumulating) - x_frozen <- splitMulti(x_frozen, breaks = frzBreaks, ...) - } - - ## do (also) split where also split - x <- splitMulti(x, breaks = breaks, ...) - breaks <- attr(x, "breaks") ## new breaks appended by splitMulti - - setDT(x) - setDT(x_frozen) - x <- rbindlist(list(x, x_frozen), use.names = TRUE); rm(x_frozen) - forceLexisDT(x, breaks = breaks, allScales = allScales) - if (verbose) cat("Finished splitting data. Time taken: ", timetaken(splitTime), "\n") - } - - ## final touch --------------------------------------------------------------- - - setDT(x) - if (is.character(freezeDummy)) setnames(x, l$frz, freezeDummy) - setkeyv(x, c(l$by, l$order)) - delCols <- setdiff(names(l), c("by", "cutScale", "freezeScales", - "liquidScales", - "linkScales", "allScales", "othScales")) - delCols <- unlist(l[delCols]) - setcolsnull(x, keep = names(lex), colorder = TRUE) - - setattr(x, "time.scales", allScales) - setattr(x, "breaks", breaks) - setattr(x, "time.since", rep("", length(allScales))) - setattr(x, "class", c("Lexis", "data.table", "data.frame")) - if (!return_DT()) setDFpe(x) - - if (verbose) cat("Finished prepExpo run. Time taken: ", timetaken(allTime), "\n") - - x[] -} - - - -doComparisonWithEpi <- function(lexDT, lexDTdrop, lexDF, breaks) { - BL <- NULL - if (!is.list(breaks)) stop("breaks needs to be a list") - requireNamespace("Epi") - requireNamespace("testthat") - - allScales <- attr(lexDF, "time.scales") - sc1 <- allScales[1] - setDT(lexDT) - setDT(lexDTdrop) - setDT(lexDF) - setkeyv(lexDT, c("lex.id", sc1)) - setkeyv(lexDTdrop, c("lex.id", sc1)) - setkeyv(lexDF, c("lex.id", sc1)) - - testthat::expect_equal(attr(lexDT, "time.scales"), attr(lexDF, "time.scales")) - testthat::expect_equal(attr(lexDT, "time.since"), attr(lexDF, "time.since")) - - testthat::expect_equal(attr(lexDTdrop, "time.scales"), attr(lexDF, "time.scales")) - testthat::expect_equal(attr(lexDTdrop, "time.since"), attr(lexDF, "time.since")) - - doTestBarrage(dt1 = lexDT, dt2 = lexDF, allScales = allScales) - rm(lexDT) - - lexDF <- intelliDrop(x = lexDF, breaks = breaks) - - doTestBarrage(dt1 = lexDTdrop, dt2 = lexDF, allScales = allScales) - -} - -doTestBarrage <- function(dt1, dt2, allScales, testTimes = TRUE, testStatuses = TRUE) { - requireNamespace("Epi") - requireNamespace("testthat") - - lex.id <- lex.dur <- NULL ## APPEASE R CMD CHECK - - testthat::expect_equal(sum(dt1$lex.dur), - sum(dt2$lex.dur), - check.attributes = FALSE) - testthat::expect_equal(dt1[, sum(lex.dur), keyby = lex.id]$V1, - dt2[, sum(lex.dur), keyby = lex.id]$V1, - check.attributes = FALSE) - - all_names_present(dt1, allScales) - all_names_present(dt2, allScales) - - if (testTimes) { - for (k in allScales) { - testthat::expect_equal(dt1[[k]], dt2[[k]], - check.attributes = TRUE) - } - } - - if (testStatuses) { - testthat::expect_equal(dt1$lex.Cst, dt2$lex.Cst, check.attributes = FALSE) - testthat::expect_equal(dt1$lex.Xst, dt2$lex.Xst, check.attributes = FALSE) - - testthat::expect_equal(levels(dt1$lex.Cst), levels(dt2$lex.Cst), check.attributes = FALSE) - testthat::expect_equal(levels(dt1$lex.Xst), levels(dt2$lex.Xst), check.attributes = FALSE) - - testthat::expect_true(all(class(dt2$lex.Cst) %in% class(dt1$lex.Cst))) - testthat::expect_true(all(class(dt2$lex.Xst) %in% class(dt1$lex.Xst))) - } - - invisible(NULL) -} - -compareSLDTWithEpi <- function(data, breaks, timeScale) { - requireNamespace("Epi") - requireNamespace("testthat") - - if (!inherits(data, "Lexis")) stop("data gotta be a Lexis object broseph") - - lexDT <- splitLexisDT(data, breaks = breaks, timeScale = timeScale, merge = TRUE, drop = FALSE) - lexDTdrop <- splitLexisDT(data, breaks = breaks, timeScale = timeScale, merge = TRUE, drop = TRUE) - lexDF <- splitLexis(data, breaks = breaks, time.scale = timeScale) ## without dropping - ## this treatment done in splitLexisDT (difftime -> integer -> double) - harmonizeNumericTimeScales(lexDF, times = c(Epi::timeScales(lexDF), "lex.dur")) - - BL <- list(breaks) - setattr(BL, "names", timeScale) - - doComparisonWithEpi(lexDT = lexDT, lexDTdrop = lexDTdrop, lexDF = lexDF, breaks = BL) - - invisible(NULL) -} - - - - - -splitMultiEpi <- function(data, breaks = list(fot = 0:5), drop) { - - for (k in names(breaks)) { - data <- splitLexis(data, breaks = breaks[[k]], time.scale = k) - } - - forceLexisDT( - data, breaks = attr(data, "breaks"), - allScales = attr(data, "time.scales"), - key = FALSE - ) - if (drop) data <- intelliDrop(data, breaks = breaks) - data -} - - - - - -compareSMWithEpi <- function(data, breaks = list(fot=0:5)) { - requireNamespace("Epi") - requireNamespace("testthat") - - lexDT <- splitMulti(data, breaks = breaks, merge = TRUE, drop = FALSE) - lexDTdrop <- splitMulti(data, breaks = breaks, merge = TRUE, drop = TRUE) - lexDF <- splitMultiEpi(data, breaks = breaks, drop = FALSE) - - doComparisonWithEpi(lexDT=lexDT, lexDTdrop = lexDTdrop, lexDF=lexDF, breaks = breaks) - - invisible(NULL) -} - - - - - -summarize_Lexis <- function(x) { - - lex.Cst <- lex.Xst <- NULL ## appease R CMD CHECK - - dur <- sum(x$lex.dur) - status_vars <- paste0("lex.", c("Cst", "Xst")) - time_scales <- copy(attr(x, "time.scales")) - dt <- copy(setDT( - mget(c(status_vars, "lex.id", time_scales[1]), as.environment(x)) - )) - setkeyv(dt, c("lex.id", time_scales[1])) - dt <- unique(dt, by = c("lex.id"), fromLast = TRUE) - - n <- dt[, .N, keyby = status_vars] - rm("dt") - n[, "transition" := paste0(lex.Cst, "->", lex.Xst)] - n <- cast_simple(data = n, columns = "transition", values = "N") - return(cbind(lex.dur = dur, n)) -} - - - - - -roll_lexis_status_inplace <- function(unsplit.data, split.data, id.var) { - - ## R CMD CHECK appeasement - lex.Cst <- lex.Xst <- NULL - - stopifnot( - is.data.table(split.data), - length(key(split.data)) > 1, - key(split.data)[1] == id.var, - key(split.data)[2] %in% attr(unsplit.data, "time.scales"), - id.var %in% names(unsplit.data), - id.var %in% names(split.data), - uniqueN(unsplit.data[[id.var]]) == nrow(unsplit.data) - ) - - status_vars <- c("lex.Cst", "lex.Xst") - status_ud <- mget_cols(c(id.var, status_vars), unsplit.data) - - join <- structure(list(split.data[[id.var]]), names = id.var) - lex_cst <- status_ud[ - i = join, - j = lex.Cst, - on = id.var - ] - storage.mode(lex_cst) <- storage.mode(split.data[["lex.Cst"]]) - set(split.data, j = status_vars, value = list(lex_cst, lex_cst)) - - wh_last_row <- which(!duplicated(split.data, by = id.var, fromLast = TRUE)) - join <- structure(list(split.data[[id.var]][wh_last_row]), names = id.var) - last_lex_xst <- status_ud[ - i = join, - j = lex.Xst, - on = id.var - ] - storage.mode(last_lex_xst) <- storage.mode(split.data[["lex.Xst"]]) - set(split.data, i = wh_last_row, j = "lex.Xst", value = last_lex_xst) - - - NULL -} - - - - -random_splitting_on <- function( - lex, - n.max.breaks = 20 -) { - stopifnot( - inherits(lex, "Lexis") - ) - - ts_nms <- attr(lex, "time.scales") - brks <- attr(lex, "breaks") - timesince <- attr(lex, "time.since") - lex_vars <- c(paste0("lex.", c("id", "Cst", "Xst", "dur")), ts_nms) - non_lex_vars <- setdiff(names(lex), lex_vars) - lex <- mget_cols(lex_vars, lex) - setattr(lex, "time.scales", ts_nms) - setattr(lex, "breaks", brks) - setattr(lex, "time.since", timesince) - setattr(lex, "class", c("Lexis", "data.table", "data.frame")) - checkLexisData(lex) - - n_split_ts <- sample(seq_along(ts_nms), 1) - split_ts_nms <- sample(ts_nms, size = n_split_ts) - - do_drop <- sample(list(FALSE, TRUE), size = 1)[[1]] - - bl <- lapply(split_ts_nms, function(split_ts_nm) { - r <- c(min(lex[[split_ts_nm]]), max(lex[[split_ts_nm]] + lex[["lex.dur"]])) - d <- diff(r) - - br_r <- if (do_drop) 2:n.max.breaks else 1:n.max.breaks - n_br <- sample(br_r, 1) - - ## allow breaks outside observed data, but at least one break must be - ## not outside range of values in data - extrema <- r + c(-1,1)*d*0.05 - l <- rep(extrema[1], n_br) - u <- rep(extrema[2], n_br) - u[1] <- l[1] <- mean(r) - unique(runif(min = l, max = u, n = n_br)) - }) - names(bl) <- split_ts_nms - - es <- ps <- lex - for (ts_nm in split_ts_nms) { - es <- Epi::splitLexis(es, breaks = bl[[ts_nm]], time.scale = ts_nm) - forceLexisDT(es, breaks = attr(es, "breaks"), allScales = ts_nms, - key = FALSE) - if (do_drop) { - es <- intelliDrop(es, breaks = bl[ts_nm]) - } - ps <- splitLexisDT(ps, breaks = bl[[ts_nm]], timeScale = ts_nm, - drop = do_drop) - } - - psm <- splitMulti(lex, breaks = bl, drop = do_drop) - - list(es = es, ps = ps, psm = psm) -} - - - - - -random_Lexis <- function( - n.rows = c(100, 1000, 2000), - n.time.scales = 1:10, - n.statuses = 2:10, - n.other.vars = 1 -) { - - row_n <- sample(as.list(n.rows), 1)[[1]] - - ts_n <- sample(as.list(n.time.scales), 1)[[1]] - - st_n <- sample(as.list(n.statuses), 1)[[1]] - - dt <- setDT(lapply(1:ts_n, function(i) { - runif(min = 0, max = 1000, n = row_n) - })) - ts_nms <- paste0("lex_ts_", formatC(seq_len(ncol(dt)), flag = "0", width = 3)) - setnames(dt, names(dt), ts_nms) - - dt[, "lex.Cst" := sample(1:st_n, size = .N, replace = TRUE)] - dt[, "lex.Xst" := sample(1:st_n, size = .N, replace = TRUE)] - dt[, "lex.id" := sample(1:.N, .N, replace = FALSE)] - dt[, "lex.dur" := runif(n = .N, min = 0, max = 10)] - - oth_n <- sample(as.list(n.other.vars), 1)[[1]] - lapply(seq_len(oth_n), function(i) { - set( - dt, j = makeTempVarName(names = names(dt), pre = "nonlexvar_"), - value = sample(1:100, size = nrow(dt), replace = TRUE) - ) - }) - - brks <- lapply(ts_nms, function(nm) NULL) - names(brks) <- ts_nms - - forceLexisDT(dt, breaks = brks, allScales = ts_nms, key = TRUE) - checkLexisData(dt, check.breaks = TRUE) - dt[] -} - - - - - -random_splitting_on_random_data <- function( - n.datasets = 100, - n.rows = 1000, - n.time.scales = 1:10, - n.breaks = 10:100, - n.statuses = 1:5, - n.other.vars = 1 -) { - - neql <- vector("list", n.datasets) - - for (i in 1:n.datasets) { - - - set.seed(get_random_seed()) - - drop <- sample(list(TRUE, FALSE), 1)[[1]] - drop <- FALSE - - dt <- random_Lexis( - n.rows = n.rows, - n.time.scales = n.time.scales, - n.statuses = n.statuses, - n.other.vars = n.other.vars - ) - - ts_names <- copy(attr(dt, "time.scales")) - ts_n <- length(ts_names) - - dt_bl <- lapply(ts_names, function(x) NULL) - names(dt_bl) <- ts_names - forceLexisDT(dt, breaks = dt_bl, allScales = ts_names) - - br_n <- unlist(sample(as.list(n.breaks), ts_n)) - names(br_n) <- ts_names - BL <- lapply(ts_names, function(ts_name) { - runif(n = br_n[[ts_name]], min = -100, max = 100) - }) - names(BL) <- ts_names - - BL <- BL[sample(ts_names, ts_n)] - - split_pop <- splitMulti(dt, breaks = BL, drop = drop) - split_epi <- splitMultiEpi(dt, breaks = BL, drop = drop) - - setkeyv(split_epi, c("lex.id", ts_names[1])) - setkeyv(split_pop, c("lex.id", ts_names[1])) - - summary_epi <- summarize_Lexis(split_epi) - summary_pop <- summarize_Lexis(split_pop) - - eq <- all.equal(summary_pop, summary_epi, check.attributes = FALSE) - if (!isTRUE(eq)) { - message("split_epi, split_pop not equal in tick ", i, "") - neql[[i]] <- mget(c( - "drop", "row_n", "ts_n", "ts_names", - "dt", "dt_bl", "br_n", "BL", - "split_epi", "split_pop", "eq", - "summary_epi", "summary_pop", - "used_seed" - )) - } - } - - neql[vapply(neql, is.null, logical(1))] - neql -} - - - - - -do_split <- function(x, ts, all.ts, breaks, drop = TRUE, merge = TRUE) { - - ## unfinished v2 splitlexisDT work horse - stopifnot( - is.integer(x[["lex.id"]]) - ) - - id_dt <- data.table( - "orig" = x[["lex.id"]], - "temp" = 1:nrow(x) - ) - set(x, j = "lex.id", value = id_dt[["temp"]]) - - split <- mget_cols(c(ts, "lex.id", "lex.dur"), x) - - BL <- structure(list(breaks), names = ts) - if (drop) { - split <- intelliCrop(split, breaks = BL, allScales = all.ts, - cropStatuses = TRUE) - split <- intelliDrop(x = split, breaks = BL, - check = FALSE, dropNegDur = TRUE) - } - n_subjects <- nrow(split) - ts_values <- split[[ts]] - - split <- rbindlist(lapply(1:length(breaks), function(i) get("split"))) - - tmp_ie_nm <- makeTempVarName(names = names(x), pre = "do_split_tmp_ie_") - - set(split, j = tmp_ie_nm, value = rep(breaks, each = n_subjects)) - set(split, j = tmp_ie_nm, value = { - pmin(split[[tmp_ie_nm]], split[[ts]] + split[["lex.dur"]]) - }) - set(split, j = ts, value = c( - ts_values, - pmax(ts_values, rep(breaks[-length(breaks)], each = n_subjects)) - )) - -} - - - - - +all_breaks_in <- function(bl1, bl2, x = NULL) { + ## INTENTION: return TRUE/FALSE depending on whether bl1 is a subset of bl2; + ## this means that each element in bl1 exists in bl2, and that those elements + ## are each subsets of the corresponding elements in bl2. + ## this is handy to check whether the some Lexis data has already + ## been split using the breaks in bl1. + ## NOTE: use checkBreakList() on each list separately before this. + + if (!is.list(bl1) || !is.list(bl2)) { + stop("Arguments bl1 and bl2 must be lists of breaks as supplied to e.g. ", + "splitMulti.") + } + + if (inherits(x, "Lexis")) { + checkLexisData(x) + checkBreaksList(x, bl1) + checkBreaksList(x, bl2) + } + + ce <- intersect(names(bl1), names(bl2)) + if (length(ce) != length(bl1)) return(FALSE) + + test <- mapply(function(l1, l2) { + all(l1 %in% l2) + }, l1 = bl1, l2 = bl2[ce], SIMPLIFY = FALSE) + + all(unlist(test)) +} + + +checkBreaksList <- function(x, breaks = list(fot = 0:5)) { + if (is.null(breaks)) stop("breaks is NULL") + if (!is.list(breaks)) stop("breaks needs to be a list") + if (!is.data.frame(x)) stop("x needs to be a data.frame") + timeScales <- names(breaks) + if (length(breaks) == 0L) stop("length of breaks list is zero") + if (length(timeScales) != length(breaks)) stop("breaks needs to be a fully named list") + + bad_scales <- setdiff(timeScales, names(x)) + if (length(bad_scales) > 0) { + stop("at least one breaks list name wasn't a variable in data; bad names: ", + paste0("'", bad_scales, "'", collapse = ", ")) + } + lens <- lapply(breaks, function(el) if (is.null(el)) -1 else length(el)) + badLens <- names(lens[unlist(lens) == 0L]) + if (length(badLens)) { + badLens <- paste0("'", badLens, "'", collapse = ", ") + stop("Elements in breaks list for the following time scales were of ", + "length zero but not NULL: ", badLens, ". Breaks list may only ", + "contain elements of length > 0 or elements that are NULL.") + } + invisible(NULL) +} + +checkPophaz <- function(lex, ph, haz.name = "haz") { + ## INTENTION: checks a Lexis data set against the pophaz data set for + ## consistency (e.g. existing variables to merge by) + + if (!is.data.frame(ph)) { + stop("Data set containing population/expected hazards must be a data.frame", + " (or a data.table, which is also a data.frame).") + } + + if (!haz.name %in% names(ph)) { + stop("Data set containing population/expected hazards does not contain a ", + "column named 'haz'. Make sure the name is exactly that (", + "case sensitive).") + } + + if (haz.name %in% names(lex)) { + stop("Lexis data set already contains a column named 'haz', which is a ", + "reserved name for the population hazard variable to be merged. ", + "Please rename/delete 'haz' from/in your Lexis data first.") + } + + if (!is.data.frame(ph)) { + stop("Data set of expected/population hazards must be a data.frame.") + } + + bn <- setdiff(names(ph), haz.name) + + if (length(bn) == 0L) { + stop("No variables in expected/population hazards data set to use in merge ", + "with Lexis data. Ensure that the pop. haz. data set containts some ", + "variables to merge by (e.g. sex, calendar year, and age group)") + } + if (!all(bn %in% names(lex))) { + badbn <- paste0("'", setdiff(bn, names(lex)), "'", collapse = ", ") + stop("Lexis data set did not have following variable(s) that were in ", + "the expected/population hazards data set: ", badbn,". ", + "Ensure you have supplied the right data and that the names of the ", + "intended variables match.") + } + + mergeVars <- setdiff(names(ph), haz.name) + dup <- any(duplicated(as.data.table(ph), by = mergeVars)) + if (dup) { + stop("Supplied data set of population/expected hzards has duplicated rows ", + "by the variables ", paste0("'",mergeVars, "'", collapse = ", "), + " which prevents correct usage of the data set. Please ensure no rows", + " area duplicated in the data set before proceeding. Tip: use e.g. ", + "duplicated(PH, by = c('V1', 'V2')) to check for duplicatedness in ", + "your data set (here named PH) by the variables V1 and V2." + ) + } + + invisible() +} + + + + + +intelliCrop <- function( + x, + breaks = list(fot = 0:5), + allScales = NULL, + cropStatuses = FALSE, + tol = .Machine$double.eps^0.5 +) { + + ## appease R CMD CHECK + lex.dur <- lex.Xst <- lex.Cst <- NULL + + checkBreaksList(x = x, breaks = breaks) + breaks[unlist(lapply(breaks, length)) == 0L] <- NULL + if (!is.data.table(x)) stop("x needs to be a data.table") + + cropScales <- names(breaks) + + all_names_present(x, c("lex.dur", allScales)) + + if (cropStatuses) { + origEnd <- x$lex.dur + x[[allScales[1L]]] + } + + + deltas <- mapply(function(b, y) pmax(min(b), y) - y, SIMPLIFY = FALSE, + b = breaks, y = mget_cols(cropScales, x)) + ## below: baseline (zero value without assigning zero of bad class) + deltas <- c(deltas, list(x[[cropScales[1]]][1L] - x[[cropScales[1]]][1L])) + deltas <- do.call(pmax, deltas) + + set(x, j = allScales, value = mget_cols(allScales, x) + deltas) + set(x, j = "lex.dur", value = x[["lex.dur"]] - deltas) + + durs <- mapply(function(b, y) max(b) - y, SIMPLIFY = FALSE, + b = breaks, y = mget_cols(cropScales, x)) + durs$lex.dur <- x$lex.dur + durs <- do.call(pmin, durs) + ## now have max durs by row, i.e. up to roof of breaks at most, + ## or to ((original lex.dur) - (deltas)) if that is smaller. + ## (being cropped or exiting before roof of breaks) + + set(x, j = "lex.dur", value = durs) + + if (cropStatuses) { + harmonizeStatuses(x, C = "lex.Cst", X = "lex.Xst") + wh_was_cropped <- which(x[["lex.dur"]] + x[[allScales[1L]]] + tol < origEnd) + set(x, i = wh_was_cropped, j = "lex.Xst", + value = x[["lex.Cst"]][wh_was_cropped]) + } + + invisible(x) +} + + + + + +harmonizeStatuses <- function(x, C = "lex.Cst", X = "lex.Xst") { + + clC <- class(x[[C]]) + clX <- class(x[[X]]) + tyC <- typeof(x[[C]]) + tyX <- typeof(x[[X]]) + cl <- c(clC, clX) + + if (tyC != tyX && clC != clX) { + if (is.numeric(x[[C]]) && is.numeric(x[[X]])) { + harmonizeNumeric(x = x, v1="lex.Cst", v2="lex.Xst") + + } else if (is.factor(x[[C]]) || is.factor(x[[X]])) { + if (!is.factor(x[[C]])) set(x, j = C, value = as.factor(x[[C]])) + if (!is.factor(x[[X]])) set(x, j = X, value = as.factor(x[[X]])) + + } + } + + if (any(cl == "factor")) { + harmonizeFactors(x = x, v1="lex.Cst", v2="lex.Xst") + } + +} + +harmonizeNumericTimeScales <- function(x, times = NULL) { + ## INTENTION: given a Lexis data set with some time scales, ensure + ## that the classes of the time scales comply to the lowest denominator, + ## e.g. "double" and "integer" -> both "double" + + if (is.null(times)) { + times <- c(attr(x, "time.scales"), "lex.dur") + } + + msg <- paste0("Expected working data to have time scales %%VARS%%, but it ", + "didn't. This is an internal error: If you see this, complain ", + "to the package maintainer.") + all_names_present(x, times, msg = msg) + xt <- lapply(times, function(ch) x[[ch]]) + names(xt) <- times + + harmoClasses <- c("numeric", "integer", "difftime") + cl <- lapply(xt, class) + wh <- unlist(lapply(cl, function(ch) { + any(ch %in% harmoClasses) + })) + ha <- times[wh] + hacl <- unique(unlist(cl[wh])) + + if (length(ha) > 1L) { + ## more than one class present and need to use common lowest denom + newMode <- as.double + + if (all(ha %in% c("integer", "difftime"))) { + ## all numeric times are integers or difftimes + newMode <- as.integer + } + for (var in ha) { + ## modify in place + set(x, j = var, value = newMode(x[[var]])) + } + + + } + invisible(NULL) +} + + + + + +harmonizeNumeric <- function(x, v1="lex.Cst", v2="lex.Xst") { + ## assumes v1, v2 are numeric variable names in x + + if (!is.numeric(x[[v1]]) || !is.numeric(x[[v2]])) { + print(class(x[[v1]])) + print(class(x[[v2]])) + stop("v1 and/or v2 is/are not of class numeric") + } + + if (!is.integer(x[[v1]])) set(x, j = v1, value = try2int(x[[v1]])) + if (!is.integer(x[[v2]])) set(x, j = v2, value = try2int(x[[v2]])) + + if (typeof(x[[v1]]) != typeof(x[[v2]])) { + + if (is.double(x[[v1]])) set(x, j = v1, value = as.double(x[[v1]])) + if (is.double(x[[v2]])) set(x, j = v2, value = as.double(x[[v2]])) + + } + +} + + + + + +harmonizeFactors <- function(x, v1="lex.Cst", v2="lex.Xst") { + ## assumes v1, v2 are factor names in x + + if (!is.factor(x[[v1]]) || !is.factor(x[[v2]])) { + stop("v1 and/or v2 is/are not of class factor") + } + + glab1 <- union(levels(x[[v1]]), levels(x[[v2]])) + glab2 <- union(levels(x[[v2]]), levels(x[[v1]])) + + + + setattr(x[[v1]], "levels", glab1) + setattr(x[[v2]], "levels", glab2) + +} + + + + + +intelliDrop <- function(x, breaks = list(fot = 0:5), dropNegDur = TRUE, check = FALSE, tol = .Machine$double.eps^0.5, subset = NULL) { + + if (!is.data.table(x)) { + stop("x needs to be a data.table; if you see this message, complain ", + "to the package maintainer") + } + checkBreaksList(x = x, breaks = breaks) + breaks[unlist(lapply(breaks, length)) < 2] <- NULL + timeScales <- names(breaks) + + if (check) { + checkLexisData(x) + } + + ra <- lapply(breaks, range) + ra <- lapply(ra, diff) + ts <- names(sort(unlist(ra))) ## shortest first + mi <- lapply(breaks, min) + ma <- lapply(breaks, max) + + substi <- substitute(subset) + subset <- evalLogicalSubset(x, substiset = substi) + + if (dropNegDur) subset[subset] <- subset[subset] & x$lex.dur[subset] > 0L + + ## figure out latest exit and first entry; don't need to test for dropping + ## if e.g. all left follow-up before the max in breaks + max_end <- lapply(ts, function(ch) min(x[[ch]] + x$lex.dur)) + min_start <- lapply(ts, function(ch) max(x[[ch]])) + names(max_end) <- names(min_start) <- ts + + for (k in ts) { + mik <- mi[[k]] + mak <- ma[[k]] + + if (max_end[[k]] < mak + tol) { + tmpSD <- x[subset, .SD, .SDcols = c(k, "lex.dur")] + tmpSD <- setDT(lapply(tmpSD, as.numeric)) + subset[subset] <- rowSums(tmpSD) <= mak + tol + } + if (min_start[[k]] + tol > mik) { + subset[subset] <- x[subset,][[k]] > mik - tol + } + + if (all(!subset)) { + stop("Dropped all remaining rows from data when subsetting by the ", + "Lexis time scale '", k, "'. Range of values in data: ", + paste0(round(range(x[[k]]),4), collapse = "-"), ". Min/Max breaks ", + "(used to subset data): ", mik, "/", mak, ".") + } + + } + + + x[subset, ] +} + + +matchBreakTypes <- function(lex, breaks, timeScale, modify.lex = FALSE) { + if (is.character(breaks)) { + breaks <- as.IDate(breaks) + } + clb <- class(breaks) + clb <- clb[length(clb)] + cts <- class(lex[[timeScale]]) + cts <- cts[length(cts)] + + if (clb != cts) { + if (is.Date(breaks) && !is.Date(lex[[timeScale]])) { + breaks <- try2int(as.double(breaks)) + } else if (is.integer(breaks) && is.double(lex[[timeScale]])) { + breaks <- as.double(breaks) + } else if (is.double(breaks) && is.integer(lex[[timeScale]])) { + breaks <- try2int(breaks) + } + + } + + if (modify.lex && clb != cts) { + if (!is.Date(breaks) && is.Date(lex[[timeScale]])) { + + if (clb == "double") { + set(lex, j = timeScale, value = as.double(lex[[timeScale]])) + } else { + set(lex, j = timeScale, value = as.integer(lex[[timeScale]])) + } + + } else if (is.double(breaks) && is.integer(lex[[timeScale]])) { + set(lex, j = timeScale, value = as.double(lex[[timeScale]])) + } + + } + breaks +} + +protectFromDrop <- function(breaks, lower = FALSE) { + old_breaks <- copy(breaks) + if (length(breaks) == 0L) { + stop("Length of breaks to 'protect' from dropping is zero.") + } + if (is.Date(breaks)) { + breaks <- c(breaks, max(breaks) + 1e4L) + if (lower) breaks <- c(min(breaks) - 1e4L, breaks) + + } else if (is.integer(breaks)) { + breaks <- c(breaks, 1e6L) + if (lower) breaks <- c(-1e6L, breaks) + + } else if (is.double(breaks)) { + breaks <- c(breaks, Inf) + if (lower) breaks <- c(-Inf, breaks) + + } else { + stop("breaks were not Date, integer or double") + } + setattr(breaks, "unprotected", old_breaks) + breaks +} + +unprotectFromDrop <- function(breaks) { + up <- attr(breaks, "unprotected") + if (is.null(up) || length(up) == 0L) { + stop("Could not 'unprotect' breaks from dropping as the required ", + "attribute was not found. If you see this it is most likely ", + "an internal error and you should complain to the pkg maintainer.") + } + up +} + + + + +setLexisDT <- function(data, entry, exit, entry.status, exit.status, id = NULL, select = NULL) { + + ## appease R CMD CHECK + lex.Cst <- lex.Xst <- NULL + + if (!is.data.table(data)) stop("not a data.table") + if (inherits(data, "Lexis")) stop("already a Lexis object") + + if (!is.null(select) && !is.character(select)) stop("select was not a character vector of names") + + entry <- substitute(entry) + exit <- substitute(exit) + entry <- eval(entry, envir = data, enclos = parent.frame()) + exit <- eval(exit, envir = data, enclos = parent.frame()) + enNames <- names(entry) + exNames <- names(exit) + + timeScales <- union(enNames, exNames) + if (any(timeScales %in% names(data))) stop("at least one named time scales already present in data; original names mandatory") + enNeeded <- setdiff(timeScales, enNames) + enPresent <- setdiff(timeScales, enNeeded) + durVar <- intersect(enNames, exNames) + if (length(durVar) > 1) stop("you have more than 1 time scales in both entry and exit; only one mandatory") + + enVars <- paste0(enNames, "_en") + exVars <- paste0(exNames, "_ex") + setattr(entry, "names", enVars) + setattr(exit, "names", exVars) + + l <- as.data.table(c(entry, exit)) + rm(entry, exit) + + ## duration + exV <- paste0(durVar, "_ex") + enV <- paste0(durVar, "_en") + set(l, j = "lex.dur", value = l[[exV]] - l[[enV]]) + rm(exV, enV) + + ## time scale starting points + if (length(enNeeded) > 0) { + for (ts in enNeeded) { + exV <- paste0(ts, "_ex") + set(l, j = ts, value = l[[exV]] - l$lex.dur) + } + } + setnames(l, paste0(enPresent, "_en"), enPresent) + + # no longer need time scale end points + for (k in exVars) { + set(l, j = k, value = NULL) + } + + ## status definition + data[, lex.Cst := entry.status] + data[, lex.Xst := exit.status] + + harmonizeStatuses(data, C = "lex.Cst", X = "lex.Xst") + + ## all time scales etc. into data + data[, names(l) := l] + + + id <- substitute(id) + id <- eval(id, envir = data, enclos = parent.frame()) + if (!is.null(id)) set(data, j = "lex.id", value = id) + rm(id) + + if (!is.null(select)) { + + delVars <- setdiff(names(data), c(names(l), select)) + if (length(delVars) > 0) { + l[, (delVars) := NULL] + } + } + + rm(l) + lexVars <- c("lex.id", timeScales, "lex.dur", "lex.Cst", "lex.Xst") + setcolorder(data, c(lexVars, setdiff(names(data), lexVars))) + + setattr(data, "time.scales", timeScales) + setattr(data, "time.since", rep("", times = length(timeScales))) + setattr(data, "class", c("Lexis", "data.table", "data.frame")) + + +} + +checkLexisData <- function(lex, check.breaks = FALSE) { + ## INTENTION: checks Lexis attributes + ## OUTPUT: nothing + + if (is.null(lex) || nrow(lex) == 0) stop("Data is NULL or has zero rows") + if (!inherits(lex, "Lexis")) stop("Data not a Lexis object") + allScales <- attr(lex, "time.scales") + if (length(allScales) == 0) stop("no time scales appear to be defined; is data a Lexis object?") + + badScales <- setdiff(allScales, names(lex)) + if (length(badScales) > 0) { + badScales <- paste0("'", badScales, "'", collapse = ", ") + stop("Following time scales found in data's attributes but not present in data: ", badScales) + } + + lexVars <- c("lex.dur", "lex.id", "lex.Cst", "lex.Xst") + blv <- setdiff(lexVars, names(lex)) + if (length(blv) > 0) { + blv <- paste0("'", blv, "'", collapse = ", ") + stop("Following Lexis variables not found in data: ", blv) + } + + if (check.breaks) { + BL <- attr(lex, "breaks") + if (is.null(BL)) stop("No breaks list in data attributes") + checkBreaksList(lex, breaks = BL) + } + + invisible() +} + + +splitMultiPreCheck <- function(data = NULL, breaks = NULL, ...) { + + ## INTENTION: checks for discrepancies between data and breaks, etc. + ## OUTPUT: cleaned-up list of breaks + checkLexisData(data) + allScales <- attr(data, "time.scales") + + if (!is.null(breaks) && !is.list(breaks)) stop("breaks must be a list; see examples in ?splitMulti") + if (is.null(breaks)) { + breaks <- list(...) + breaks <- breaks[intersect(names(breaks), allScales)] + } + + if (length(breaks) == 0) stop("no breaks defined!") + + splitScales <- names(breaks) + ## NULL breaks imply not used + for (k in splitScales) { + if (length(breaks[[k]]) == 0) { + breaks[k] <- NULL + } + } + + checkBreaksList(x = data, breaks = breaks) + + splitScales <- names(breaks) + + if (!all(splitScales %in% allScales)) { + stop("breaks must be a list with at least one named vector corresponding to used time scales \n + e.g. breaks = list(fot = 0:5)") + } + + if (!all(splitScales %in% names(data))) { + stop("At least one vector name in breaks list is not a variable name in the data") + } + breaks +} + +forceLexisDT <- function(x, breaks = NULL, allScales = NULL, key = TRUE) { + setattr(x, "class", c("Lexis", "data.table", "data.frame")) + setattr(x, "breaks", breaks) + setattr(x, "time.scales", allScales) + # alloc.col(x) + if (key) setkeyv(x, c("lex.id", names(breaks)[1L])) + invisible(x) +} + + +doCutLexisDT <- function(lex, cut = dg_date, timeScale = "per", by = "lex.id", n = 1L) { + + checkLexisData(lex, check.breaks = FALSE) + + x <- unique(lex, by = by) + cut <- evalq(cut, envir = lex, enclos = parent.frame(n = n + 1L)) + delta <- cut - x[[timeScale]] + + allScales <- attr(lex, "time.scales") + + setDT(x) + for (v in allScales) { + set(x, j = v, value = x[[v]] + delta) + } + + set(x, j = "lex.dur", value = 0) + + tmp <- list() + tmp$isCut <- makeTempVarName(lex, pre = "isCut_") + + set(x, j = tmp$isCut, value = 1L) + on.exit(setcolsnull(x, unlist(tmp))) + + x <- rbindlist(list(lex, x), use.names = TRUE, fill = TRUE) + x[1:nrow(lex), (tmp$isCut) := 0L] + + ## NOTE: new cut row being the first or last row + ## implies it resides outside old observations + ## OR it is equal to lowest/highest value + setkeyv(x, c(by, allScales)) + setkeyv(x, by) + x <- x[!((duplicated(x, by = key(x)) | duplicated(x, by = key(x), fromLast = TRUE)) & x[[tmp$isCut]] == 0L)] + stop("not ready") +} + +# data <- data.table(birth = 2000:2000, entry=2002:2003, +# exit=2011:2012, event=c(2010,2011), +# status=1:0) +# +# lex <- lexpand(data = data, birth = birth, entry = entry, +# exit = exit, event = event, +# id = 1L, entry.status = 99L, +# status = status, overlapping = T) + +lexpile <- function(lex, by = "lex.id", subset = NULL) { + ## PURPOSE: given several rows per id in a Lexis object, + ## collate data into form where + ## - no subject has any overlapping time lines + ## - lex.Cst and lex.Xst are logical, i.e. 0 -> 1, 1 -> 1, 1 -> 2 + ## this should be made to work with both split and unsplit Lexis data. + + data <- NULL # R CMD CHECK appeasement + + checkLexisData(lex, check.breaks = FALSE) + + allScales <- attr(lex, "time.scales") + sc <- allScales[1L] + + all_names_present(lex, by) + + if (is.character(lex$lex.Cst) || is.character(lex$lex.Xst)) { + stop("This function requires lex.Cst and lex.Xst to be integer, double (i.e. numeric) or factor variables to determine the order of possible statuses!") + } + + ## need to take copy eventually ---------------------------------------------- + attrs <- attributes(lex) + subset <- evalLogicalSubset(data, substitute(subset)) + x <- lex[subset,] + forceLexisDT(x, breaks = attrs$breaks, allScales = attrs$time.scales) + alloc.col(x) + + + ## ensure status harmony ----------------------------------------------------- + harmonizeStatuses(x = x, X = "lex.Xst", C = "lex.Cst") + exStat <- if (is.factor(x$lex.Xst)) levels(x$lex.Xst) else sort(unique(x$lex.Xst)) + enStat <- if (is.factor(x$lex.Cst)) levels(x$lex.Cst) else sort(unique(x$lex.Cst)) + allStat <- c(setdiff(enStat, exStat), exStat) ## enStat & exStat equal if factors used + + ## avoiding side effects ----------------------------------------------------- + oldKey <- key(lex) + tmp <- list() + tmp$order<- makeTempVarName(x, pre = "order_") + + on.exit({ + if (length(oldKey) > 0) setkeyv(x, oldKey) else + setorderv(x, tmp$order) + }, add = TRUE) + + on.exit({ + setcolsnull(x, unlist(tmp$order), soft = TRUE) + }, add = TRUE) + + x[, c(tmp$order) := 1:.N] + + ## check for need for lexpiling ---------------------------------------------- + setkeyv(x, by) + if (sum(duplicated(x, by = key(x))) == 0L) return(lex) + + + ## figure out what statuses are used ----------------------------------------- + + tmp$ev <- makeTempVarName(x, pre = "event_") + x[, c(tmp$ev) := detectEvents(x, breaks = attrs$breaks, by = by)] + + tmp$scEnds <- paste0(allScales, "_end") + tmp$scEnds <- makeTempVarName(lex, pre = tmp$scEnds) + x[, c(tmp$scEnds) := lapply(.SD, function(x) x + lex$lex.dur), .SDcols = allScales] + + ## NOTE: rows for a given subject ending in simultaneously with at least + ## one being a transition will not be allowed. + setkeyv(x, c(by, tmp$scEnds[1L])) + + whDup <- duplicated(x, fromLast = FALSE, by = key(x)) | duplicated(x, fromLast = TRUE, by = key(x)) + dupTest <- x[whDup, 1L %in% unique(.SD), .SDcols = tmp$ev] + rm(whDup) + + if (dupTest) stop("At least one subject had at least two simultaneous events.") + ## NOTE: if interval ends AND status are the very same for M rows, + ## then the M rows are necessarily nested with one or more covering + ## the whole time line. Only need to keep the one. + setkeyv(x, c(tmp$scEnds, "lex.Cst", "lex.Xst")) + setorderv(x, c(allScales,tmp$scEnds, "lex.Cst", "lex.Xst")) + + x <- unique(x, by = key(x)) + stop("unfinished") + +} + +contractLexis <- function(x, breaks, drop = TRUE) { + stop("This doesnt do anything yet") + ## INTENTION: given a Lexis object and breaks, + ## ensures data is split by the breaks and contracts the split rows + ## so that the data is split at the level of the supplied breaks. + ## e.g. with x split by fot = seq(0, 5, 1/12) and with supplying + ## breaks = list(fot = 0:5), rows within 0-1 are collated into one row etc. + + ## PROBLEM: a subject may have e.g. rows spaning 0.5 - 1 which are requested + ## to be contracted to one row spanning 0-1. + + +} + + + + +#' @title Prepare Exposure Data for Aggregation +#' @description \code{prepExpo} uses a \code{Lexis} object of periods of exposure +#' to fill gaps between the periods and overall entry and exit times without +#' accumulating exposure time in periods of no exposure, and splits the +#' result if requested. +#' @param lex a \code{\link[Epi]{Lexis}} object with ONLY periods of exposure +#' as rows; one or multiple rows per subject allowed +#' @param freezeScales a character vector naming \code{Lexis} time scales of exposure +#' which should be frozen in periods where no exposure occurs (in the gap +#' time periods) +#' @param cutScale the \code{Lexis} time scale along which the subject-specific +#' ultimate entry and exit times are specified +#' @param entry an expression; the time of entry to follow-up which may be earlier, at, or after +#' the first time of exposure in \code{freezeScales}; evaluated separately +#' for each unique combination of \code{by}, so e.g. with +#' \code{entry = min(Var1)} and \code{by = "lex.id"} it +#' sets the \code{lex.id}-specific minima of \code{Var1} to be the original times +#' of entry for each \code{lex.id} +#' @param exit the same as \code{entry} but for the ultimate exit time per unique +#' combination of \code{by} +#' @param by a character vector indicating variable names in \code{lex}, +#' the unique combinations of which identify separate subjects for which +#' to fill gaps in the records from \code{entry} to \code{exit}; +#' for novices of \code{{\link{data.table}}}, this is passed to a +#' \code{data.table}'s \code{by} argument. +#' @param breaks a named list of breaks; +#' e.g. \code{list(work = 0:20,per = 1995:2015)}; passed on to +#' \code{\link{splitMulti}} so see that function's help for more details +#' @param freezeDummy a character string; specifies the name for a dummy variable +#' that this function will create and add to output which +#' identifies rows where the \code{freezeScales} are frozen and where not +#' (\code{0} implies not frozen, \code{1} implies frozen); +#' if \code{NULL}, no dummy is created +#' @param subset a logical condition to subset data by before computations; +#' e.g. \code{subset = sex == "male"} +#' @param verbose logical; if \code{TRUE}, the function is chatty and returns +#' some messages and timings during its run. +#' @param ... additional arguments passed on to \code{\link{splitMulti}} +#' @details +#' +#' \code{prepExpo} is a convenience function for the purpose of eventually aggregating +#' person-time and events in categories of not only normally progressing +#' \code{Lexis} time scales but also some time scales which should not +#' progress sometimes. For example a person may work at a production facility +#' only intermittently, meaning exposure time (to work-related substances +#' for example) should not progress outside of periods of work. This allows for +#' e.g. a correct aggregation of person-time and events by categories of cumulative +#' time of exposure. +#' +#' Given a \code{Lexis} object containing rows (time lines) +#' where a subject is exposed to something (and NO periods without exposure), +#' fills any gaps between exposure periods for each unique combination of \code{by} +#' and the subject-specific "ultimate" \code{entry} and \code{exit} times, +#' "freezes" the cumulative exposure times in periods of no exposure, +#' and splits data using \code{breaks} passed to \code{\link{splitMulti}} +#' if requested. Results in a (split) \code{Lexis} object where \code{freezeScales} +#' do not progress in time periods where no exposure was recorded in \code{lex}. +#' +#' This function assumes that \code{entry} and \code{exit} arguments are the +#' same for each row within a unique combination of variables named in \code{by}. +#' E.g. with \code{by = "lex.id"} only each \code{lex.id} has a unique value +#' for \code{entry} and \code{exit} at most. +#' +#' The supplied \code{breaks} split the data using \code{splitMulti}, with +#' the exception that breaks supplied concerning any frozen time scales +#' ONLY split the rows where the time scales are not frozen. E.g. +#' with \code{freezeScales = "work"}, +#' \code{breaks = list(work = 0:10, cal = 1995:2010)} splits all rows over +#' \code{"cal"} but only non-frozen rows over \code{"work"}. +#' +#' Only supports frozen time scales that advance and freeze contemporaneously: +#' e.g. it would not currently be possible to take into account the cumulative +#' time working at a facility and the cumulative time doing a single task +#' at the facility, if the two are not exactly the same. On the other hand +#' one might use the same time scale for different exposure types, supply them +#' as separate rows, and identify the different exposures using a dummy variable. +#' @return +#' +#' Returns a \code{Lexis} object that has been split if \code{breaks} is specified. +#' The resulting time is also a \code{data.table} if +#' \code{options("popEpi.datatable") == TRUE} (see: \code{?popEpi}) +#' +#' @import data.table +#' @export +prepExpo <- function(lex, freezeScales = "work", cutScale = "per", entry = min(get(cutScale)), + exit = max(get(cutScale)), by = "lex.id", breaks = NULL, freezeDummy = NULL, subset = NULL, + verbose = FALSE, ...) { + ## R CMD CHECK appeasement + lex.dur <- NULL + + if (verbose) allTime <- proc.time() + + ## check breaks & data ------------------------------------------------------- + breaks <- evalq(breaks) + dumBreaks <- structure(list(c(-Inf, Inf)), names = cutScale, internal_prepExpo_dummy = TRUE) + if (is.null(breaks)) breaks <- dumBreaks + breaks <- splitMultiPreCheck(data = lex, breaks = breaks) + if (!is.null(attr(breaks, "internal_prepExpo_dummy"))) breaks <- NULL + checkLexisData(lex) + oldBreaks <- attr(lex, "breaks") + if (!is.null(breaks)) checkBreaksList(lex, breaks) + checkBreaksList(lex, oldBreaks) + + + ## data ---------------------------------------------------------------------- + + subset <- evalLogicalSubset(data = lex, substitute(subset)) + x <- if (!all(subset)) evalq(lex)[subset, ] else copy(evalq(lex)) + + setDT(x) + + allScales <- attr(lex, "time.scales") + linkScales <- setdiff(allScales, freezeScales) + othScales <- setdiff(linkScales, cutScale) + + setkeyv(x, c(by, cutScale)) + + l <- list() ## will hold temp var names; this avoids collisions with names of vars in x + l$cutScale <- cutScale + l$freezeScales <- freezeScales + l$liquidScales <- setdiff(allScales, freezeScales) + l$by <- by + rm(cutScale, freezeScales, by) + + ## args ---------------------------------------------------------------------- + if (verbose) argTime <- proc.time() + tol <- .Machine$double.eps^0.75 + + if (is.character(freezeDummy) && freezeDummy %in% names(lex)) stop("Variable named in freezeDummy already exists in data; freezeDummy is inteded for creating a new dummy for identifying the rows where freezeScales are frozen. Please supply an original variable name to freezeDummy") + + if (!is.character(l$by)) stop("by must be given as a vector of character strings naming columns in lex") + all_names_present(lex, l$by) + + enSub <- substitute(entry) + exSub <- substitute(exit) + + PF <- parent.frame(1L) + l$en <- makeTempVarName(x, pre = "entry_") + l$ex <- makeTempVarName(x, pre = "exit_") + x[, c(l$ex, l$en) := list(eval(exSub, envir = .SD, enclos = PF), + eval(enSub, envir = .SD, enclos = PF)), by = c(l$by)] + + ## tests disabled for now... + # testTime <- proc.time() + # + # test <- x[, .N, by = list(r = get(l$cutScale) + lex.dur > get(l$ex) - tol)] + # if(test[r == TRUE, .N] > 0) stop("exit must currently be higher than or equal to the maximum of cutScale (on subject basis defined using by); you may use breaks instead to limit the data") + # + # test <- x[, .N, by = list(r = get(l$cutScale) + tol < get(l$en))] + # if(test[r == TRUE, .N] > 0) stop("entry must currently be lower than or equal to the minimum of cutScale (on subject basis defined using by); you may use breaks instead to limit the data") + # if (verbose) cat("Finished checking entry and exit. Time taken: ", timetaken(argTime), "\n") + if (verbose) cat("Finished evaluating entry and exit and checking args. Time taken: ", timetaken(argTime), "\n") + + ## create rows to fill gaps -------------------------------------------------- + if (verbose) fillTime <- proc.time() + x2 <- copy(x) + x2[, (l$freezeScales) := NA] + x2 <- rbind(x2, unique(x2, by = c(l$by), fromLast = TRUE)) + + l$delta <- makeTempVarName(x2, pre = "delta_") + x2[, (l$delta) := c(get(l$en)[1], get(l$cutScale)[-c(1,.N)], max(get(l$cutScale)+lex.dur)) - get(l$cutScale), by = c(l$by)] + x2[, c(linkScales) := lapply(mget(linkScales), function(x) x + get(l$delta)), by = c(l$by)] + + setcolsnull(x2, l$delta) + + l$order <- makeTempVarName(x, pre = "order_") + x[, (l$order) := (1:.N)*2, by = c(l$by)] + x2[, (l$order) := (1:.N)*2-1, by = c(l$by)] + + x <- rbindlist(list(x, x2)) + rm(x2) + setkeyv(x, c(l$by, l$cutScale, l$order)) + setkeyv(x, c(l$by)) + set(x, j = l$order, value = as.integer(x[[l$order]])) + x[, (l$order) := 1:.N, by = c(l$by)] + + if (verbose) cat("Finished expanding data to accommodate filling gaps. Time taken: ", timetaken(fillTime), "\n") + ## handle time scale values -------------------------------------------------- + if (verbose) valueTime <- proc.time() + + l$CSE <- makeTempVarName(x, pre = paste0(l$cutScale, "_end_")) + l$LCS <- makeTempVarName(x, pre = paste0("lead1_",l$cutScale, "_")) + x[, (l$CSE) := lex.dur + get(l$cutScale)] + x[, (l$LCS) := shift(get(l$cutScale), n = 1L, type = c("lead"), fill = NA), by = c(l$by)] + + + x[!duplicated(x, fromLast = TRUE, by = key(x)), c(l$LCS, l$CSE) := get(l$ex)] + x[, (l$CSE) := pmin(get(l$LCS), get(l$CSE))] + x[, (l$cutScale) := sort(c(get(l$en)[1L],shift(get(l$CSE), n = 1L, type = "lag", fill = NA)[-1])), by = c(l$by)] + x[, lex.dur := get(l$CSE) - get(l$cutScale)] + + ## bring up other than frozen and cut scales to bear ------------------------- + x[, (othScales) := lapply(mget(othScales), function(x) {min(x) + c(0, cumsum(lex.dur)[-.N])}), by = c(l$by)] + + + ## frozen scales should make sense cumulatively ------------------------------ + ## indicates frozenness: 0 = not frozen, 1 = frozen + l$frz <- makeTempVarName(x, pre = "frozen_") + x[, (l$frz) := 0L] + frozens <- x[,is.na(get(l$freezeScales[1]))] + x[frozens, (l$frz) := 1L] + + + ## alternate method: just use lex.durs and only cumulate in non-frozen rows + x[, (l$freezeScales) := lapply(mget(l$freezeScales), function(x) { + x <- max(0, min(x-lex.dur, na.rm=TRUE)) + x <- x + c(0, as.double(cumsum(as.integer(!get(l$frz))*lex.dur))[-.N]) + }), by = c(l$by)] + + x <- x[lex.dur > .Machine$double.eps^0.5, ] + + if (verbose) cat("Finished computing correct values for time scales. Time taken: ", timetaken(valueTime), "\n") + + ## splitting separately ------------------------------------------------------ + if (!is.null(breaks)) { + if (verbose) splitTime <- proc.time() + x_frozen <- x[get(l$frz) == 1L,] + x <- x[get(l$frz) == 0L] + forceLexisDT(x, allScales = allScales, breaks = oldBreaks) + + ## NOTE: since we only split by the frozen time scales by pretending + ## they are NOT Lexis time scales (temporarily), and since one should + ## pass the appropriate breaks info of pre-existing breaks to splitMulti, + ## choose only breaks for non-frozen time scales to include in x_frozen's + ## attributes here. + ## (e.g. when work history is no longer accumulating) + frzBreaks <- breaks[l$liquidScales] + oldFrzBreaks <- oldBreaks[l$liquidScales] + emptyFrzBreaks <- vector("list", length = length(l$liquidScales)) + names(emptyFrzBreaks) <- l$liquidScales + if (length(oldFrzBreaks)) { + emptyFrzBreaks[names(oldFrzBreaks)] <- oldFrzBreaks + } + forceLexisDT(x_frozen, allScales = l$liquidScales, breaks = emptyFrzBreaks) + + if (length(frzBreaks) > 0) { + ## do (also) split for all time scales where also the frozen + ## time scales are split. This is allowed for times where the + ## frozen time scales have not been frozen + ## (e.g. work history is accumulating) + x_frozen <- splitMulti(x_frozen, breaks = frzBreaks, ...) + } + + ## do (also) split where also split + x <- splitMulti(x, breaks = breaks, ...) + breaks <- attr(x, "breaks") ## new breaks appended by splitMulti + + setDT(x) + setDT(x_frozen) + x <- rbindlist(list(x, x_frozen), use.names = TRUE); rm(x_frozen) + forceLexisDT(x, breaks = breaks, allScales = allScales) + if (verbose) cat("Finished splitting data. Time taken: ", timetaken(splitTime), "\n") + } + + ## final touch --------------------------------------------------------------- + + setDT(x) + if (is.character(freezeDummy)) setnames(x, l$frz, freezeDummy) + setkeyv(x, c(l$by, l$order)) + delCols <- setdiff(names(l), c("by", "cutScale", "freezeScales", + "liquidScales", + "linkScales", "allScales", "othScales")) + delCols <- unlist(l[delCols]) + setcolsnull(x, keep = names(lex), colorder = TRUE) + + setattr(x, "time.scales", allScales) + setattr(x, "breaks", breaks) + setattr(x, "time.since", rep("", length(allScales))) + setattr(x, "class", c("Lexis", "data.table", "data.frame")) + if (!return_DT()) setDFpe(x) + + if (verbose) cat("Finished prepExpo run. Time taken: ", timetaken(allTime), "\n") + + x[] +} + + + +doComparisonWithEpi <- function(lexDT, lexDTdrop, lexDF, breaks) { + BL <- NULL + if (!is.list(breaks)) stop("breaks needs to be a list") + requireNamespace("Epi") + requireNamespace("testthat") + + allScales <- attr(lexDF, "time.scales") + sc1 <- allScales[1] + setDT(lexDT) + setDT(lexDTdrop) + setDT(lexDF) + setkeyv(lexDT, c("lex.id", sc1)) + setkeyv(lexDTdrop, c("lex.id", sc1)) + setkeyv(lexDF, c("lex.id", sc1)) + + testthat::expect_equal(attr(lexDT, "time.scales"), attr(lexDF, "time.scales")) + testthat::expect_equal(attr(lexDT, "time.since"), attr(lexDF, "time.since")) + + testthat::expect_equal(attr(lexDTdrop, "time.scales"), attr(lexDF, "time.scales")) + testthat::expect_equal(attr(lexDTdrop, "time.since"), attr(lexDF, "time.since")) + + doTestBarrage(dt1 = lexDT, dt2 = lexDF, allScales = allScales) + rm(lexDT) + + lexDF <- intelliDrop(x = lexDF, breaks = breaks) + + doTestBarrage(dt1 = lexDTdrop, dt2 = lexDF, allScales = allScales) + +} + +doTestBarrage <- function(dt1, dt2, allScales, testTimes = TRUE, testStatuses = TRUE) { + requireNamespace("Epi") + requireNamespace("testthat") + + lex.id <- lex.dur <- NULL ## APPEASE R CMD CHECK + + testthat::expect_equal(sum(dt1$lex.dur), + sum(dt2$lex.dur), + check.attributes = FALSE) + testthat::expect_equal(dt1[, sum(lex.dur), keyby = lex.id]$V1, + dt2[, sum(lex.dur), keyby = lex.id]$V1, + check.attributes = FALSE) + + all_names_present(dt1, allScales) + all_names_present(dt2, allScales) + + if (testTimes) { + for (k in allScales) { + testthat::expect_equal(dt1[[k]], dt2[[k]], + check.attributes = TRUE) + } + } + + if (testStatuses) { + testthat::expect_equal(dt1$lex.Cst, dt2$lex.Cst, check.attributes = FALSE) + testthat::expect_equal(dt1$lex.Xst, dt2$lex.Xst, check.attributes = FALSE) + + testthat::expect_equal(levels(dt1$lex.Cst), levels(dt2$lex.Cst), check.attributes = FALSE) + testthat::expect_equal(levels(dt1$lex.Xst), levels(dt2$lex.Xst), check.attributes = FALSE) + + testthat::expect_true(all(class(dt2$lex.Cst) %in% class(dt1$lex.Cst))) + testthat::expect_true(all(class(dt2$lex.Xst) %in% class(dt1$lex.Xst))) + } + + invisible(NULL) +} + +compareSLDTWithEpi <- function(data, breaks, timeScale) { + requireNamespace("Epi") + requireNamespace("testthat") + + if (!inherits(data, "Lexis")) stop("data gotta be a Lexis object broseph") + + lexDT <- splitLexisDT(data, breaks = breaks, timeScale = timeScale, merge = TRUE, drop = FALSE) + lexDTdrop <- splitLexisDT(data, breaks = breaks, timeScale = timeScale, merge = TRUE, drop = TRUE) + lexDF <- splitLexis(data, breaks = breaks, time.scale = timeScale) ## without dropping + ## this treatment done in splitLexisDT (difftime -> integer -> double) + harmonizeNumericTimeScales(lexDF, times = c(Epi::timeScales(lexDF), "lex.dur")) + + BL <- list(breaks) + setattr(BL, "names", timeScale) + + doComparisonWithEpi(lexDT = lexDT, lexDTdrop = lexDTdrop, lexDF = lexDF, breaks = BL) + + invisible(NULL) +} + + + + + +splitMultiEpi <- function(data, breaks = list(fot = 0:5), drop) { + + for (k in names(breaks)) { + data <- splitLexis(data, breaks = breaks[[k]], time.scale = k) + } + + forceLexisDT( + data, breaks = attr(data, "breaks"), + allScales = attr(data, "time.scales"), + key = FALSE + ) + if (drop) data <- intelliDrop(data, breaks = breaks) + data +} + + + + + +compareSMWithEpi <- function(data, breaks = list(fot=0:5)) { + requireNamespace("Epi") + requireNamespace("testthat") + + lexDT <- splitMulti(data, breaks = breaks, merge = TRUE, drop = FALSE) + lexDTdrop <- splitMulti(data, breaks = breaks, merge = TRUE, drop = TRUE) + lexDF <- splitMultiEpi(data, breaks = breaks, drop = FALSE) + + doComparisonWithEpi(lexDT=lexDT, lexDTdrop = lexDTdrop, lexDF=lexDF, breaks = breaks) + + invisible(NULL) +} + + + + + +summarize_Lexis <- function(x) { + + lex.Cst <- lex.Xst <- NULL ## appease R CMD CHECK + + dur <- sum(x$lex.dur) + status_vars <- paste0("lex.", c("Cst", "Xst")) + time_scales <- copy(attr(x, "time.scales")) + dt <- copy(setDT( + mget(c(status_vars, "lex.id", time_scales[1]), as.environment(x)) + )) + setkeyv(dt, c("lex.id", time_scales[1])) + dt <- unique(dt, by = c("lex.id"), fromLast = TRUE) + + n <- dt[, .N, keyby = status_vars] + rm("dt") + n[, "transition" := paste0(lex.Cst, "->", lex.Xst)] + n <- cast_simple(data = n, columns = "transition", values = "N") + return(cbind(lex.dur = dur, n)) +} + + + + + +roll_lexis_status_inplace <- function(unsplit.data, split.data, id.var) { + + ## R CMD CHECK appeasement + lex.Cst <- lex.Xst <- NULL + + stopifnot( + is.data.table(split.data), + length(key(split.data)) > 1, + key(split.data)[1] == id.var, + key(split.data)[2] %in% attr(unsplit.data, "time.scales"), + id.var %in% names(unsplit.data), + id.var %in% names(split.data), + uniqueN(unsplit.data[[id.var]]) == nrow(unsplit.data) + ) + + status_vars <- c("lex.Cst", "lex.Xst") + status_ud <- mget_cols(c(id.var, status_vars), unsplit.data) + + join <- structure(list(split.data[[id.var]]), names = id.var) + lex_cst <- status_ud[ + i = join, + j = lex.Cst, + on = id.var + ] + storage.mode(lex_cst) <- storage.mode(split.data[["lex.Cst"]]) + set(split.data, j = status_vars, value = list(lex_cst, lex_cst)) + + wh_last_row <- which(!duplicated(split.data, by = id.var, fromLast = TRUE)) + join <- structure(list(split.data[[id.var]][wh_last_row]), names = id.var) + last_lex_xst <- status_ud[ + i = join, + j = lex.Xst, + on = id.var + ] + storage.mode(last_lex_xst) <- storage.mode(split.data[["lex.Xst"]]) + set(split.data, i = wh_last_row, j = "lex.Xst", value = last_lex_xst) + + + NULL +} + + + + +random_splitting_on <- function( + lex, + n.max.breaks = 20 +) { + stopifnot( + inherits(lex, "Lexis") + ) + + ts_nms <- attr(lex, "time.scales") + brks <- attr(lex, "breaks") + timesince <- attr(lex, "time.since") + lex_vars <- c(paste0("lex.", c("id", "Cst", "Xst", "dur")), ts_nms) + non_lex_vars <- setdiff(names(lex), lex_vars) + lex <- mget_cols(lex_vars, lex) + setattr(lex, "time.scales", ts_nms) + setattr(lex, "breaks", brks) + setattr(lex, "time.since", timesince) + setattr(lex, "class", c("Lexis", "data.table", "data.frame")) + checkLexisData(lex) + + n_split_ts <- sample(seq_along(ts_nms), 1) + split_ts_nms <- sample(ts_nms, size = n_split_ts) + + do_drop <- sample(list(FALSE, TRUE), size = 1)[[1]] + + bl <- lapply(split_ts_nms, function(split_ts_nm) { + r <- c(min(lex[[split_ts_nm]]), max(lex[[split_ts_nm]] + lex[["lex.dur"]])) + d <- diff(r) + + br_r <- if (do_drop) 2:n.max.breaks else 1:n.max.breaks + n_br <- sample(br_r, 1) + + ## allow breaks outside observed data, but at least one break must be + ## not outside range of values in data + extrema <- r + c(-1,1)*d*0.05 + l <- rep(extrema[1], n_br) + u <- rep(extrema[2], n_br) + u[1] <- l[1] <- mean(r) + sort(unique(runif(min = l, max = u, n = n_br))) + }) + names(bl) <- split_ts_nms + + es <- ps <- lex + for (ts_nm in split_ts_nms) { + es <- Epi::splitLexis(es, breaks = bl[[ts_nm]], time.scale = ts_nm) + forceLexisDT(es, breaks = attr(es, "breaks"), allScales = ts_nms, + key = FALSE) + if (do_drop) { + es <- intelliDrop(es, breaks = bl[ts_nm]) + } + ps <- splitLexisDT(ps, breaks = bl[[ts_nm]], timeScale = ts_nm, + drop = do_drop) + } + + psm <- splitMulti(lex, breaks = bl, drop = do_drop) + + list(es = es, ps = ps, psm = psm) +} + + + + + +random_Lexis <- function( + n.rows = c(100, 1000, 2000), + n.time.scales = 1:10, + n.statuses = 2:10, + n.other.vars = 1 +) { + + row_n <- sample(as.list(n.rows), 1)[[1]] + + ts_n <- sample(as.list(n.time.scales), 1)[[1]] + + st_n <- sample(as.list(n.statuses), 1)[[1]] + + dt <- setDT(lapply(1:ts_n, function(i) { + runif(min = 0, max = 1000, n = row_n) + })) + ts_nms <- paste0("lex_ts_", formatC(seq_len(ncol(dt)), flag = "0", width = 3)) + setnames(dt, names(dt), ts_nms) + + dt[, "lex.Cst" := sample(1:st_n, size = .N, replace = TRUE)] + dt[, "lex.Xst" := sample(1:st_n, size = .N, replace = TRUE)] + dt[, "lex.id" := sample(1:.N, .N, replace = FALSE)] + dt[, "lex.dur" := runif(n = .N, min = 0, max = 10)] + + oth_n <- sample(as.list(n.other.vars), 1)[[1]] + lapply(seq_len(oth_n), function(i) { + set( + dt, j = makeTempVarName(names = names(dt), pre = "nonlexvar_"), + value = sample(1:100, size = nrow(dt), replace = TRUE) + ) + }) + + brks <- lapply(ts_nms, function(nm) NULL) + names(brks) <- ts_nms + + forceLexisDT(dt, breaks = brks, allScales = ts_nms, key = TRUE) + checkLexisData(dt, check.breaks = TRUE) + dt[] +} + + + + + +random_splitting_on_random_data <- function( + n.datasets = 100, + n.rows = 1000, + n.time.scales = 1:10, + n.breaks = 10:100, + n.statuses = 1:5, + n.other.vars = 1 +) { + + neql <- vector("list", n.datasets) + + for (i in 1:n.datasets) { + + + set.seed(get_random_seed()) + + drop <- sample(list(TRUE, FALSE), 1)[[1]] + drop <- FALSE + + dt <- random_Lexis( + n.rows = n.rows, + n.time.scales = n.time.scales, + n.statuses = n.statuses, + n.other.vars = n.other.vars + ) + + ts_names <- copy(attr(dt, "time.scales")) + ts_n <- length(ts_names) + + dt_bl <- lapply(ts_names, function(x) NULL) + names(dt_bl) <- ts_names + forceLexisDT(dt, breaks = dt_bl, allScales = ts_names) + + br_n <- unlist(sample(as.list(n.breaks), ts_n)) + names(br_n) <- ts_names + BL <- lapply(ts_names, function(ts_name) { + runif(n = br_n[[ts_name]], min = -100, max = 100) + }) + names(BL) <- ts_names + + BL <- BL[sample(ts_names, ts_n)] + + split_pop <- splitMulti(dt, breaks = BL, drop = drop) + split_epi <- splitMultiEpi(dt, breaks = BL, drop = drop) + + setkeyv(split_epi, c("lex.id", ts_names[1])) + setkeyv(split_pop, c("lex.id", ts_names[1])) + + summary_epi <- summarize_Lexis(split_epi) + summary_pop <- summarize_Lexis(split_pop) + + eq <- all.equal(summary_pop, summary_epi, check.attributes = FALSE) + if (!isTRUE(eq)) { + message("split_epi, split_pop not equal in tick ", i, "") + neql[[i]] <- mget(c( + "drop", "row_n", "ts_n", "ts_names", + "dt", "dt_bl", "br_n", "BL", + "split_epi", "split_pop", "eq", + "summary_epi", "summary_pop", + "used_seed" + )) + } + } + + neql[vapply(neql, is.null, logical(1))] + neql +} + + + + + +do_split <- function(x, ts, all.ts, breaks, drop = TRUE, merge = TRUE) { + + ## unfinished v2 splitlexisDT work horse + stopifnot( + is.integer(x[["lex.id"]]) + ) + + id_dt <- data.table( + "orig" = x[["lex.id"]], + "temp" = 1:nrow(x) + ) + set(x, j = "lex.id", value = id_dt[["temp"]]) + + split <- mget_cols(c(ts, "lex.id", "lex.dur"), x) + + BL <- structure(list(breaks), names = ts) + if (drop) { + split <- intelliCrop(split, breaks = BL, allScales = all.ts, + cropStatuses = TRUE) + split <- intelliDrop(x = split, breaks = BL, + check = FALSE, dropNegDur = TRUE) + } + n_subjects <- nrow(split) + ts_values <- split[[ts]] + + split <- rbindlist(lapply(1:length(breaks), function(i) get("split"))) + + tmp_ie_nm <- makeTempVarName(names = names(x), pre = "do_split_tmp_ie_") + + set(split, j = tmp_ie_nm, value = rep(breaks, each = n_subjects)) + set(split, j = tmp_ie_nm, value = { + pmin(split[[tmp_ie_nm]], split[[ts]] + split[["lex.dur"]]) + }) + set(split, j = ts, value = c( + ts_values, + pmax(ts_values, rep(breaks[-length(breaks)], each = n_subjects)) + )) + +} + + + + + diff --git a/R/startup_message.R b/R/startup_message.R index 9e67375..d01b233 100644 --- a/R/startup_message.R +++ b/R/startup_message.R @@ -1,32 +1,53 @@ - -.onAttach <- function(...) { - if (interactive()) { - msg <- paste0("Using popEpi. See news(package='popEpi') for changes. \n", - " popEpi's appropriate data outputs are in data.table ", - "(enhanced data.frame) format \n", - " by default; see ?popEpi for changing this. \n", - " *** IMPORTANT FIXES \n", - " - there was an error in survtab's adjusted outputs ", - "in versions <= 0.4.1 \n", - " leading to inflated confidence intervals; see ", - "news(package='popEpi')\n", - " - splitMulti/splitLexisDT pre-0.4.4 sometimes ", - "produced duplicated transitions\n", - " when splitting along multiple ", - "time scales; see news(package='popEpi')") - packageStartupMessage(msg) - } -} - -.onLoad <- function(...) { - opt <- getOption("popEpi.datatable") - if (!is.null(opt) && is.logical(opt) && !isTRUE(opt)) { - warning("Option 'popEpi.datatable' was set to TRUE when loading popEpi.", - call. = FALSE) - } - options("popEpi.datatable" = TRUE) - -} - - - + + + + + +.onAttach <- function(...) { + + if (interactive()) { + msg <- paste0("Using popEpi. See news(package='popEpi') for changes. \n", + " popEpi's appropriate data outputs are in data.table ", + "(enhanced data.frame) format \n", + " by default; see ?popEpi for changing this. \n", + " *** IMPORTANT FIXES \n", + " - there was an error in survtab's adjusted outputs ", + "in versions <= 0.4.1 \n", + " leading to inflated confidence intervals; see ", + "news(package='popEpi')\n", + " - splitMulti/splitLexisDT pre-0.4.4 sometimes ", + "produced duplicated transitions\n", + " when splitting along multiple ", + "time scales; see news(package='popEpi')") + packageStartupMessage(msg) + } + + using_r_devel <- grepl(pattern = "devel", x = R.version$status) + if (using_r_devel) { + ## memory leak problem in data.table 1.11.2 in R-devel (3.6.0 atm) + requireNamespace("data.table") + data.table::setDTthreads(threads = 1L) + } +} + + + + + +.onLoad <- function(...) { + opt <- getOption("popEpi.datatable") + if (!is.null(opt) && is.logical(opt) && !isTRUE(opt)) { + warning("Option 'popEpi.datatable' was set to TRUE when loading popEpi.", + call. = FALSE) + } + options("popEpi.datatable" = TRUE) + +} + + + + + + + + diff --git a/R/survival_aggregated.R b/R/survival_aggregated.R index c12821c..dbc833d 100644 --- a/R/survival_aggregated.R +++ b/R/survival_aggregated.R @@ -1,890 +1,890 @@ -#' @template survival_doc_template -#' @param formula a \code{formula}; the response -#' must be the time scale to compute survival time function estimates -#' over, e.g. \code{fot ~ sex}. Variables on the right-hand side of the formula -#' separated by \code{+} are considered stratifying variables, for which -#' estimates are computed separately. May contain usage of \code{adjust()} -#' --- see Details and Examples. -#' @param data since popEpi 0.4.0, a \code{data.frame} -#' containing variables used in \code{formula} and other arguments. -#' \code{aggre} objects are recommended as they contain information on any -#' time scales and are therefore safer; for creating \code{aggre} objects see -#' \code{\link{as.aggre}} when your data is already aggregated and \code{aggre} -#' for aggregating split \code{Lexis} objects. -#' -#' @param surv.breaks a vector of breaks on the -#' survival time scale. Optional if \code{data} is an \code{aggre} object -#' and mandatory otherwise. Must define each intended interval; -#' e.g. \code{surv.breaks = 0:5} when data has intervals defined by -#' breaks \code{seq(0, 5, 1/12)} will aggregate to wider intervals first. -#' It is generally recommended (and sufficient; -#' see Seppa, Dyban and Hakulinen (2015)) to use monthly -#' intervals where applicable. -#' -#' @param n variable containing counts of subjects at-risk at the start of a -#' time interval; e.g. \code{n = "at.risk"}. -#' Required when \code{surv.method = "lifetable"}. -#' \link[=flexible_argument]{Flexible input}. -#' -#' @param d variable(s) containing counts of subjects experiencing an event. -#' With only one type of event, e.g. \code{d = "deaths"}. With multiple types of -#' events (for CIF or cause-specific survival estimation), supply e.g. -#' \code{d = c("canD", "othD")}. If the survival time function to be estimated -#' does not use multiple types of events, supplying more than one variable -#' to \code{d} simply causes the variables to be added together. -#' Always required. \link[=flexible_argument]{Flexible input}. -#' -#' @param n.cens variable containing counts of subjects censored during a -#' survival time interval; E.g. \code{n.cens = "alive"}. -#' Required when \code{surv.method = "lifetable"}. -#' \link[=flexible_argument]{Flexible input}. - -#' @param pyrs variable containing total subject-time accumulated within a -#' survival time interval; E.g. \code{pyrs = "pyrs"}. -#' Required when \code{surv.method = "hazard"}. Flexible input. - -#' @param d.exp variable denoting total "expected numbers of events" -#' (typically computed \code{pyrs * pop.haz}, where -#' \code{pop.haz} is the expected hazard level) -#' accumulated within a survival time interval; E.g. \code{pyrs = "pyrs"}. -#' Required when computing EdererII relative survivals or -#' CIFs based on excess counts of events. Flexible input. - -#' @param n.pp variable containing total Pohar-Perme weighted counts of -#' subjects at risk in an interval, -#' supplied as argument \code{n} is supplied. -#' Computed originally on the subject -#' level as analogous to \code{pp * as.integer(status == "at-risk")}. -#' Required when \code{relsurv.method = "pp"}. Flexible input. -#' -#' @param d.pp variable(s) containing Pohar-Perme weighted counts of events, -#' supplied as argument \code{d} is supplied. Computed originally on the subject -#' level as analogous to \code{pp * as.integer(status == some_event)}. -#' Required when \code{relsurv.method = "pp"}. Flexible input. - -#' @param d.pp.2 variable(s) containing total Pohar-Perme -#' "double-weighted" counts of events, -#' supplied as argument \code{d} is supplied. Computed originally on the subject -#' level as analogous to \code{pp * pp * as.integer(status == some_event)}. -#' Required when \code{relsurv.method = "pp"}. Flexible input. - -#' @param n.cens.pp variable containing total Pohar-Perme weighted counts -#' censorings, -#' supplied as argument \code{n.cens} is supplied. -#' Computed originally on the subject -#' level as analogous to \code{pp * as.integer(status == "censored")}. -#' Required when \code{relsurv.method = "pp"}. Flexible input. - -#' @param pyrs.pp variable containing total Pohar-Perme weighted subject-times, -#' supplied as argument \code{pyrs} is supplied. -#' Computed originally on the subject -#' level as analogous to \code{pp * pyrs}. -#' Required when \code{relsurv.method = "pp"}. Flexible input. - -#' @param d.exp.pp variable containing total Pohar-Perme weighted counts -#' of excess events, -#' supplied as argument \code{pyrs} is supplied. -#' Computed originally on the subject -#' level as analogous to \code{pp * d.exp}. -#' Required when \code{relsurv.method = "pp"}. Flexible input. -#' -#' -#' @section Data requirements: -#' -#' \code{survtab_ag} computes estimates of survival time functions using -#' pre-aggregated data. For using subject-level data directly, use -#' \code{\link{survtab}}. For aggregating data, see \code{\link{lexpand}} -#' and \code{\link{aggre}}. -#' -#' By default, and if data is an \code{aggre} object (not mandatory), -#' \code{survtab_ag} makes use of the exact same breaks that were used in -#' splitting the original data (with e.g. \code{lexpand}), so it is not -#' necessary to specify any \code{surv.breaks}. If specified, the -#' \code{surv.breaks} must be a subset of the pertinent -#' pre-existing breaks. When data is not an \code{aggre} object, breaks -#' must always be specified. Interval lengths (\code{delta} in output) are -#' also calculated based on whichever breaks are used, -#' so the upper limit of the breaks should -#' therefore be meaningful and never e.g. \code{Inf}. -#' -#' -#' @examples -#' ## see more examples with explanations in vignette("survtab_examples") -#' -#' #### survtab_ag usage -#' -#' data("sire", package = "popEpi") -#' ## prepare data for e.g. 5-year "period analysis" for 2008-2012 -#' ## note: sire is a simulated cohort integrated into popEpi. -#' BL <- list(fot=seq(0, 5, by = 1/12), -#' per = c("2008-01-01", "2013-01-01")) -#' x <- lexpand(sire, birth = bi_date, entry = dg_date, exit = ex_date, -#' status = status %in% 1:2, -#' breaks = BL, -#' pophaz = popmort, -#' aggre = list(fot)) -#' -#' ## calculate relative EdererII period method -#' ## NOTE: x is an aggre object here, so surv.breaks are deduced -#' ## automatically -#' st <- survtab_ag(fot ~ 1, data = x) -#' -#' summary(st, t = 1:5) ## annual estimates -#' summary(st, q = list(r.e2 = 0.75)) ## 1st interval where r.e2 < 0.75 at end -#' \dontrun{ -#' plot(st) -#' -#' -#' ## non-aggre data: first call to survtab_ag would fail -#' df <- data.frame(x) -#' # st <- survtab_ag(fot ~ 1, data = x) -#' st <- survtab_ag(fot ~ 1, data = x, surv.breaks = BL$fot) -#' -#' ## calculate age-standardised 5-year relative survival ratio using -#' ## Ederer II method and period approach -#' -#' sire$agegr <- cut(sire$dg_age,c(0,45,55,65,75,Inf),right=F) -#' BL <- list(fot=seq(0, 5, by = 1/12), -#' per = c("2008-01-01", "2013-01-01")) -#' x <- lexpand(sire, birth = bi_date, entry = dg_date, exit = ex_date, -#' status = status %in% 1:2, -#' breaks = BL, -#' pophaz = popmort, -#' aggre = list(agegr, fot)) -#' -#' ## age standardisation using internal weights (age distribution of -#' ## patients diagnosed within the period window) -#' ## (NOTE: what is done here is equivalent to using weights = "internal") -#' w <- aggregate(at.risk ~ agegr, data = x[x$fot == 0], FUN = sum) -#' names(w) <- c("agegr", "weights") -#' -#' st <- survtab_ag(fot ~ adjust(agegr), data = x, weights = w) -#' plot(st, y = "r.e2.as", col = c("blue")) -#' -#' ## age standardisation using ICSS1 weights -#' data(ICSS) -#' cut <- c(0, 45, 55, 65, 75, Inf) -#' agegr <- cut(ICSS$age, cut, right = FALSE) -#' w <- aggregate(ICSS1~agegr, data = ICSS, FUN = sum) -#' names(w) <- c("agegr", "weights") -#' -#' st <- survtab_ag(fot ~ adjust(agegr), data = x, weights = w) -#' lines(st, y = "r.e2.as", col = c("red")) -#' -#' -#' ## cause-specific survival -#' sire$stat <- factor(sire$status, 0:2, c("alive", "canD", "othD")) -#' x <- lexpand(sire, birth = bi_date, entry = dg_date, exit = ex_date, -#' status = stat, -#' breaks = BL, -#' pophaz = popmort, -#' aggre = list(agegr, fot)) -#' st <- survtab_ag(fot ~ adjust(agegr), data = x, weights = w, -#' d = c("fromalivetocanD", "fromalivetoothD"), -#' surv.type = "surv.cause") -#' plot(st, y = "surv.obs.fromalivetocanD.as") -#' lines(st, y = "surv.obs.fromalivetoothD.as", col = "red") -#' -#' -#' } -#' @export -survtab_ag <- function(formula = NULL, - - data, - - adjust = NULL, - weights = NULL, - - surv.breaks = NULL, - - n = "at.risk", - d = "from0to1", - n.cens = "from0to0", - pyrs = "pyrs", - d.exp = "d.exp", - - n.pp = NULL, - d.pp = "d.pp", - d.pp.2 = "d.pp.2", - n.cens.pp = "n.cens.pp", - pyrs.pp = "pyrs.pp", - d.exp.pp = "d.exp.pp", - - surv.type="surv.rel", - surv.method="hazard", - relsurv.method="e2", - - subset = NULL, - - conf.level = 0.95, - conf.type = "log-log", - - verbose=FALSE) { - - if (verbose) starttime <- proc.time() - - Tstop <- delta <- Tstart <- surv.int <- n.eff <- n.eff.pp <- surv.obs <- - lag1_surv.obs <- p.obs <- CIF.rel <- NULL ## APPEASE R CMD CHECK - - TF <- environment() - PF <- parent.frame(1L) - - this_call <- match.call() - used_args <- as.list(this_call)[-1L] - fl <- formals("survtab_ag") - used_args <- c(used_args, fl[!names(fl) %in% names(used_args)]) - used_args <- used_args[names(fl)] - rm(fl) - - attrs <- copy(attributes(data)) - - # check data ----------------------------------------------------------------- - if (missing(data) || nrow(data) == 0) stop("data missing or has no rows") - - # check arguments ------------------------------------------------------------ - - surv.type <- match.arg(surv.type, c("surv.obs","surv.rel","surv.cause", "cif.obs", "cif.rel")) - surv.method <- match.arg(surv.method, c("lifetable","hazard")) - relsurv.method <- match.arg(relsurv.method, c("e2", "pp", "EdererII", "Pohar-Perme", "pohar-perme", "edererII", "ederer2")) - if (relsurv.method %in% c("EdererII", "edererII", "ederer2")) relsurv.method <- "e2" - if (relsurv.method %in% c("Pohar-Perme", "pohar-perme")) relsurv.method <- "pp" - relsurv.method <- match.arg(relsurv.method, c("e2", "pp")) - conf.type <- match.arg(conf.type, c("log","log-log","plain")) - - - ## argument 'formula' pre-check ---------------------------------------------- - if (!(inherits(formula, "formula") && length(formula) == 3L)) { - stop("Argument 'formula' does not appear to be a two-sided formula. ", - "Usage: e.g. fot ~ sex") - } - surv.scale <- deparse(formula[[2]]) - if (!surv.scale %in% names(data)) { - stop("Left-hand-side of formula must be a column in data; e.g. ", - "fot ~ sex, where 'fot' is the name of a column in data.") - } - - ## check breaks -------------------------------------------------------------- - - surv.breaks <- select_breaks(data = data, ts = surv.scale, br = surv.breaks) - surv.breaks <- sort(unique(surv.breaks)) - # if (!breaks_in_data(surv.breaks, surv.scale, data)) { - # stop("Used breaks do not all appear to exist in data. Make sure the ", - # "breaks match to the values that your time scale variable has in the ", - # "data.") - # } - - # data prep & subsetting ----------------------------------------------------- - subset <- substitute(subset) - subset <- evalLogicalSubset(data, subset) - - origData <- data - - data <- data[subset, ] - setDT(data) - - # handle count etc. variables ------------------------------------------------ - - valVars <- c("d") - valVars <- c(valVars, if (surv.method == "hazard") "pyrs" else c("n", "n.cens")) - - valVars <- c(valVars, if (surv.type == "surv.rel" && relsurv.method == "e2") "d.exp" else NULL) - - valVars <- c(valVars, if (surv.type == "cif.rel") "d.exp" else NULL) - - ppVars <- c("d.pp", "d.exp.pp", "d.pp.2", - if (surv.method == "hazard") "pyrs.pp" else c("n.cens.pp", "n.pp")) - valVars <- c(valVars, if (surv.type == "surv.rel" && relsurv.method == "pp") ppVars else NULL) - - fo <- formals("survtab_ag") - mc <- as.list(match.call())[-1] - mc <- c(mc, fo[!names(fo) %in% names(mc)]) - - mc <- mc[valVars] - - - mc <- lapply(mc, function(elem) { - evalPopArg(data = data, arg = elem, DT = TRUE, enclos = PF, recursive = TRUE) - }) - - - ## if given as multiple vars, combine these into one (e.g. n = c("v1", "v2")) - combValVars <- c("n", "n.cens", "d.exp", "d.pp", "d.exp.pp", "d.pp.2", - "pyrs", "pyrs.pp", "n.cens.pp", "n.pp") - combValVars <- intersect(names(mc), combValVars) - mc[combValVars] <- lapply(combValVars, function(val_var) { - tab <- mc[[val_var]] - if (!is.data.frame(tab) || length(tab) == 1L) return(tab) - tab_na <- names(tab) - e <- paste0(tab_na, collapse = " + ") - e <- parse(text = e) - tab <- data.table(V1 = tab[, eval(e)]) - setnames(tab, "V1", val_var) - tab - }) - - ## NOTE: this does not delete but sets the value to NULL. - mc[unlist(lapply(mc, function(x) { - NROW(x) == 0L || is.null(x) || is.language(x) || inherits(x, "try-error") - }))] <- NULL - - lackVars <- setdiff(valVars, names(mc[!unlist(lapply(mc, is.null))])) - if (length(lackVars) > 0) { - stop("Following arguments were NULL or could not be evaluated but are ", - "required: ", paste0("'", lackVars, "'", collapse = ", "), ". ", - "Usual suspects: arguments are NULL or refer to variables that ", - "cannot be found in data.") - } - - eventVars <- NULL - mc[[1]] <- data.table(mc[[1L]]) ## this avoids an exotic error in set(). - nl <- lapply(mc, names) - for (k in 1:length(mc)) { - jay <- argName <- names(mc[k]) - cn <- names(mc[[k]]) - - if (length(cn) > 1) jay <- paste0(jay, ".", cn) ## e.g. d.1, d.2, ... - if (argName %in% c("d")) { - eventVars <- jay - if (surv.type %in% c("surv.cause") && length(cn) == 1L) { - stop("surv.type = 'surv.cause', but only one type of event supplied ", - "via argument 'd'. If you want to compute cause-specific ", - "survivals, please supply multiple types of events via ", - "'d'; otherwise use surv.type = 'surv.obs'") - } else if (length(cn) > 1 && !argName %in% c("d","d.pp", "d.pp.2", "n.pp")) { - stop("'", argName, "' has/evaluates to ", length(cn), - " columns; only 'd', 'd.pp', and 'd'pp.2', 'n.pp' may evaluate ", - "to more than one column of the value arguments") - } - - } - - setnames(mc[[k]], cn, jay) - set(mc[[1]], j = jay, value = mc[[k]]) - nl[[argName]] <- jay - } - mc <- mc[[1]] - - if (!is.null(eventVars)) { - set(mc, j = "d", value = rowSums(mc[, mget(eventVars)])) - valVars <- unique(c(valVars, "d", eventVars)) - } - - - all_names_present(mc, valVars, - msg = paste0("Expected internal temp data to have ", - "variables %%VARS%% at this point, but didn't", - ". This is most likely a bug and should be ", - "reported to pkg maintainer.")) - setcolorder(mc, valVars) - - ## addition: internal weights use n at beginning of first interval - - if (is.character(weights)) { - checkWeights(weights) - if (!"n" %in% valVars) { - n <- substitute(n) - mc$n <- evalPopArg(n, data = data, enclos = PF) - - valVars <- unique(c(valVars, "n")) - - if (is.null(mc$n)) { - - stop("Requested internal weights to be computed and used to standardize ", - "estimates, but argument 'n' not supplied. This is currently ", - "required for computing internal weights (the values of 'n' ", - "in the first interval will be used for this). Please supply 'n' ", - "or supply hand-made weights (preferred for your clarity).") - } - } - - data[, c("n") := mc$n] - - } - - - # making weighted table of aggregated values --------------------------------- - ## NOTE: at-risk counts require special treatment when surv.breaks - ## are a subset of the available breaks: cannot sum at-risk figures! - ## instead should simply pick the value at the start of the - ## (now larger) interval. Will accomplish this by setting values not - ## at the start of an interval to zero and summing anyway. - if (surv.method == "lifetable") { - wh_internal <- list(surv.breaks) - names(wh_internal) <- surv.scale - wh_internal <- data[wh_internal, on = eval(surv.scale), which = TRUE] - wh_internal <- setdiff(1:nrow(data), wh_internal) - mc[wh_internal, intersect(c("n", "n.pp"), names(mc)) := 0L] - } - - ## NOTE: while ssSub will pass the whole column of e.g. fot values, which will - ## not limit the data to e.g. up 5 years of follow-up if original data went - ## further, surv.breaks may be only up to 5 years and will limit the data - ## in makeWeightsDT using a CJ-merge-trick appropriately (via custom.levels). - bl <- list(surv.breaks) - setattr(bl, "names", surv.scale) - - adjust <- evalPopArg(data, adjust, enclos = PF, naming = "model") - - iws <- NULL - if (is.character(weights) && pmatch(weights, c("internal", "cohort"), 0)) { - if (!"n" %in% names(data)) { - stop("Need 'n' specified for when using internal weights: Internal ", - "weights are computed as the counts of subjects at the start of ", - "follow-up.") - } - iws <- makeTempVarName(data, pre = "internal_weights_") - data[, c(iws) := 0.0] - data[data[[surv.scale]] == surv.breaks[1], c(iws) := n] - } - - data <- makeWeightsDT(data = data, values = list(mc), enclos = PF, - print = NULL, formula = formula, adjust = adjust, - by.other = surv.scale, Surv.response = FALSE, - custom.levels = bl, weights = weights, - internal.weights.values = iws, - custom.levels.cut.low = surv.scale) - - allVars <- attr(data, "makeWeightsDT") - allVars[] <- lapply(allVars, function(x) if (length(x) == 0L) NULL else x) - prVars <- allVars$prVars - adVars <- allVars$adVars - # boVars <- allVars$boVars ## this is surv.scale - valVars <- allVars$vaVars - - ## to avoid e.g. 'factor(sex, 1:2)' going bonkers - prVars_orig <- prVars - if (length(prVars) > 0L) { - prVars <- makeTempVarName(names = c(names(data), adVars), - pre = paste0("print_", 1:length(prVars))) - } - adVars_orig <- adVars - if (length(adVars) > 0L) { - adVars <- makeTempVarName(names = c(names(data), prVars), - pre = paste0("print_", 1:length(adVars))) - } - if (length(c(prVars, adVars))) setnames(data, c(prVars_orig, adVars_orig), c(prVars, adVars)) - byVars <- c(prVars, adVars) - - # formulate some needed variables -------------------------------------------- - setkeyv(data, c(byVars, surv.scale)) - data[, "Tstop" := surv.breaks[-1]] - setnames(data, surv.scale, "Tstart") - data[, "delta" := Tstop - Tstart] - data[, "surv.int" := 1:.N, by = eval(byVars)] - setcolorder(data, c(byVars, "surv.int", "Tstart", "Tstop", "delta", valVars, intersect(names(data), "weights"))) - - if (surv.method == "lifetable") { - testEvents <- data[, n - shift(n, n = 1, type = "lead", fill = NA), by = eval(byVars)]$V1 - testEvents <- data$n.cens + data$d - testEvents - - if (sum(abs(testEvents), na.rm = TRUE)) { - on.exit({ - - data[, "n.cens + d - (n-lead1_n)" := testEvents] - wh <- testEvents != 0L - wh <- wh & !is.na(wh) - if (interactive()) { - printSD <- c(byVars, "Tstop", "d", "n", "n.cens", - "n.cens + d - (n-lead1_n)") - print(data[wh, .SD, .SDcols = printSD], top = 5, nrow = 10) - - } - - }, add = TRUE) - - stop("Supplied n.cens and d do not sum to total number of events and ", - "censorings based on n alone. Note that lifetable analysis ", - "is currently not supported for period analysis (or other ", - "comparable limitations of data).", - if (interactive())" See table below and check your variables.") - } - rm(testEvents) - data[, "n.eff" := n - n.cens/2L] - } - - - # compute observed survivals ------------------------------------------------ - if (verbose) ostime <- proc.time() - - if (surv.method=="lifetable") { - comp.st.surv.obs.lif(surv.table = data, surv.by.vars = byVars) - } - if (surv.method=="hazard") { - comp.st.surv.obs.haz(surv.table = data, surv.by.vars = byVars) - } - - data <- comp.st.conf.ints(data, al=1-conf.level, surv="surv.obs", transform = conf.type) - - if (verbose) cat("Time taken by computing observed survivals:", timetaken(ostime), "\n") - - - ## empty surv.int checking --------------------------------------------------- - testVar <- if (surv.method == "lifetable") "n" else "pyrs" - ## sum over adjusting variables - data <- test_empty_surv_ints(data, by = c(prVars, adVars), - show.by = c(prVars_orig, adVars_orig), - sum.over = adVars, - test.var = testVar) - - ## sum over nothing - if (length(adVars) > 0L) { - data <- test_empty_surv_ints(data, by = c(prVars, adVars), - show.by = c(prVars_orig, adVars_orig), - sum.over = NULL, test.var = testVar) - } - - ## if adjusting, crop all estimates by adjusting variables - ## to shortest estimate - if (length(adVars)) { - adLe <- data[, list(min = min(surv.int), max = max(surv.int)), keyby = eval(adVars)] - adLe <- c(max(adLe$min), min(adLe$max)) - data <- data[surv.int %in% `:`(adLe[1L], adLe[2L])] - } - - # create and print table of bad surv.ints ------------------------------------ - - badObsSurv <- data$surv.obs == 0 | is.na(data$surv.obs) - if (sum(badObsSurv)) { - - zerotab <- data[badObsSurv, - list(first.bad.surv.int = min(as.integer(surv.int)), - last.bad.surv.int = max(as.integer(surv.int)), - surv.obs=min(surv.obs)), keyby = eval(byVars)] - - - message("Some cumulative surv.obs were zero or NA:") - if (length(byVars)) setnames(zerotab, c(prVars, adVars), c(prVars_orig, adVars_orig)) - print(zerotab) - if (surv.method == "lifetable" && data[surv.obs == 0, .N] > 0) { - message("NOTE: Zero surv.obs leads to zero relative survivals as well. Adjusting with weights WILL use the zero surv.obs / relative survival values.") - } - - } - rm(badObsSurv) - - # compute cause-specific survivals ------------------------------------------ - if (surv.type == "surv.cause") { - - ## NOTE: these related to adjusting life-table estimates for delayed entry... - # data[, "n.eff" := n - n.cens/2 + n.de/2 + n.de.cens/4] # + d.de/2 - # "n.cens_1" := n.cens + (d-d_1) - # "n.de.cens" := n.de.cens + (d.de - d.de_1) - - if (surv.method == "lifetable") { - for (k in eventVars) { - k <- gsub(pattern = "d_", replacement = "", x = k) - d_k <- paste0("d_", k) - # d.de_k <- paste0("d.de_",k) - - n.eff_k <- paste0("n.eff_",k) - - ## old: " := n - (n.cens + (d-", d_k,")/2 + n.de/2 + (n.de.cens + d.de - ", d.de_k,")/4 )" - # expr <- paste0(n.eff_k, " := n - (n.cens + (d-", d_k,")/2 )") - - set(data, j = c(n.eff_k), value = data$n.eff + (data$d - data[[d_k]])/2L ) # + d.de/2 - # data[, eval(parse(text = expr), envir = .SD)] - - } - - } - - surv_names <- names(data)[grep("surv.obs", names(data))] - surv_names <- c("d", if (surv.method == "lifetable") "n.eff" else NULL, surv_names) - setnames(data, surv_names, paste0(surv_names, ".orig")) - - for (k in eventVars) { - - k <- gsub(pattern = "d.", replacement = "", x = k) - setnames(data, paste0("d.",k), "d") - - if (surv.method=="hazard") { - comp.st.surv.obs.haz(surv.table = data, surv.by.vars = byVars) - } else { - setnames(data, paste0("n.eff_", k), "n.eff") - comp.st.surv.obs.lif(surv.table = data, surv.by.vars = byVars) - } - os.table <- comp.st.conf.ints(data, al=1-conf.level, surv="surv.obs", transform = conf.type) - - new_surv_names <- setdiff(surv_names, c("d", if (surv.method == "lifetable") "n.eff" else NULL)) - new_surv_names <- gsub("surv.obs", paste0("surv.obs.", k), new_surv_names) - new_surv_names <- c(paste0(c("d.", if (surv.method == "lifetable") "n.eff." else NULL), k), new_surv_names) - setnames(data, surv_names, new_surv_names) - - - } - setnames(data, paste0(surv_names, ".orig"), surv_names) - } - - # compute cause-specific/excess-case CIFs ------------------------------------ - if (surv.type %in% c("cif.obs", "cif.rel")) { - - data[, "lag1_surv.obs" := shift(surv.obs, n = 1L, type = "lag", fill = 1), by = eval(byVars)] - data[, "p.obs" := surv.obs/lag1_surv.obs] - - if (surv.type == "cif.obs") { - for (k in eventVars) { - - k <- gsub("d.", "", x = k) - d.k <- paste0("d.", k) - - d.var <- paste0("d.",k) - q.var <- paste0("q.", k) - CIF_var <- paste0("CIF_", k) - data[, (q.var) := (1-p.obs)*get(d.var)/d] - data[get(d.var) == 0L | d == 0L, (q.var) := 0] - data[, (CIF_var) := cumsum(lag1_surv.obs*get(q.var)), by = eval(byVars)] - } - } - - if (surv.type == "cif.rel") { - ## assuming d.exp in data - data[, "CIF.rel" := (1-p.obs)*(d-d.exp)/d] - data[d.exp>d, "CIF.rel" := NA] - data[, "CIF.rel" := cumsum(lag1_surv.obs*CIF.rel), by = eval(byVars)] - } - - ## SEs currently not known for CIFs; impute 0 to make adjusting work - CIF_vars <- names(data)[substr(names(data),1,3) == "CIF"] - data[, c(paste0("SE.", CIF_vars)) := 0L] - - setcolsnull(data, c("lag1_surv.obs", "p.obs", paste0("q.", substr(eventVars, 3, nchar(eventVars))))) - - } - - - # relative survivals --------------------------------------------------------- - if (surv.type == "surv.rel" & relsurv.method == "e2") { - - # compute r.e2 ------------------------------------------------------------- - comp.st.rs <- function(rs.table, rs.by.vars = byVars) { - - p.exp <- delta <- surv.exp <- surv.obs <- n.eff.pp <- - surv.obs <- NULL ## APPEASE R CMD CHECK - ## EdererII - - ##------------- - if (surv.method == "hazard") { - rs.table[, "p.exp" := exp(-delta*d.exp/pyrs)] - rs.table[, "surv.exp" := cumprod(p.exp), by = eval(rs.by.vars)] - comp.st.r.e2.haz(surv.table = rs.table, surv.by.vars = rs.by.vars) - } else { - rs.table[, "p.exp" := 1 - d.exp/n] - rs.table[, "surv.exp" := cumprod(p.exp), by = eval(rs.by.vars)] - - if (rs.table[, min(surv.obs, na.rm=T) == 0]) { - rs.table[surv.obs == 0, "surv.exp" := 1] - } - - comp.st.r.e2.lif(surv.table = rs.table, surv.by.vars = rs.by.vars) - - if (rs.table[, min(surv.obs, na.rm=T) == 0]) { - rs.table[surv.obs == 0, intersect(c("surv.exp","r.e2","SE.r.e2","r.e2.lo","r.e2.hi"), names(rs.table)) := 0] - } - } - - ## ------------ - - rs.table <- comp.st.conf.ints(rs.table, al=1-conf.level, surv="r.e2", transform = conf.type) - - return(rs.table) - } - - data <- comp.st.rs(rs.table = data) - - - } - - # compute r.pp --------------------------------------------------------------- - if (surv.type == "surv.rel" & relsurv.method == "pp") { - - all_names_present(data, c("d.pp", "d.exp.pp", "d.pp.2")) - ## pohar perme: analysis weighted by expected cumulative survival - comp.st.pp <- function(pp.table, by.vars = byVars) { - ## relative survival - if (surv.method == "hazard") { - all_names_present(data, c("pyrs.pp"), - msg = paste0("internal error: work data did not have", - " variable named pyrs.pp. Complain ", - "to package maintainer if you see this.")) - comp.st.r.pp.haz(surv.table = pp.table, surv.by.vars = by.vars) - } else { - data[, "n.eff.pp" := n.pp - 0.5*n.cens.pp] - all_names_present(data, c("n.pp", "n.cens.pp", "n.eff.pp"), - msg = paste0("internal error: work data did not have", - " variable named n.eff.pp. Complain ", - "to package maintainer if you see this.")) - comp.st.r.pp.lif(surv.table = pp.table, surv.by.vars = by.vars) - - if (pp.table[, min(surv.obs, na.rm=T) == 0]) { - pp.table[surv.obs == 0, intersect(c("r.pp","SE.r.pp","r.pp.lo","r.pp.hi"), names(pp.table)) := 0] - } - } - - pp.table <- comp.st.conf.ints(pp.table, al=1-conf.level, surv="r.pp", transform = conf.type ) - - return(pp.table) - } - data <- comp.st.pp(pp.table = data) - } - - # compute adjusted estimates ------------------------------------------------- - if ("weights" %in% names(data)) { - w_est_vars <- names(data)[substr(names(data), 1, 8) == "surv.obs"] - w_est_vars <- c(w_est_vars, "r.e2", "r.pp") - w_est_vars <- c(w_est_vars, names(data)[substr(names(data),1,3)=="CIF"]) - w_est_vars <- intersect(w_est_vars, names(data)) - w_est_vars <- w_est_vars[unlist(lapply(w_est_vars, function(x) { - !substr(x, nchar(x)-2L, nchar(x)) %in% c(".lo", ".hi") - }))] - w_se_vars <- paste0("SE.", w_est_vars) - - w_est <- data[, lapply(.SD, function(x) sum(x*weights)), - keyby = c(prVars, "surv.int"), .SDcols = w_est_vars] - w_se <- data[, lapply(.SD, function(x) sqrt(sum((x^2)*(weights^2)))), - keyby = c(prVars, "surv.int"), .SDcols = w_se_vars] - - data <- data[, lapply(mget(valVars), sum), - keyby = c(prVars, "surv.int", "Tstart", "Tstop", "delta")] - set(data, j = w_se_vars, value = mget(w_se_vars, as.environment(w_se))) - set(data, j = w_est_vars, value = mget(w_est_vars, as.environment(w_est))) - - setnames(data, old = c(w_est_vars, w_se_vars), - new = paste0(c(w_est_vars, w_se_vars), ".as")) - - for (var in paste0(w_est_vars, ".as")) { - data <- comp.st.conf.ints(data, al = 1-conf.level, - surv = var, transform = conf.type) - } - - - } - - # clean-up ------------------------------------------------------------------- - ## back to original names of print / adjust (used to avoid e.g. - ## 'factor(V1, 1:2)' going bonkers in data.table) - if (length(c(prVars))) setnames(data, c(prVars), c(prVars_orig)) - prVars <- prVars_orig - adVars <- adVars_orig - - ## reorder table, format numeric values, etc. - - miscVars <- intersect(names(data), c("surv.int", "Tstart", "Tstop", "delta")) - - survVars <- c("surv.obs.lo","surv.obs","surv.obs.hi","SE.surv.obs", - "r.e2.lo","r.e2","r.e2.hi","SE.r.e2", - "r.pp.lo","r.pp","r.pp.hi","SE.r.pp", - paste0("CIF.rel.", c("lo", "", "hi")), "SE.CIF.rel", - "surv.obs.as.lo","surv.obs.as","surv.obs.as.hi","SE.surv.obs.as", - "r.e2.as.lo","r.e2.as","r.e2.as.hi","SE.r.e2.as", - "r.pp.as.lo","r.pp.as","r.pp.as.hi","SE.r.pp.as", - paste0("CIF.rel.as.", c("lo", "", "hi")), "SE.CIF.rel.as" - ) - survVars <- intersect(survVars, names(data)) - - ## which variables are estimates, SEs, CIs, etc. - survVars.ca <- setdiff(names(data), c(prVars, valVars, miscVars, survVars)) - CIF_vars <- survVars.ca[substr(survVars.ca, 1,3)=="CIF" | substr(survVars.ca, 1,6)=="SE.CIF"] - survVars <- c(survVars, CIF_vars) - - surv.obs.vars <- survVars.ca[substr(survVars.ca, 1,8) == "surv.obs" | substr(survVars.ca, 1,11) == "SE.surv.obs"] - survVars <- c(survVars, surv.obs.vars) - - survVars <- unique(intersect(survVars, names(data))) - - ## remove some unuseful variables - setcolsnull(data, c("SE.A", "SE.B")) - setcolsnull(data, survVars[substr(survVars, 1, 6) == "SE.CIF"]) ## since they are zero for now - survVars <- intersect(survVars, names(data)) - - SEVars <- survVars[substr(survVars, 1, 3) == "SE."] - CIVars <- survVars[substr(survVars, nchar(survVars) - 2L, nchar(survVars)) %in% c(".lo", ".hi")] - estVars <- setdiff(survVars, c(SEVars, CIVars)) - - order <- unique(c(prVars, miscVars, valVars, survVars)) - order <- intersect(order, names(data)) - - setcolsnull(data, setdiff(names(data), order)) - setcolorder(data,order) - - setkeyv(data, c(prVars, "surv.int")) - - # attributes ----------------------------------------------------------------- - setkeyv(data, c(prVars, "surv.int")) - setattr(data, "class", c("survtab", "data.table", "data.frame")) - if (!return_DT()) setDFpe(data) - if (length(prVars) == 0) prVars <- NULL ## might be character(0) - - used_args$data <- origData - used_args$formula <- formula - used_args$weights <- evalRecursive(arg = weights, env = PF)$weights - - arglist <- list(call = this_call, - arguments = used_args, - surv.scale = surv.scale, - surv.breaks = surv.breaks, - print.vars = prVars, - adjust.vars = adVars, - value.vars = valVars, - misc.vars = miscVars, - surv.vars = survVars, - est.vars = estVars, - SE.vars = SEVars, - CI.vars = CIVars) - varsArgs <- substr(names(arglist), nchar(names(arglist))-4L, nchar(names(arglist))) == ".vars" - varsArgs <- names(arglist)[varsArgs] - arglist[varsArgs] <- lapply(arglist[varsArgs], function(x) if (length(x) == 0L) NULL else x) - - setattr(data, "survtab.meta", arglist) - - if (verbose) cat("Time taken by whole process: ", timetaken(starttime), "\n") - data[] -} - - -# ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", -# status = status %in% 1:2, pophaz = popmort, pp = TRUE, -# aggre = list(sex, fot), fot = seq(0, 5, 1/12)) -# ag[, d.exp := pmax(0L, from0to1 - 3L)] -# st <- survtab_ag(ag, surv.type = "surv.obs", surv.method = "hazard") -# st <- survtab_ag(ag, surv.type = "surv.cause", surv.method = "hazard", d = list(a = from0to1-3, b = 3)) - -# sire <- copy(sire) -# sire$sex <- rbinom(nrow(sire), size = 1, prob = 0.5) -# ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", -# status = status %in% 1:2, pophaz = popmort, pp = TRUE, -# aggre = list(sex, agegr = cut(dg_age, c(0,60,70,80, Inf), labels = FALSE), fot), -# fot = seq(0, 5, 1/12)) -# ag <- lexpand(sire, birth = "bi_date", entry = "bi_date", exit = "ex_date", -# status = status %in% 1:2, -# aggre = list(sex, age), -# age = seq(0, 100, 1)) -# wdt <- data.table(agegr = 1:4, weights = c(0.2, 0.4, 0.3, 0.1)) -# wli <- list(agegr = c(0.2, 0.4, 0.3, 0.1)) -# st <- survtab_ag(fot ~ sex + adjust(agegr), data = ag, surv.type = "surv.obs", surv.method = "hazard", weights = wli) -# st <- survtab_ag(fot ~ sex + adjust(agegr), data = ag, surv.type = "surv.rel", -# d.pp = "from0to1.pp", d.pp.2 = "from0to1.pp.2", -# d.exp.pp = "d.exp.pp", pyrs.pp = "ptime.pp", -# surv.method = "hazard", weights = wli, -# relsurv.method = "pp") -# ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", -# status = status, pophaz = popmort, pp = TRUE, -# aggre = list(sex, agegr = cut(dg_age, c(0,60,70,80, Inf), labels = FALSE), fot), -# fot = seq(0, 5, 1/12)) -# st <- survtab_ag(fot ~ sex + adjust(agegr), data = ag, -# d = list(cand = from0to1, othd = from0to2), -# surv.type = "surv.cause", weights = wli) -# st <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.obs", surv.method = "hazard", adjust = "agegr", weights = wli) -# st <- survtab_ag(fot ~ adjust(agegr), data = ag, surv.type = "surv.obs", weights = wli) -# st <- survtab_ag(fot ~ 1, data = ag, adjust = "agegr", surv.type = "surv.obs", weights = wli) -# st <- survtab_ag(fot ~ 1, data = ag, adjust = "agegr", surv.type = "surv.obs", weights = wli) -# st <- survtab_ag(fot ~ 1, data = ag, surv.type = "surv.obs") - -# wli2 <- wli -# wli$sex <- c(0.4, 0.6) -# st <- survtab_ag(fot ~ adjust(sex, agegr), data = ag, surv.type = "surv.obs", weights = wli) -# st <- survtab_ag(fot ~ adjust(agegr), data = ag, surv.type = "surv.obs", weights = wli["agegr"]) -# ag[, d.exp := pmax(from0to1 - 1, 0L)] -# st <- survtab_ag(fot ~ adjust(sex, agegr), data = ag, surv.type = "surv.rel", weights = wli) -# st <- survtab_ag(fot ~ adjust(sex, agegr), data = ag, surv.type = "surv.cause", weights = wli) -# ag[, othd := pmax(from0to1 - 1L, 0L)] -# st <- survtab_ag(fot ~ adjust(sex, agegr), data = ag, d = list(cand = from0to1, othd = pmax(from0to1-1L, 0L)), surv.type = "surv.cause", weights = wli) +#' @template survival_doc_template +#' @param formula a \code{formula}; the response +#' must be the time scale to compute survival time function estimates +#' over, e.g. \code{fot ~ sex}. Variables on the right-hand side of the formula +#' separated by \code{+} are considered stratifying variables, for which +#' estimates are computed separately. May contain usage of \code{adjust()} +#' --- see Details and Examples. +#' @param data since popEpi 0.4.0, a \code{data.frame} +#' containing variables used in \code{formula} and other arguments. +#' \code{aggre} objects are recommended as they contain information on any +#' time scales and are therefore safer; for creating \code{aggre} objects see +#' \code{\link{as.aggre}} when your data is already aggregated and \code{aggre} +#' for aggregating split \code{Lexis} objects. +#' +#' @param surv.breaks a vector of breaks on the +#' survival time scale. Optional if \code{data} is an \code{aggre} object +#' and mandatory otherwise. Must define each intended interval; +#' e.g. \code{surv.breaks = 0:5} when data has intervals defined by +#' breaks \code{seq(0, 5, 1/12)} will aggregate to wider intervals first. +#' It is generally recommended (and sufficient; +#' see Seppa, Dyban and Hakulinen (2015)) to use monthly +#' intervals where applicable. +#' +#' @param n variable containing counts of subjects at-risk at the start of a +#' time interval; e.g. \code{n = "at.risk"}. +#' Required when \code{surv.method = "lifetable"}. +#' \link[=flexible_argument]{Flexible input}. +#' +#' @param d variable(s) containing counts of subjects experiencing an event. +#' With only one type of event, e.g. \code{d = "deaths"}. With multiple types of +#' events (for CIF or cause-specific survival estimation), supply e.g. +#' \code{d = c("canD", "othD")}. If the survival time function to be estimated +#' does not use multiple types of events, supplying more than one variable +#' to \code{d} simply causes the variables to be added together. +#' Always required. \link[=flexible_argument]{Flexible input}. +#' +#' @param n.cens variable containing counts of subjects censored during a +#' survival time interval; E.g. \code{n.cens = "alive"}. +#' Required when \code{surv.method = "lifetable"}. +#' \link[=flexible_argument]{Flexible input}. + +#' @param pyrs variable containing total subject-time accumulated within a +#' survival time interval; E.g. \code{pyrs = "pyrs"}. +#' Required when \code{surv.method = "hazard"}. Flexible input. + +#' @param d.exp variable denoting total "expected numbers of events" +#' (typically computed \code{pyrs * pop.haz}, where +#' \code{pop.haz} is the expected hazard level) +#' accumulated within a survival time interval; E.g. \code{pyrs = "pyrs"}. +#' Required when computing EdererII relative survivals or +#' CIFs based on excess counts of events. Flexible input. + +#' @param n.pp variable containing total Pohar-Perme weighted counts of +#' subjects at risk in an interval, +#' supplied as argument \code{n} is supplied. +#' Computed originally on the subject +#' level as analogous to \code{pp * as.integer(status == "at-risk")}. +#' Required when \code{relsurv.method = "pp"}. Flexible input. +#' +#' @param d.pp variable(s) containing Pohar-Perme weighted counts of events, +#' supplied as argument \code{d} is supplied. Computed originally on the subject +#' level as analogous to \code{pp * as.integer(status == some_event)}. +#' Required when \code{relsurv.method = "pp"}. Flexible input. + +#' @param d.pp.2 variable(s) containing total Pohar-Perme +#' "double-weighted" counts of events, +#' supplied as argument \code{d} is supplied. Computed originally on the subject +#' level as analogous to \code{pp * pp * as.integer(status == some_event)}. +#' Required when \code{relsurv.method = "pp"}. Flexible input. + +#' @param n.cens.pp variable containing total Pohar-Perme weighted counts +#' censorings, +#' supplied as argument \code{n.cens} is supplied. +#' Computed originally on the subject +#' level as analogous to \code{pp * as.integer(status == "censored")}. +#' Required when \code{relsurv.method = "pp"}. Flexible input. + +#' @param pyrs.pp variable containing total Pohar-Perme weighted subject-times, +#' supplied as argument \code{pyrs} is supplied. +#' Computed originally on the subject +#' level as analogous to \code{pp * pyrs}. +#' Required when \code{relsurv.method = "pp"}. Flexible input. + +#' @param d.exp.pp variable containing total Pohar-Perme weighted counts +#' of excess events, +#' supplied as argument \code{pyrs} is supplied. +#' Computed originally on the subject +#' level as analogous to \code{pp * d.exp}. +#' Required when \code{relsurv.method = "pp"}. Flexible input. +#' +#' +#' @section Data requirements: +#' +#' \code{survtab_ag} computes estimates of survival time functions using +#' pre-aggregated data. For using subject-level data directly, use +#' \code{\link{survtab}}. For aggregating data, see \code{\link{lexpand}} +#' and \code{\link{aggre}}. +#' +#' By default, and if data is an \code{aggre} object (not mandatory), +#' \code{survtab_ag} makes use of the exact same breaks that were used in +#' splitting the original data (with e.g. \code{lexpand}), so it is not +#' necessary to specify any \code{surv.breaks}. If specified, the +#' \code{surv.breaks} must be a subset of the pertinent +#' pre-existing breaks. When data is not an \code{aggre} object, breaks +#' must always be specified. Interval lengths (\code{delta} in output) are +#' also calculated based on whichever breaks are used, +#' so the upper limit of the breaks should +#' therefore be meaningful and never e.g. \code{Inf}. +#' +#' +#' @examples +#' ## see more examples with explanations in vignette("survtab_examples") +#' +#' #### survtab_ag usage +#' +#' data("sire", package = "popEpi") +#' ## prepare data for e.g. 5-year "period analysis" for 2008-2012 +#' ## note: sire is a simulated cohort integrated into popEpi. +#' BL <- list(fot=seq(0, 5, by = 1/12), +#' per = c("2008-01-01", "2013-01-01")) +#' x <- lexpand(sire, birth = bi_date, entry = dg_date, exit = ex_date, +#' status = status %in% 1:2, +#' breaks = BL, +#' pophaz = popmort, +#' aggre = list(fot)) +#' +#' ## calculate relative EdererII period method +#' ## NOTE: x is an aggre object here, so surv.breaks are deduced +#' ## automatically +#' st <- survtab_ag(fot ~ 1, data = x) +#' +#' summary(st, t = 1:5) ## annual estimates +#' summary(st, q = list(r.e2 = 0.75)) ## 1st interval where r.e2 < 0.75 at end +#' \dontrun{ +#' plot(st) +#' +#' +#' ## non-aggre data: first call to survtab_ag would fail +#' df <- data.frame(x) +#' # st <- survtab_ag(fot ~ 1, data = x) +#' st <- survtab_ag(fot ~ 1, data = x, surv.breaks = BL$fot) +#' +#' ## calculate age-standardised 5-year relative survival ratio using +#' ## Ederer II method and period approach +#' +#' sire$agegr <- cut(sire$dg_age,c(0,45,55,65,75,Inf),right=F) +#' BL <- list(fot=seq(0, 5, by = 1/12), +#' per = c("2008-01-01", "2013-01-01")) +#' x <- lexpand(sire, birth = bi_date, entry = dg_date, exit = ex_date, +#' status = status %in% 1:2, +#' breaks = BL, +#' pophaz = popmort, +#' aggre = list(agegr, fot)) +#' +#' ## age standardisation using internal weights (age distribution of +#' ## patients diagnosed within the period window) +#' ## (NOTE: what is done here is equivalent to using weights = "internal") +#' w <- aggregate(at.risk ~ agegr, data = x[x$fot == 0], FUN = sum) +#' names(w) <- c("agegr", "weights") +#' +#' st <- survtab_ag(fot ~ adjust(agegr), data = x, weights = w) +#' plot(st, y = "r.e2.as", col = c("blue")) +#' +#' ## age standardisation using ICSS1 weights +#' data(ICSS) +#' cut <- c(0, 45, 55, 65, 75, Inf) +#' agegr <- cut(ICSS$age, cut, right = FALSE) +#' w <- aggregate(ICSS1~agegr, data = ICSS, FUN = sum) +#' names(w) <- c("agegr", "weights") +#' +#' st <- survtab_ag(fot ~ adjust(agegr), data = x, weights = w) +#' lines(st, y = "r.e2.as", col = c("red")) +#' +#' +#' ## cause-specific survival +#' sire$stat <- factor(sire$status, 0:2, c("alive", "canD", "othD")) +#' x <- lexpand(sire, birth = bi_date, entry = dg_date, exit = ex_date, +#' status = stat, +#' breaks = BL, +#' pophaz = popmort, +#' aggre = list(agegr, fot)) +#' st <- survtab_ag(fot ~ adjust(agegr), data = x, weights = w, +#' d = c("fromalivetocanD", "fromalivetoothD"), +#' surv.type = "surv.cause") +#' plot(st, y = "surv.obs.fromalivetocanD.as") +#' lines(st, y = "surv.obs.fromalivetoothD.as", col = "red") +#' +#' +#' } +#' @export +survtab_ag <- function(formula = NULL, + + data, + + adjust = NULL, + weights = NULL, + + surv.breaks = NULL, + + n = "at.risk", + d = "from0to1", + n.cens = "from0to0", + pyrs = "pyrs", + d.exp = "d.exp", + + n.pp = NULL, + d.pp = "d.pp", + d.pp.2 = "d.pp.2", + n.cens.pp = "n.cens.pp", + pyrs.pp = "pyrs.pp", + d.exp.pp = "d.exp.pp", + + surv.type="surv.rel", + surv.method="hazard", + relsurv.method="e2", + + subset = NULL, + + conf.level = 0.95, + conf.type = "log-log", + + verbose=FALSE) { + + if (verbose) starttime <- proc.time() + + Tstop <- delta <- Tstart <- surv.int <- n.eff <- n.eff.pp <- surv.obs <- + lag1_surv.obs <- p.obs <- CIF.rel <- NULL ## APPEASE R CMD CHECK + + TF <- environment() + PF <- parent.frame(1L) + + this_call <- match.call() + used_args <- as.list(this_call)[-1L] + fl <- formals("survtab_ag") + used_args <- c(used_args, fl[!names(fl) %in% names(used_args)]) + used_args <- used_args[names(fl)] + rm(fl) + + attrs <- copy(attributes(data)) + + # check data ----------------------------------------------------------------- + if (missing(data) || nrow(data) == 0) stop("data missing or has no rows") + + # check arguments ------------------------------------------------------------ + + surv.type <- match.arg(surv.type, c("surv.obs","surv.rel","surv.cause", "cif.obs", "cif.rel")) + surv.method <- match.arg(surv.method, c("lifetable","hazard")) + relsurv.method <- match.arg(relsurv.method, c("e2", "pp", "EdererII", "Pohar-Perme", "pohar-perme", "edererII", "ederer2")) + if (relsurv.method %in% c("EdererII", "edererII", "ederer2")) relsurv.method <- "e2" + if (relsurv.method %in% c("Pohar-Perme", "pohar-perme")) relsurv.method <- "pp" + relsurv.method <- match.arg(relsurv.method, c("e2", "pp")) + conf.type <- match.arg(conf.type, c("log","log-log","plain")) + + + ## argument 'formula' pre-check ---------------------------------------------- + if (!(inherits(formula, "formula") && length(formula) == 3L)) { + stop("Argument 'formula' does not appear to be a two-sided formula. ", + "Usage: e.g. fot ~ sex") + } + surv.scale <- deparse(formula[[2]]) + if (!surv.scale %in% names(data)) { + stop("Left-hand-side of formula must be a column in data; e.g. ", + "fot ~ sex, where 'fot' is the name of a column in data.") + } + + ## check breaks -------------------------------------------------------------- + + surv.breaks <- select_breaks(data = data, ts = surv.scale, br = surv.breaks) + surv.breaks <- sort(unique(surv.breaks)) + # if (!breaks_in_data(surv.breaks, surv.scale, data)) { + # stop("Used breaks do not all appear to exist in data. Make sure the ", + # "breaks match to the values that your time scale variable has in the ", + # "data.") + # } + + # data prep & subsetting ----------------------------------------------------- + subset <- substitute(subset) + subset <- evalLogicalSubset(data, subset) + + origData <- data + + data <- data[subset, ] + setDT(data) + + # handle count etc. variables ------------------------------------------------ + + valVars <- c("d") + valVars <- c(valVars, if (surv.method == "hazard") "pyrs" else c("n", "n.cens")) + + valVars <- c(valVars, if (surv.type == "surv.rel" && relsurv.method == "e2") "d.exp" else NULL) + + valVars <- c(valVars, if (surv.type == "cif.rel") "d.exp" else NULL) + + ppVars <- c("d.pp", "d.exp.pp", "d.pp.2", + if (surv.method == "hazard") "pyrs.pp" else c("n.cens.pp", "n.pp")) + valVars <- c(valVars, if (surv.type == "surv.rel" && relsurv.method == "pp") ppVars else NULL) + + fo <- formals("survtab_ag") + mc <- as.list(match.call())[-1] + mc <- c(mc, fo[!names(fo) %in% names(mc)]) + + mc <- mc[valVars] + + + mc <- lapply(mc, function(elem) { + evalPopArg(data = data, arg = elem, DT = TRUE, enclos = PF, recursive = TRUE) + }) + + + ## if given as multiple vars, combine these into one (e.g. n = c("v1", "v2")) + combValVars <- c("n", "n.cens", "d.exp", "d.pp", "d.exp.pp", "d.pp.2", + "pyrs", "pyrs.pp", "n.cens.pp", "n.pp") + combValVars <- intersect(names(mc), combValVars) + mc[combValVars] <- lapply(combValVars, function(val_var) { + tab <- mc[[val_var]] + if (!is.data.frame(tab) || length(tab) == 1L) return(tab) + tab_na <- names(tab) + e <- paste0(tab_na, collapse = " + ") + e <- parse(text = e) + tab <- data.table(V1 = tab[, eval(e)]) + setnames(tab, "V1", val_var) + tab + }) + + ## NOTE: this does not delete but sets the value to NULL. + mc[unlist(lapply(mc, function(x) { + NROW(x) == 0L || is.null(x) || is.language(x) || inherits(x, "try-error") + }))] <- NULL + + lackVars <- setdiff(valVars, names(mc[!unlist(lapply(mc, is.null))])) + if (length(lackVars) > 0) { + stop("Following arguments were NULL or could not be evaluated but are ", + "required: ", paste0("'", lackVars, "'", collapse = ", "), ". ", + "Usual suspects: arguments are NULL or refer to variables that ", + "cannot be found in data.") + } + + eventVars <- NULL + mc[[1]] <- data.table(mc[[1L]]) ## this avoids an exotic error in set(). + nl <- lapply(mc, names) + for (k in 1:length(mc)) { + jay <- argName <- names(mc[k]) + cn <- names(mc[[k]]) + + if (length(cn) > 1) jay <- paste0(jay, ".", cn) ## e.g. d.1, d.2, ... + if (argName %in% c("d")) { + eventVars <- jay + if (surv.type %in% c("surv.cause") && length(cn) == 1L) { + stop("surv.type = 'surv.cause', but only one type of event supplied ", + "via argument 'd'. If you want to compute cause-specific ", + "survivals, please supply multiple types of events via ", + "'d'; otherwise use surv.type = 'surv.obs'") + } else if (length(cn) > 1 && !argName %in% c("d","d.pp", "d.pp.2", "n.pp")) { + stop("'", argName, "' has/evaluates to ", length(cn), + " columns; only 'd', 'd.pp', and 'd'pp.2', 'n.pp' may evaluate ", + "to more than one column of the value arguments") + } + + } + + setnames(mc[[k]], cn, jay) + set(mc[[1]], j = jay, value = mc[[k]]) + nl[[argName]] <- jay + } + mc <- mc[[1]] + + if (!is.null(eventVars)) { + set(mc, j = "d", value = rowSums(mc[, mget(eventVars)])) + valVars <- unique(c(valVars, "d", eventVars)) + } + + + all_names_present(mc, valVars, + msg = paste0("Expected internal temp data to have ", + "variables %%VARS%% at this point, but didn't", + ". This is most likely a bug and should be ", + "reported to pkg maintainer.")) + setcolorder(mc, valVars) + + ## addition: internal weights use n at beginning of first interval + + if (is.character(weights)) { + checkWeights(weights) + if (!"n" %in% valVars) { + n <- substitute(n) + mc$n <- evalPopArg(n, data = data, enclos = PF) + + valVars <- unique(c(valVars, "n")) + + if (is.null(mc$n)) { + + stop("Requested internal weights to be computed and used to standardize ", + "estimates, but argument 'n' not supplied. This is currently ", + "required for computing internal weights (the values of 'n' ", + "in the first interval will be used for this). Please supply 'n' ", + "or supply hand-made weights (preferred for your clarity).") + } + } + + data[, c("n") := mc$n] + + } + + + # making weighted table of aggregated values --------------------------------- + ## NOTE: at-risk counts require special treatment when surv.breaks + ## are a subset of the available breaks: cannot sum at-risk figures! + ## instead should simply pick the value at the start of the + ## (now larger) interval. Will accomplish this by setting values not + ## at the start of an interval to zero and summing anyway. + if (surv.method == "lifetable") { + wh_internal <- list(surv.breaks) + names(wh_internal) <- surv.scale + wh_internal <- data[wh_internal, on = eval(surv.scale), which = TRUE] + wh_internal <- setdiff(1:nrow(data), wh_internal) + mc[wh_internal, intersect(c("n", "n.pp"), names(mc)) := 0L] + } + + ## NOTE: while ssSub will pass the whole column of e.g. fot values, which will + ## not limit the data to e.g. up 5 years of follow-up if original data went + ## further, surv.breaks may be only up to 5 years and will limit the data + ## in makeWeightsDT using a CJ-merge-trick appropriately (via custom.levels). + bl <- list(surv.breaks) + setattr(bl, "names", surv.scale) + + adjust <- evalPopArg(data, adjust, enclos = PF, naming = "model") + + iws <- NULL + if (is.character(weights) && pmatch(weights, c("internal", "cohort"), 0)) { + if (!"n" %in% names(data)) { + stop("Need 'n' specified for when using internal weights: Internal ", + "weights are computed as the counts of subjects at the start of ", + "follow-up.") + } + iws <- makeTempVarName(data, pre = "internal_weights_") + data[, c(iws) := 0.0] + data[data[[surv.scale]] == surv.breaks[1], c(iws) := n] + } + + data <- makeWeightsDT(data = data, values = list(mc), enclos = PF, + print = NULL, formula = formula, adjust = adjust, + by.other = surv.scale, Surv.response = FALSE, + custom.levels = bl, weights = weights, + internal.weights.values = iws, + custom.levels.cut.low = surv.scale) + + allVars <- attr(data, "makeWeightsDT") + allVars[] <- lapply(allVars, function(x) if (length(x) == 0L) NULL else x) + prVars <- allVars$prVars + adVars <- allVars$adVars + # boVars <- allVars$boVars ## this is surv.scale + valVars <- allVars$vaVars + + ## to avoid e.g. 'factor(sex, 1:2)' going bonkers + prVars_orig <- prVars + if (length(prVars) > 0L) { + prVars <- makeTempVarName(names = c(names(data), adVars), + pre = paste0("print_", 1:length(prVars))) + } + adVars_orig <- adVars + if (length(adVars) > 0L) { + adVars <- makeTempVarName(names = c(names(data), prVars), + pre = paste0("print_", 1:length(adVars))) + } + if (length(c(prVars, adVars))) setnames(data, c(prVars_orig, adVars_orig), c(prVars, adVars)) + byVars <- c(prVars, adVars) + + # formulate some needed variables -------------------------------------------- + setkeyv(data, c(byVars, surv.scale)) + data[, "Tstop" := surv.breaks[-1]] + setnames(data, surv.scale, "Tstart") + data[, "delta" := Tstop - Tstart] + data[, "surv.int" := 1:.N, by = eval(byVars)] + setcolorder(data, c(byVars, "surv.int", "Tstart", "Tstop", "delta", valVars, intersect(names(data), "weights"))) + + if (surv.method == "lifetable") { + testEvents <- data[, n - shift(n, n = 1, type = "lead", fill = NA), by = eval(byVars)]$V1 + testEvents <- data$n.cens + data$d - testEvents + + if (sum(abs(testEvents), na.rm = TRUE)) { + on.exit({ + + data[, "n.cens + d - (n-lead1_n)" := testEvents] + wh <- testEvents != 0L + wh <- wh & !is.na(wh) + if (interactive()) { + printSD <- c(byVars, "Tstop", "d", "n", "n.cens", + "n.cens + d - (n-lead1_n)") + print(data[wh, .SD, .SDcols = printSD], top = 5, nrow = 10) + + } + + }, add = TRUE) + + stop("Supplied n.cens and d do not sum to total number of events and ", + "censorings based on n alone. Note that lifetable analysis ", + "is currently not supported for period analysis (or other ", + "comparable limitations of data).", + if (interactive())" See table below and check your variables.") + } + rm(testEvents) + data[, "n.eff" := n - n.cens/2L] + } + + + # compute observed survivals ------------------------------------------------ + if (verbose) ostime <- proc.time() + + if (surv.method=="lifetable") { + comp.st.surv.obs.lif(surv.table = data, surv.by.vars = byVars) + } + if (surv.method=="hazard") { + comp.st.surv.obs.haz(surv.table = data, surv.by.vars = byVars) + } + + data <- comp.st.conf.ints(data, al=1-conf.level, surv="surv.obs", transform = conf.type) + + if (verbose) cat("Time taken by computing observed survivals:", timetaken(ostime), "\n") + + + ## empty surv.int checking --------------------------------------------------- + testVar <- if (surv.method == "lifetable") "n" else "pyrs" + ## sum over adjusting variables + data <- test_empty_surv_ints(data, by = c(prVars, adVars), + show.by = c(prVars_orig, adVars_orig), + sum.over = adVars, + test.var = testVar) + + ## sum over nothing + if (length(adVars) > 0L) { + data <- test_empty_surv_ints(data, by = c(prVars, adVars), + show.by = c(prVars_orig, adVars_orig), + sum.over = NULL, test.var = testVar) + } + + ## if adjusting, crop all estimates by adjusting variables + ## to shortest estimate + if (length(adVars)) { + adLe <- data[, list(min = min(surv.int), max = max(surv.int)), keyby = eval(adVars)] + adLe <- c(max(adLe$min), min(adLe$max)) + data <- data[surv.int %in% `:`(adLe[1L], adLe[2L])] + } + + # create and print table of bad surv.ints ------------------------------------ + + badObsSurv <- data$surv.obs == 0 | is.na(data$surv.obs) + if (sum(badObsSurv)) { + + zerotab <- data[badObsSurv, + list(first.bad.surv.int = min(as.integer(surv.int)), + last.bad.surv.int = max(as.integer(surv.int)), + surv.obs=min(surv.obs)), keyby = eval(byVars)] + + + message("Some cumulative surv.obs were zero or NA:") + if (length(byVars)) setnames(zerotab, c(prVars, adVars), c(prVars_orig, adVars_orig)) + print(zerotab) + if (surv.method == "lifetable" && data[surv.obs == 0, .N] > 0) { + message("NOTE: Zero surv.obs leads to zero relative survivals as well. Adjusting with weights WILL use the zero surv.obs / relative survival values.") + } + + } + rm(badObsSurv) + + # compute cause-specific survivals ------------------------------------------ + if (surv.type == "surv.cause") { + + ## NOTE: these related to adjusting life-table estimates for delayed entry... + # data[, "n.eff" := n - n.cens/2 + n.de/2 + n.de.cens/4] # + d.de/2 + # "n.cens_1" := n.cens + (d-d_1) + # "n.de.cens" := n.de.cens + (d.de - d.de_1) + + if (surv.method == "lifetable") { + for (k in eventVars) { + k <- gsub(pattern = "d_", replacement = "", x = k) + d_k <- paste0("d_", k) + # d.de_k <- paste0("d.de_",k) + + n.eff_k <- paste0("n.eff_",k) + + ## old: " := n - (n.cens + (d-", d_k,")/2 + n.de/2 + (n.de.cens + d.de - ", d.de_k,")/4 )" + # expr <- paste0(n.eff_k, " := n - (n.cens + (d-", d_k,")/2 )") + + set(data, j = c(n.eff_k), value = data$n.eff + (data$d - data[[d_k]])/2L ) # + d.de/2 + # data[, eval(parse(text = expr), envir = .SD)] + + } + + } + + surv_names <- names(data)[grep("surv.obs", names(data))] + surv_names <- c("d", if (surv.method == "lifetable") "n.eff" else NULL, surv_names) + setnames(data, surv_names, paste0(surv_names, ".orig")) + + for (k in eventVars) { + + k <- gsub(pattern = "d.", replacement = "", x = k) + setnames(data, paste0("d.",k), "d") + + if (surv.method=="hazard") { + comp.st.surv.obs.haz(surv.table = data, surv.by.vars = byVars) + } else { + setnames(data, paste0("n.eff_", k), "n.eff") + comp.st.surv.obs.lif(surv.table = data, surv.by.vars = byVars) + } + os.table <- comp.st.conf.ints(data, al=1-conf.level, surv="surv.obs", transform = conf.type) + + new_surv_names <- setdiff(surv_names, c("d", if (surv.method == "lifetable") "n.eff" else NULL)) + new_surv_names <- gsub("surv.obs", paste0("surv.obs.", k), new_surv_names) + new_surv_names <- c(paste0(c("d.", if (surv.method == "lifetable") "n.eff." else NULL), k), new_surv_names) + setnames(data, surv_names, new_surv_names) + + + } + setnames(data, paste0(surv_names, ".orig"), surv_names) + } + + # compute cause-specific/excess-case CIFs ------------------------------------ + if (surv.type %in% c("cif.obs", "cif.rel")) { + + data[, "lag1_surv.obs" := shift(surv.obs, n = 1L, type = "lag", fill = 1), by = eval(byVars)] + data[, "p.obs" := surv.obs/lag1_surv.obs] + + if (surv.type == "cif.obs") { + for (k in eventVars) { + + k <- gsub("d.", "", x = k) + d.k <- paste0("d.", k) + + d.var <- paste0("d.",k) + q.var <- paste0("q.", k) + CIF_var <- paste0("CIF_", k) + data[, (q.var) := (1-p.obs)*get(d.var)/d] + data[get(d.var) == 0L | d == 0L, (q.var) := 0] + data[, (CIF_var) := cumsum(lag1_surv.obs*get(q.var)), by = eval(byVars)] + } + } + + if (surv.type == "cif.rel") { + ## assuming d.exp in data + data[, "CIF.rel" := (1-p.obs)*(d-d.exp)/d] + data[d.exp>d, "CIF.rel" := NA] + data[, "CIF.rel" := cumsum(lag1_surv.obs*CIF.rel), by = eval(byVars)] + } + + ## SEs currently not known for CIFs; impute 0 to make adjusting work + CIF_vars <- names(data)[substr(names(data),1,3) == "CIF"] + data[, c(paste0("SE.", CIF_vars)) := 0L] + + setcolsnull(data, c("lag1_surv.obs", "p.obs", paste0("q.", substr(eventVars, 3, nchar(eventVars))))) + + } + + + # relative survivals --------------------------------------------------------- + if (surv.type == "surv.rel" & relsurv.method == "e2") { + + # compute r.e2 ------------------------------------------------------------- + comp.st.rs <- function(rs.table, rs.by.vars = byVars) { + + p.exp <- delta <- surv.exp <- surv.obs <- n.eff.pp <- + surv.obs <- NULL ## APPEASE R CMD CHECK + ## EdererII + + ##------------- + if (surv.method == "hazard") { + rs.table[, "p.exp" := exp(-delta*d.exp/pyrs)] + rs.table[, "surv.exp" := cumprod(p.exp), by = eval(rs.by.vars)] + comp.st.r.e2.haz(surv.table = rs.table, surv.by.vars = rs.by.vars) + } else { + rs.table[, "p.exp" := 1 - d.exp/n] + rs.table[, "surv.exp" := cumprod(p.exp), by = eval(rs.by.vars)] + + if (rs.table[, min(surv.obs, na.rm=T) == 0]) { + rs.table[surv.obs == 0, "surv.exp" := 1] + } + + comp.st.r.e2.lif(surv.table = rs.table, surv.by.vars = rs.by.vars) + + if (rs.table[, min(surv.obs, na.rm=T) == 0]) { + rs.table[surv.obs == 0, intersect(c("surv.exp","r.e2","SE.r.e2","r.e2.lo","r.e2.hi"), names(rs.table)) := 0] + } + } + + ## ------------ + + rs.table <- comp.st.conf.ints(rs.table, al=1-conf.level, surv="r.e2", transform = conf.type) + + return(rs.table) + } + + data <- comp.st.rs(rs.table = data) + + + } + + # compute r.pp --------------------------------------------------------------- + if (surv.type == "surv.rel" & relsurv.method == "pp") { + + all_names_present(data, c("d.pp", "d.exp.pp", "d.pp.2")) + ## pohar perme: analysis weighted by expected cumulative survival + comp.st.pp <- function(pp.table, by.vars = byVars) { + ## relative survival + if (surv.method == "hazard") { + all_names_present(data, c("pyrs.pp"), + msg = paste0("internal error: work data did not have", + " variable named pyrs.pp. Complain ", + "to package maintainer if you see this.")) + comp.st.r.pp.haz(surv.table = pp.table, surv.by.vars = by.vars) + } else { + data[, "n.eff.pp" := n.pp - 0.5*n.cens.pp] + all_names_present(data, c("n.pp", "n.cens.pp", "n.eff.pp"), + msg = paste0("internal error: work data did not have", + " variable named n.eff.pp. Complain ", + "to package maintainer if you see this.")) + comp.st.r.pp.lif(surv.table = pp.table, surv.by.vars = by.vars) + + if (pp.table[, min(surv.obs, na.rm=T) == 0]) { + pp.table[surv.obs == 0, intersect(c("r.pp","SE.r.pp","r.pp.lo","r.pp.hi"), names(pp.table)) := 0] + } + } + + pp.table <- comp.st.conf.ints(pp.table, al=1-conf.level, surv="r.pp", transform = conf.type ) + + return(pp.table) + } + data <- comp.st.pp(pp.table = data) + } + + # compute adjusted estimates ------------------------------------------------- + if ("weights" %in% names(data)) { + w_est_vars <- names(data)[substr(names(data), 1, 8) == "surv.obs"] + w_est_vars <- c(w_est_vars, "r.e2", "r.pp") + w_est_vars <- c(w_est_vars, names(data)[substr(names(data),1,3)=="CIF"]) + w_est_vars <- intersect(w_est_vars, names(data)) + w_est_vars <- w_est_vars[unlist(lapply(w_est_vars, function(x) { + !substr(x, nchar(x)-2L, nchar(x)) %in% c(".lo", ".hi") + }))] + w_se_vars <- paste0("SE.", w_est_vars) + + w_est <- data[, lapply(.SD, function(x) sum(x*weights)), + keyby = c(prVars, "surv.int"), .SDcols = w_est_vars] + w_se <- data[, lapply(.SD, function(x) sqrt(sum((x^2)*(weights^2)))), + keyby = c(prVars, "surv.int"), .SDcols = w_se_vars] + + data <- data[, lapply(mget(valVars), sum), + keyby = c(prVars, "surv.int", "Tstart", "Tstop", "delta")] + set(data, j = w_se_vars, value = mget(w_se_vars, as.environment(w_se))) + set(data, j = w_est_vars, value = mget(w_est_vars, as.environment(w_est))) + + setnames(data, old = c(w_est_vars, w_se_vars), + new = paste0(c(w_est_vars, w_se_vars), ".as")) + + for (var in paste0(w_est_vars, ".as")) { + data <- comp.st.conf.ints(data, al = 1-conf.level, + surv = var, transform = conf.type) + } + + + } + + # clean-up ------------------------------------------------------------------- + ## back to original names of print / adjust (used to avoid e.g. + ## 'factor(V1, 1:2)' going bonkers in data.table) + if (length(c(prVars))) setnames(data, c(prVars), c(prVars_orig)) + prVars <- prVars_orig + adVars <- adVars_orig + + ## reorder table, format numeric values, etc. + + miscVars <- intersect(names(data), c("surv.int", "Tstart", "Tstop", "delta")) + + survVars <- c("surv.obs.lo","surv.obs","surv.obs.hi","SE.surv.obs", + "r.e2.lo","r.e2","r.e2.hi","SE.r.e2", + "r.pp.lo","r.pp","r.pp.hi","SE.r.pp", + paste0("CIF.rel.", c("lo", "", "hi")), "SE.CIF.rel", + "surv.obs.as.lo","surv.obs.as","surv.obs.as.hi","SE.surv.obs.as", + "r.e2.as.lo","r.e2.as","r.e2.as.hi","SE.r.e2.as", + "r.pp.as.lo","r.pp.as","r.pp.as.hi","SE.r.pp.as", + paste0("CIF.rel.as.", c("lo", "", "hi")), "SE.CIF.rel.as" + ) + survVars <- intersect(survVars, names(data)) + + ## which variables are estimates, SEs, CIs, etc. + survVars.ca <- setdiff(names(data), c(prVars, valVars, miscVars, survVars)) + CIF_vars <- survVars.ca[substr(survVars.ca, 1,3)=="CIF" | substr(survVars.ca, 1,6)=="SE.CIF"] + survVars <- c(survVars, CIF_vars) + + surv.obs.vars <- survVars.ca[substr(survVars.ca, 1,8) == "surv.obs" | substr(survVars.ca, 1,11) == "SE.surv.obs"] + survVars <- c(survVars, surv.obs.vars) + + survVars <- unique(intersect(survVars, names(data))) + + ## remove some unuseful variables + setcolsnull(data, c("SE.A", "SE.B")) + setcolsnull(data, survVars[substr(survVars, 1, 6) == "SE.CIF"]) ## since they are zero for now + survVars <- intersect(survVars, names(data)) + + SEVars <- survVars[substr(survVars, 1, 3) == "SE."] + CIVars <- survVars[substr(survVars, nchar(survVars) - 2L, nchar(survVars)) %in% c(".lo", ".hi")] + estVars <- setdiff(survVars, c(SEVars, CIVars)) + + order <- unique(c(prVars, miscVars, valVars, survVars)) + order <- intersect(order, names(data)) + + setcolsnull(data, setdiff(names(data), order)) + setcolorder(data,order) + + setkeyv(data, c(prVars, "surv.int")) + + # attributes ----------------------------------------------------------------- + setkeyv(data, c(prVars, "surv.int")) + setattr(data, "class", c("survtab", "data.table", "data.frame")) + if (!return_DT()) setDFpe(data) + if (length(prVars) == 0) prVars <- NULL ## might be character(0) + + used_args$data <- origData + used_args$formula <- formula + used_args$weights <- evalRecursive(arg = weights, env = PF)$weights + + arglist <- list(call = this_call, + arguments = used_args, + surv.scale = surv.scale, + surv.breaks = surv.breaks, + print.vars = prVars, + adjust.vars = adVars, + value.vars = valVars, + misc.vars = miscVars, + surv.vars = survVars, + est.vars = estVars, + SE.vars = SEVars, + CI.vars = CIVars) + varsArgs <- substr(names(arglist), nchar(names(arglist))-4L, nchar(names(arglist))) == ".vars" + varsArgs <- names(arglist)[varsArgs] + arglist[varsArgs] <- lapply(arglist[varsArgs], function(x) if (length(x) == 0L) NULL else x) + + setattr(data, "survtab.meta", arglist) + + if (verbose) cat("Time taken by whole process: ", timetaken(starttime), "\n") + data[] +} + + +# ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", +# status = status %in% 1:2, pophaz = popmort, pp = TRUE, +# aggre = list(sex, fot), fot = seq(0, 5, 1/12)) +# ag[, d.exp := pmax(0L, from0to1 - 3L)] +# st <- survtab_ag(ag, surv.type = "surv.obs", surv.method = "hazard") +# st <- survtab_ag(ag, surv.type = "surv.cause", surv.method = "hazard", d = list(a = from0to1-3, b = 3)) + +# sire <- copy(sire) +# sire$sex <- rbinom(nrow(sire), size = 1, prob = 0.5) +# ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", +# status = status %in% 1:2, pophaz = popmort, pp = TRUE, +# aggre = list(sex, agegr = cut(dg_age, c(0,60,70,80, Inf), labels = FALSE), fot), +# fot = seq(0, 5, 1/12)) +# ag <- lexpand(sire, birth = "bi_date", entry = "bi_date", exit = "ex_date", +# status = status %in% 1:2, +# aggre = list(sex, age), +# age = seq(0, 100, 1)) +# wdt <- data.table(agegr = 1:4, weights = c(0.2, 0.4, 0.3, 0.1)) +# wli <- list(agegr = c(0.2, 0.4, 0.3, 0.1)) +# st <- survtab_ag(fot ~ sex + adjust(agegr), data = ag, surv.type = "surv.obs", surv.method = "hazard", weights = wli) +# st <- survtab_ag(fot ~ sex + adjust(agegr), data = ag, surv.type = "surv.rel", +# d.pp = "from0to1.pp", d.pp.2 = "from0to1.pp.2", +# d.exp.pp = "d.exp.pp", pyrs.pp = "ptime.pp", +# surv.method = "hazard", weights = wli, +# relsurv.method = "pp") +# ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", +# status = status, pophaz = popmort, pp = TRUE, +# aggre = list(sex, agegr = cut(dg_age, c(0,60,70,80, Inf), labels = FALSE), fot), +# fot = seq(0, 5, 1/12)) +# st <- survtab_ag(fot ~ sex + adjust(agegr), data = ag, +# d = list(cand = from0to1, othd = from0to2), +# surv.type = "surv.cause", weights = wli) +# st <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.obs", surv.method = "hazard", adjust = "agegr", weights = wli) +# st <- survtab_ag(fot ~ adjust(agegr), data = ag, surv.type = "surv.obs", weights = wli) +# st <- survtab_ag(fot ~ 1, data = ag, adjust = "agegr", surv.type = "surv.obs", weights = wli) +# st <- survtab_ag(fot ~ 1, data = ag, adjust = "agegr", surv.type = "surv.obs", weights = wli) +# st <- survtab_ag(fot ~ 1, data = ag, surv.type = "surv.obs") + +# wli2 <- wli +# wli$sex <- c(0.4, 0.6) +# st <- survtab_ag(fot ~ adjust(sex, agegr), data = ag, surv.type = "surv.obs", weights = wli) +# st <- survtab_ag(fot ~ adjust(agegr), data = ag, surv.type = "surv.obs", weights = wli["agegr"]) +# ag[, d.exp := pmax(from0to1 - 1, 0L)] +# st <- survtab_ag(fot ~ adjust(sex, agegr), data = ag, surv.type = "surv.rel", weights = wli) +# st <- survtab_ag(fot ~ adjust(sex, agegr), data = ag, surv.type = "surv.cause", weights = wli) +# ag[, othd := pmax(from0to1 - 1L, 0L)] +# st <- survtab_ag(fot ~ adjust(sex, agegr), data = ag, d = list(cand = from0to1, othd = pmax(from0to1-1L, 0L)), surv.type = "surv.cause", weights = wli) diff --git a/R/survival_lexis.R b/R/survival_lexis.R index 93ebcad..9137b2f 100644 --- a/R/survival_lexis.R +++ b/R/survival_lexis.R @@ -1,594 +1,594 @@ - - - - - -#' @template survival_doc_template -#' @param formula a \code{formula}; e.g. \code{fot ~ sex}, -#' where \code{fot} is the time scale over which you wish to estimate a -#' survival time function; this -#' assumes that \code{lex.Xst} in your data is the status variable in the -#' intended format (almost always right). -#' To be explicit, use \code{\link[survival]{Surv}}: e.g. -#' \code{Surv(fot, lex.Xst) ~ sex}. -#' Variables on the right-hand side of the formula -#' separated by \code{+} are considered stratifying variables, for which -#' estimates are computed separately. May contain usage of \code{adjust()} -#' --- see Details and Examples. -#' @param data a \code{Lexis} object with at least the survival time scale -#' @param breaks a named list of breaks, e.g. -#' \code{list(FUT = 0:5)}. If data is not split in advance, \code{breaks} -#' must at the very least contain a vector of breaks to split the survival time -#' scale (mentioned in argument \code{formula}). If data has already been split -#' (using e.g. \code{\link{splitMulti}}) along at least the used survival time -#' scale, this may be \code{NULL}. It is generally recommended (and sufficient; -#' see Seppa, Dyban and Hakulinen (2015)) to use monthly -#' intervals where applicable. -#' @param pophaz a \code{data.frame} containing -#' expected hazards for the event of interest to occur. See the -#' \link[=pophaz]{dedicated help page}. Required when -#' \code{surv.type = "surv.rel"} or \code{"cif.rel"}. \code{pophaz} must -#' contain one column named \code{"haz"}, and any number of other columns -#' identifying levels of variables to do a merge with split data within -#' \code{survtab}. Some columns may be time scales, which will -#' allow for the expected hazard to vary by e.g. calendar time and age. -#' -#' -#' -#' @examples -#' \dontrun{ -#' data("sire", package = "popEpi") -#' library(Epi) -#' library(survival) -#' -#' ## NOTE: recommended to use factor status variable -#' x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), -#' exit = list(CAL = get.yrs(ex_date)), -#' data = sire[sire$dg_date < sire$ex_date, ], -#' exit.status = factor(status, levels = 0:2, -#' labels = c("alive", "canD", "othD")), -#' merge = TRUE) -#' -#' ## phony group variable -#' set.seed(1L) -#' x$group <- rbinom(nrow(x), 1, 0.5) -#' -#' ## observed survival. explicit supplying of status: -#' st <- survtab(Surv(time = FUT, event = lex.Xst) ~ group, data = x, -#' surv.type = "surv.obs", -#' breaks = list(FUT = seq(0, 5, 1/12))) -#' ## this assumes the status is lex.Xst (right 99.9 % of the time) -#' st <- survtab(FUT ~ group, data = x, -#' surv.type = "surv.obs", -#' breaks = list(FUT = seq(0, 5, 1/12))) -#' -#' ## relative survival (ederer II) -#' data("popmort", package = "popEpi") -#' pm <- data.frame(popmort) -#' names(pm) <- c("sex", "CAL", "AGE", "haz") -#' st <- survtab(FUT ~ group, data = x, -#' surv.type = "surv.rel", -#' pophaz = pm, -#' breaks = list(FUT = seq(0, 5, 1/12))) -#' -#' ## ICSS weights usage -#' data("ICSS", package = "popEpi") -#' cut <- c(0, 30, 50, 70, Inf) -#' agegr <- cut(ICSS$age, cut, right = FALSE) -#' w <- aggregate(ICSS1~agegr, data = ICSS, FUN = sum) -#' x$agegr <- cut(x$dg_age, cut, right = FALSE) -#' st <- survtab(FUT ~ group + adjust(agegr), data = x, -#' surv.type = "surv.rel", -#' pophaz = pm, weights = w$ICSS1, -#' breaks = list(FUT = seq(0, 5, 1/12))) -#' -#' #### using dates with survtab -#' x <- Lexis(entry = list(FUT = 0L, AGE = dg_date-bi_date, CAL = dg_date), -#' exit = list(CAL = ex_date), -#' data = sire[sire$dg_date < sire$ex_date, ], -#' exit.status = factor(status, levels = 0:2, -#' labels = c("alive", "canD", "othD")), -#' merge = TRUE) -#' ## phony group variable -#' set.seed(1L) -#' x$group <- rbinom(nrow(x), 1, 0.5) -#' -#' st <- survtab(Surv(time = FUT, event = lex.Xst) ~ group, data = x, -#' surv.type = "surv.obs", -#' breaks = list(FUT = seq(0, 5, 1/12)*365.25)) -#' -#' ## NOTE: population hazard should be reported at the same scale -#' ## as time variables in your Lexis data. -#' data(popmort, package = "popEpi") -#' pm <- data.frame(popmort) -#' names(pm) <- c("sex", "CAL", "AGE", "haz") -#' ## from year to day level -#' pm$haz <- pm$haz/365.25 -#' pm$CAL <- as.Date(paste0(pm$CAL, "-01-01")) -#' pm$AGE <- pm$AGE*365.25 -#' -#' st <- survtab(Surv(time = FUT, event = lex.Xst) ~ group, data = x, -#' surv.type = "surv.rel", relsurv.method = "e2", -#' pophaz = pm, -#' breaks = list(FUT = seq(0, 5, 1/12)*365.25)) -#' } -#' @export -#' @importFrom survival Surv -survtab <- function(formula, data, adjust = NULL, breaks = NULL, - pophaz = NULL, weights = NULL, surv.type = "surv.rel", - surv.method = "hazard", relsurv.method = "e2", - subset = NULL, - conf.level = 0.95, - conf.type = "log-log", - verbose = FALSE) { - - TF <- environment() - PF <- parent.frame() - this_call <- match.call() - startTime <- proc.time() - - if (!requireNamespace("survival")) { - stop("Need package 'survival' to proceed") - } - - ## appease R CMD CHECK ------------------------------------------------------- - lex.Cst <- lex.Xst <- lex.dur <- NULL - - ## checks -------------------------------------------------------------------- - - if (missing(formula)) stop("Formula not defined!") - - checkLexisData(data) - - allScales <- attr(data, "time.scales") - splitScales <- names(breaks) - - ## ensure breaks make sense -------------------------------------------------- - oldBreaks <- attr(data, "breaks") - checkBreaksList(data, breaks = oldBreaks) - testOldBreaks <- setdiff(oldBreaks, list(NULL)) - if (is.null(breaks) && !length(testOldBreaks)) { - stop("No breaks supplied via argument 'breaks', and data has not been ", - "split in advance. Please supply a list of breaks ", - "to argument 'breaks'") - } - if (is.null(breaks)) breaks <- oldBreaks - checkBreaksList(data, breaks = breaks) - ## match break types to time scale types - ## (don't try to match time scales to breaks) - splitScales <- names(breaks) - for (k in splitScales) { - breaks[[k]] <- matchBreakTypes(data, breaks = breaks[[k]], timeScale = k) - } - - comp_pp <- FALSE - drop <- TRUE - if (surv.type == "surv.rel" && relsurv.method == "pp") comp_pp <- TRUE - if (comp_pp) drop <- FALSE - - - ## data & subset ------------------------------------------------------------- - subset <- evalLogicalSubset(data, substitute(subset)) - x <- data[subset, ]; rm(subset) - setDT(x) - forceLexisDT(x, breaks = NULL, allScales = allScales, key = TRUE) - - ## pre-eval of print & adjust ------------------------------------------------ - - adSub <- substitute(adjust) - adTest <- evalRecursive(adSub, env = x, enc = PF) - if (!is.null(adTest)) { - adSub <- adTest$argSub - adVars <- all.vars(adSub) - } else { - adSub <- substitute(NULL) - adVars <- NULL - } - - formula <- evalRecursive(formula, env = TF, enc = PF)$arg - foVars <- all.vars(formula) - - if (!inherits(formula,"formula")) { - stop("Argument 'formula' is not a formula object. Usage: e.g. ", - "Surv(fot, lex.Xst %in% 1:2) ~ sex") - } - if (length(formula) != 3L) { - stop("Argument 'formula'must be two-sided. Usage: e.g. ", - "Surv(fot, lex.Xst %in% 1:2) ~ sex") - } - ## eval print & adjust ------------------------------------------------------- - ## this adjust passed to resulting data's attributes at the end - adSub <- substitute(adjust) - adjust <- evalPopArg(data = x, arg = adSub, - enclos = PF, DT = TRUE, - recursive = TRUE) - - l <- usePopFormula(form = formula, adjust = adjust, data = x, enclos = PF, - Surv.response = "either") - prVars <- names(l$print) - adVars <- names(l$adjust) - - - ## check weights makes sense with respect to adjust -------------------------- - if (length(adVars) > 0L && !is.null(weights)) { - checkWeights(weights, adjust = l$adjust) - - } - - ## check pophaz -------------------------------------------------------------- - - if (surv.type %in% c("surv.rel", "cif.rel")) { - checkPophaz(x, pophaz, haz.name = "haz") - } - pophazVars <- setdiff(names(pophaz), "haz") - - ## only keep necessary variables --------------------------------------------- - - setcolsnull(x, keep = c("lex.id", "lex.dur", allScales, - "lex.Cst", "lex.Xst", pophazVars)) - if (length(prVars)) x[, c(prVars)] <- l$print - if (length(adVars)) x[, c(adVars)] <- l$adjust - - - ## simplify event and censoring indicators ----------------------------------- - cens.values <- event.values <- NULL - all.values <- if (is.factor(l$y$status)) levels(l$y$status) else - sort(unique(l$y$status)) - cens.values <- all.values[1L] - event.values <- setdiff(all.values, cens.values) - - if (is.numeric(l$y$status) && all(unique(l$y$status) %in% 0:1)) { - ## this should apply to situations where status coded 0/1 - ## and both 0/1 present or only 1 present - if (all(unique(l$y$status) %in% 0L)) { - stop("All status values were zero, i.e. all obs were censored. ", - "Check that you passed the correct status variable or --- if this ", - "was intended --- code the status variable to 0/1 so that 1 ", - "corresponds to the event taking place and 0 not.") - } - cens.values <- 0L - event.values <- 1L - } - - x[, lex.Cst := NULL] - x[, lex.Cst := TF$cens.values] - x[, lex.Xst := NULL] - x[, lex.Xst := l$y$status] - harmonizeStatuses(x, C = "lex.Cst", X = "lex.Xst") - - if (!surv.type %in% c("cif.obs", "surv.cause")) { - ## this simplifies computations - - x[, lex.Cst := NULL] - x[, lex.Cst := 0L] - setcolorder(x, c(intersect(names(data), names(x)), - setdiff(names(x), names(data)))) - - x[, lex.Xst := as.integer(lex.Xst %in% TF$event.values)] - cens.values <- 0L - event.values <- 1L - if (x[, sum(lex.Xst)] == 0L) { - stop("There are no events in the data. Ensure that the event argument ", - "used in Surv() makes sense.") - } - } - - ## detect which time scale used ---------------------------------------------- - - survScale <- detectSurvivalTimeScale(lex = x, values = l$y$time) - - - ## crop data to speed up computations ---------------------------------------- - cropBreaks <- breaks - if (surv.type == "surv.rel" && relsurv.method == "pp") { - ## pp-weights have to be computed from entry to follow-up till roof of breaks; - ## can only crop along the survival time scale - cropBreaks <- breaks[1L] - cb <- protectFromDrop(cropBreaks[[1L]], lower = TRUE) - cb <- c(min(cb), max(cropBreaks[[1L]])) - cropBreaks[[1L]] <- cb - } - - - intelliCrop(x = x, breaks = cropBreaks, allScales = allScales, cropStatuses = TRUE) - x <- intelliDrop(x, breaks = cropBreaks, dropNegDur = TRUE, check = TRUE) - setDT(x) - forceLexisDT(x, breaks = oldBreaks, allScales = allScales, key = TRUE) - - ## splitting ----------------------------------------------------------------- - - splitTime <- proc.time() - setDT(x) - forceLexisDT(x, breaks = oldBreaks, allScales = allScales, key = TRUE) - x <- splitMulti(x, breaks = breaks, drop = FALSE, merge = TRUE) - setDT(x) - forceLexisDT(x, breaks = breaks, allScales = allScales, key = TRUE) - if (verbose) cat("Time taken by splitting Lexis data: ", timetaken(splitTime), "\n") - - ## pophaz merge -------------------------------------------------------------- - if (!is.null(pophaz)) { - hazTime <- proc.time() - haz <- NULL ## appease R CMD CHECK - x <- cutLowMerge(x, pophaz, by = pophazVars, - mid.scales = intersect(pophazVars, allScales)) - setDT(x) - forceLexisDT(x, breaks = breaks, allScales =allScales, key = TRUE) - if (verbose) cat("Time taken by merging population hazards with split Lexis data: ", timetaken(hazTime), "\n") - } - - ## pp computation ------------------------------------------------------------ - ppNames <- d.pp <- d.pp.2 <- d.exp.pp <- ptime.pp <- - at.risk.pp <- n.cens.pp <- NULL - - if (comp_pp) { - ppTime <- proc.time() - setkeyv(x, c("lex.id", survScale)) - comp_pp_weights(x, surv.scale = survScale, - breaks = breaks[[survScale]], haz = "haz", - style = "delta", verbose = FALSE) - setDT(x) - forceLexisDT(x, breaks = breaks, allScales = allScales, key = TRUE) - if (verbose) cat("Time taken by computing Pohar-Perme weights: ", timetaken(ppTime), "\n") - - intelliCrop(x = x, breaks = breaks, allScales = allScales, cropStatuses = TRUE) - x <- intelliDrop(x, breaks = breaks, dropNegDur = TRUE, check = TRUE) - forceLexisDT(x, breaks = breaks, allScales = allScales, key = TRUE) - - ppTime <- proc.time() - pp <- comp_pp_weighted_figures(x, haz = "haz", pp = "pp", by = "lex.id") - ppNames <- makeTempVarName(x, pre = names(pp)) - x[, c(TF$ppNames) := TF$pp] ## note: TF$pp avoids conflicts - rm(pp) - - d.pp.2 <- ppNames[substr(ppNames, 1, 13) == "from0to1.pp.2"] - d.pp <- ppNames[substr(ppNames, 1, 11) == "from0to1.pp"] - d.pp <- setdiff(d.pp, d.pp.2) - d.exp.pp <- ppNames[substr(ppNames, 1, 8) == "d.exp.pp"] - ptime.pp <- ppNames[substr(ppNames, 1, 8) == "ptime.pp"] - n.cens.pp <- ppNames[substr(ppNames, 1, 11) == "from0to0.pp"] - n.cens.pp <- n.cens.pp[substr(n.cens.pp, 1,13) != "from0to0.pp.2"] - at.risk.pp <- ppNames[substr(ppNames, 1, 10) == "at.risk.pp"] - d.exp.pp <- ppNames[substr(ppNames, 1, 8) == "d.exp.pp"] - - if (verbose) cat("Time taken by computing Pohar-Perme weighted ", - "counts and person-times: ", timetaken(ppTime), "\n") - } - - d.exp <- NULL - if (surv.type %in% c("surv.rel", "cif.rel") && "haz" %in% names(x)) { - d.exp <- makeTempVarName(x, pre = "d.exp_") - x[, c(TF$d.exp) := lex.dur * haz] - } - - ## aggregation --------------------------------------------------------------- - aggreTime <- proc.time() - - ## this includes time scale to compute survivals over - aggreVars <- c(prVars, adVars, survScale) - - setDT(x) - forceLexisDT(x, breaks = breaks, allScales = allScales, key = TRUE) - if (verbose) cat("** verbose messages from aggre(): \n") - x <- aggre(x, by = aggreVars, verbose = verbose, - sum.values = c(d.exp, ppNames)) - if (verbose) cat("** end of verbose messages from aggre() \n") - setDT(x) - setattr(x, "class", c("aggre", "data.table", "data.frame")) - if (verbose) cat("Time taken by aggregating split Lexis data: ", - timetaken(aggreTime), "\n") - - ## neater column names ------------------------------------------------------- - ## in case there are zero obs that are censored - censCols <- paste0("from", cens.values, "to", cens.values) - if (all(!censCols %in% names(x))) { - x[, c(censCols) := 0L] - } - - ## e.g. fromAlivetoDead -> Dead; looks better in survtab_ag output - evCols <- paste0("from", cens.values, "to", c(cens.values, event.values)) - whEC <- which(evCols %in% names(x)) - - if (sum(whEC)) { - setnames(x, evCols[whEC], - as.character(c(cens.values, event.values)[whEC])) - } - - ## survtab_ag ---------------------------------------------------------------- - dn <- intersect(event.values, names(x)) - if (length(dn) == 0L) { - stop("Internal error: no event variables in work data. Complain to the ", - "package maintainer if you see this - unless there are no events ", - "in the data?") - } - n.cens <- intersect(cens.values, names(x)) - - if (length(prVars) == 0L) { - prVars <- "1" - } - - form <- as.formula(paste0(survScale, " ~ ", paste0(prVars, collapse = " + "))) - - if (verbose) cat("** verbose messages from survtab_ag(): \n") - - st <- survtab_ag(data = x, - formula = TF$form, - adjust = TF$adVars, - - weights = TF$weights, - - d = TF$dn, pyrs = "pyrs", n = "at.risk", - d.exp = TF$d.exp, n.cens = TF$n.cens, - - n.pp = TF$at.risk.pp, - d.pp = TF$d.pp, d.exp.pp = TF$d.exp.pp, d.pp.2 = TF$d.pp.2, - n.cens.pp = TF$n.cens.pp, pyrs.pp = TF$ptime.pp, - - surv.type = surv.type, - surv.method = surv.method, - relsurv.method = relsurv.method, - - conf.type = conf.type, - conf.level = conf.level, - - verbose = verbose) - if (verbose) cat("** end of verbose messages from survtab_ag() \n") - ## attributes ---------------------------------------------------------------- - - attributes(st)$survtab.meta$call <- this_call - attributes(st)$survtab.meta$arguments$adjust <- adjust - attributes(st)$survtab.meta$arguments$conf.type <- conf.type - attributes(st)$survtab.meta$arguments$conf.level <- conf.level - - attributes(st)$survtab.meta$arguments$surv.type <- surv.type - attributes(st)$survtab.meta$arguments$surv.method <- surv.method - attributes(st)$survtab.meta$arguments$relsurv.method <- relsurv.method - - if (verbose) cat("Total time taken by survtab: ", timetaken(startTime), "\n") - st -} -# library(Epi) -# library(popEpi) -# dt <- copy(sire)[dg_date < ex_date,] -# dt[, agegr := cut(dg_age, c(0,50,75,Inf))] -# dt[, sex := rbinom(n = .N, size = 1, prob = 0.5)] -# dt <- Lexis(data = dt, entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), -# exit = list(CAL = get.yrs(ex_date)), entry.status = 0L, exit.status = status, merge = TRUE) -# pm <- copy(popEpi::popmort) -# setnames(pm, c("agegroup", "year"), c("AGE", "CAL")) -# st <- survtab(data = dt, formula = Surv(FUT, lex.Xst) ~ 1, #adjust = "agegr", -# # pophaz = pm, -# surv.type = "surv.obs", -# # weights = list(agegr = c(0.2,0.4,0.4)), -# breaks = list(FUT = seq(0,5,1/12))) -# st <- survtab(dt, print = NULL, #adjust = "agegr", -# # pophaz = pm, -# surv.type = "surv.obs", -# # weights = list(agegr = c(0.2,0.4,0.4)), -# breaks = list(AGE = seq(0,100, 1))) -# st <- survtab(dt, print = NULL, #adjust = "agegr", -# pophaz = pm, -# surv.type = "surv.rel", -# relsurv.method = "pp", -# # weights = list(agegr = c(0.2,0.4,0.4)), -# breaks = list(FUT = seq(0,5,1/12))) -# st <- survtab(dt, print = NULL, adjust = c("sex","agegr"), -# pophaz = pm, -# surv.type = "surv.rel", -# relsurv.method = "pp", -# weights = list(sex = c(0.5, 0.5), agegr = c(0.2,0.4,0.4)), -# breaks = list(FUT = seq(0,5,1/12))) - - -# ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", -# status = status %in% 1:2, pophaz = popmort, pp = TRUE, -# fot = seq(0, 5, 1/12)) -# pm2 <- copy(popEpi::popmort) -# setnames(pm2, c("year", "agegroup"), c("per", "age")) -# st <- survtab(ag, print = NULL, #adjust = c("sex","agegr"), -# pophaz = pm2, -# surv.type = "surv.rel", -# relsurv.method = "pp", -# #weights = list(sex = c(0.5, 0.5), agegr = c(0.2,0.4,0.4)), -# breaks = list(fot = seq(0,5,1/12))) -detectEvents <- function(x, breaks, tol = .Machine$double.eps^0.5, by = "lex.id") { - ## INTENTION: given a Lexis object, determines which rows - ## have an event (a transition or end-point) within the window - ## determined by breaks (a list of breaks as supplied to e.g. splitMulti). - ## Usable with split and unsplit data, though it is best to do this - ## before splitting for efficiency. - ## NOTE: by should be a character vector specifying variables that identify - ## unique subjects; the idea is that each subject only has one end-point - ## and some transitions - ## NOTE: breaks should be a list of breaks or NULL; if it is NULL, - ## it is NOT checked whether observations were cut short by the breaks used. - ## observations cut short are not any kind of events. - ## OUTPUT: an integer vector coding events as follows: - ## 0: no event within breaks (row cut short by breaks or subject - ## has multiple rows, of which this is not an event) - ## 1: transition within breaks - ## 2: original end-point within breaks and no transition occured (i.e. censoring) - if (!is.data.table(x)) stop("x must be a data.table; if you see this, send the package maintainer an email") - # checkLexisData(x) - if (!inherits(x, "Lexis")) stop("data not a Lexis object") - if (!is.null(breaks)) { - checkBreaksList(x, breaks) - breaks[unlist(lapply(breaks, length)) == 0L] <- NULL - } - - ## R CMD CHECK appeasement - lex.Cst <- lex.Xst <- NULL - - tmp <- list() - oldKey <- key(x) - if (length(oldKey) == 0L) { - tmp$order <- makeTempVarName(x, pre = "order_") - on.exit(if (tmp$order %in% names(x)) setorderv(x, tmp$order), add = TRUE) - on.exit(setcolsnull(x, tmp$order, soft = TRUE), add = TRUE) - set(x, j = tmp$order, value = 1:nrow(x)) - } else on.exit(setkeyv(x, oldKey), add = TRUE) - - - setkeyv(x, c(by, names(breaks)[1L])) - setkeyv(x, by) - ## rows that actually can be events: transitions and last rows by subject - whTr <- x[, lex.Cst != lex.Xst] - whLa <- !duplicated(x, fromLast = TRUE, by=key(x)) - whEv <- whTr | whLa - - if (!is.null(breaks)) { - - splitScales <- names(breaks) - if (any(!splitScales %in% names(x))) stop("Following time scales missing from data that data was split by: ", paste0("'", setdiff(splitScales, names(x)), "'", collapse = ", ")) - - brmax <- lapply(breaks, max) - brmin <- lapply(breaks, min) - - ## detect rows residing within breaks window - for (sc in splitScales) { - z <- (x$lex.dur + x[[sc]])[whEv] - tol_sc <- if (is.double(z)) tol else 0L - - ## NOTE: if max of orig values within breaks window, then all may be events - if (!(max(z) + tol_sc < brmax[[sc]])) whEv[whEv] <- z < brmax[[sc]] - tol_sc - if (!(min(z) - tol_sc > brmin[[sc]])) whEv[whEv] <- z > brmin[[sc]] + tol_sc - - } - ## whEv now indicates rows that may be events AND which reside within breaks window. - } - - ## censored events are not transitions, but must reside within breaks window. - whCe <- whLa & !whTr & whEv - - ## need to add event indicator to data since it has been reordered, - ## reorder back old order, and return the event indicator. - tmp$ind <- makeTempVarName(x, pre = "event_indicator_") - on.exit(setcolsnull(x, delete = tmp$ind, soft = TRUE), add = TRUE) - evInd <- as.integer(whEv) - evInd <- ifelse(whCe, 2L, evInd) - set(x, j = tmp$ind, value = evInd) - - if (length(oldKey) == 0L) { - setkeyv(x, NULL) - setorderv(x, tmp$order) - set(x, j = tmp$order, value = NULL) - } else setkeyv(x, oldKey) - - - evInd <- x[[tmp$ind]] - set(x, j = tmp$ind, value = NULL) - on.exit(expr = {}, add = FALSE) ## removes on.exit expressions from earlier - - - if (!identical(oldKey, key(x))) stop("keys do not match at function end; send an email to package maintainer if you see this") - - evInd -} - - - - - - - - - + + + + + +#' @template survival_doc_template +#' @param formula a \code{formula}; e.g. \code{fot ~ sex}, +#' where \code{fot} is the time scale over which you wish to estimate a +#' survival time function; this +#' assumes that \code{lex.Xst} in your data is the status variable in the +#' intended format (almost always right). +#' To be explicit, use \code{\link[survival]{Surv}}: e.g. +#' \code{Surv(fot, lex.Xst) ~ sex}. +#' Variables on the right-hand side of the formula +#' separated by \code{+} are considered stratifying variables, for which +#' estimates are computed separately. May contain usage of \code{adjust()} +#' --- see Details and Examples. +#' @param data a \code{Lexis} object with at least the survival time scale +#' @param breaks a named list of breaks, e.g. +#' \code{list(FUT = 0:5)}. If data is not split in advance, \code{breaks} +#' must at the very least contain a vector of breaks to split the survival time +#' scale (mentioned in argument \code{formula}). If data has already been split +#' (using e.g. \code{\link{splitMulti}}) along at least the used survival time +#' scale, this may be \code{NULL}. It is generally recommended (and sufficient; +#' see Seppa, Dyban and Hakulinen (2015)) to use monthly +#' intervals where applicable. +#' @param pophaz a \code{data.frame} containing +#' expected hazards for the event of interest to occur. See the +#' \link[=pophaz]{dedicated help page}. Required when +#' \code{surv.type = "surv.rel"} or \code{"cif.rel"}. \code{pophaz} must +#' contain one column named \code{"haz"}, and any number of other columns +#' identifying levels of variables to do a merge with split data within +#' \code{survtab}. Some columns may be time scales, which will +#' allow for the expected hazard to vary by e.g. calendar time and age. +#' +#' +#' +#' @examples +#' \dontrun{ +#' data("sire", package = "popEpi") +#' library(Epi) +#' library(survival) +#' +#' ## NOTE: recommended to use factor status variable +#' x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), +#' exit = list(CAL = get.yrs(ex_date)), +#' data = sire[sire$dg_date < sire$ex_date, ], +#' exit.status = factor(status, levels = 0:2, +#' labels = c("alive", "canD", "othD")), +#' merge = TRUE) +#' +#' ## phony group variable +#' set.seed(1L) +#' x$group <- rbinom(nrow(x), 1, 0.5) +#' +#' ## observed survival. explicit supplying of status: +#' st <- survtab(Surv(time = FUT, event = lex.Xst) ~ group, data = x, +#' surv.type = "surv.obs", +#' breaks = list(FUT = seq(0, 5, 1/12))) +#' ## this assumes the status is lex.Xst (right 99.9 % of the time) +#' st <- survtab(FUT ~ group, data = x, +#' surv.type = "surv.obs", +#' breaks = list(FUT = seq(0, 5, 1/12))) +#' +#' ## relative survival (ederer II) +#' data("popmort", package = "popEpi") +#' pm <- data.frame(popmort) +#' names(pm) <- c("sex", "CAL", "AGE", "haz") +#' st <- survtab(FUT ~ group, data = x, +#' surv.type = "surv.rel", +#' pophaz = pm, +#' breaks = list(FUT = seq(0, 5, 1/12))) +#' +#' ## ICSS weights usage +#' data("ICSS", package = "popEpi") +#' cut <- c(0, 30, 50, 70, Inf) +#' agegr <- cut(ICSS$age, cut, right = FALSE) +#' w <- aggregate(ICSS1~agegr, data = ICSS, FUN = sum) +#' x$agegr <- cut(x$dg_age, cut, right = FALSE) +#' st <- survtab(FUT ~ group + adjust(agegr), data = x, +#' surv.type = "surv.rel", +#' pophaz = pm, weights = w$ICSS1, +#' breaks = list(FUT = seq(0, 5, 1/12))) +#' +#' #### using dates with survtab +#' x <- Lexis(entry = list(FUT = 0L, AGE = dg_date-bi_date, CAL = dg_date), +#' exit = list(CAL = ex_date), +#' data = sire[sire$dg_date < sire$ex_date, ], +#' exit.status = factor(status, levels = 0:2, +#' labels = c("alive", "canD", "othD")), +#' merge = TRUE) +#' ## phony group variable +#' set.seed(1L) +#' x$group <- rbinom(nrow(x), 1, 0.5) +#' +#' st <- survtab(Surv(time = FUT, event = lex.Xst) ~ group, data = x, +#' surv.type = "surv.obs", +#' breaks = list(FUT = seq(0, 5, 1/12)*365.25)) +#' +#' ## NOTE: population hazard should be reported at the same scale +#' ## as time variables in your Lexis data. +#' data(popmort, package = "popEpi") +#' pm <- data.frame(popmort) +#' names(pm) <- c("sex", "CAL", "AGE", "haz") +#' ## from year to day level +#' pm$haz <- pm$haz/365.25 +#' pm$CAL <- as.Date(paste0(pm$CAL, "-01-01")) +#' pm$AGE <- pm$AGE*365.25 +#' +#' st <- survtab(Surv(time = FUT, event = lex.Xst) ~ group, data = x, +#' surv.type = "surv.rel", relsurv.method = "e2", +#' pophaz = pm, +#' breaks = list(FUT = seq(0, 5, 1/12)*365.25)) +#' } +#' @export +#' @importFrom survival Surv +survtab <- function(formula, data, adjust = NULL, breaks = NULL, + pophaz = NULL, weights = NULL, surv.type = "surv.rel", + surv.method = "hazard", relsurv.method = "e2", + subset = NULL, + conf.level = 0.95, + conf.type = "log-log", + verbose = FALSE) { + + TF <- environment() + PF <- parent.frame() + this_call <- match.call() + startTime <- proc.time() + + if (!requireNamespace("survival")) { + stop("Need package 'survival' to proceed") + } + + ## appease R CMD CHECK ------------------------------------------------------- + lex.Cst <- lex.Xst <- lex.dur <- NULL + + ## checks -------------------------------------------------------------------- + + if (missing(formula)) stop("Formula not defined!") + + checkLexisData(data) + + allScales <- attr(data, "time.scales") + splitScales <- names(breaks) + + ## ensure breaks make sense -------------------------------------------------- + oldBreaks <- attr(data, "breaks") + checkBreaksList(data, breaks = oldBreaks) + testOldBreaks <- setdiff(oldBreaks, list(NULL)) + if (is.null(breaks) && !length(testOldBreaks)) { + stop("No breaks supplied via argument 'breaks', and data has not been ", + "split in advance. Please supply a list of breaks ", + "to argument 'breaks'") + } + if (is.null(breaks)) breaks <- oldBreaks + checkBreaksList(data, breaks = breaks) + ## match break types to time scale types + ## (don't try to match time scales to breaks) + splitScales <- names(breaks) + for (k in splitScales) { + breaks[[k]] <- matchBreakTypes(data, breaks = breaks[[k]], timeScale = k) + } + + comp_pp <- FALSE + drop <- TRUE + if (surv.type == "surv.rel" && relsurv.method == "pp") comp_pp <- TRUE + if (comp_pp) drop <- FALSE + + + ## data & subset ------------------------------------------------------------- + subset <- evalLogicalSubset(data, substitute(subset)) + x <- data[subset, ]; rm(subset) + setDT(x) + forceLexisDT(x, breaks = NULL, allScales = allScales, key = TRUE) + + ## pre-eval of print & adjust ------------------------------------------------ + + adSub <- substitute(adjust) + adTest <- evalRecursive(adSub, env = x, enc = PF) + if (!is.null(adTest)) { + adSub <- adTest$argSub + adVars <- all.vars(adSub) + } else { + adSub <- substitute(NULL) + adVars <- NULL + } + + formula <- evalRecursive(formula, env = TF, enc = PF)$arg + foVars <- all.vars(formula) + + if (!inherits(formula,"formula")) { + stop("Argument 'formula' is not a formula object. Usage: e.g. ", + "Surv(fot, lex.Xst %in% 1:2) ~ sex") + } + if (length(formula) != 3L) { + stop("Argument 'formula'must be two-sided. Usage: e.g. ", + "Surv(fot, lex.Xst %in% 1:2) ~ sex") + } + ## eval print & adjust ------------------------------------------------------- + ## this adjust passed to resulting data's attributes at the end + adSub <- substitute(adjust) + adjust <- evalPopArg(data = x, arg = adSub, + enclos = PF, DT = TRUE, + recursive = TRUE) + + l <- usePopFormula(form = formula, adjust = adjust, data = x, enclos = PF, + Surv.response = "either") + prVars <- names(l$print) + adVars <- names(l$adjust) + + + ## check weights makes sense with respect to adjust -------------------------- + if (length(adVars) > 0L && !is.null(weights)) { + checkWeights(weights, adjust = l$adjust) + + } + + ## check pophaz -------------------------------------------------------------- + + if (surv.type %in% c("surv.rel", "cif.rel")) { + checkPophaz(x, pophaz, haz.name = "haz") + } + pophazVars <- setdiff(names(pophaz), "haz") + + ## only keep necessary variables --------------------------------------------- + + setcolsnull(x, keep = c("lex.id", "lex.dur", allScales, + "lex.Cst", "lex.Xst", pophazVars)) + if (length(prVars)) x[, c(prVars)] <- l$print + if (length(adVars)) x[, c(adVars)] <- l$adjust + + + ## simplify event and censoring indicators ----------------------------------- + cens.values <- event.values <- NULL + all.values <- if (is.factor(l$y$status)) levels(l$y$status) else + sort(unique(l$y$status)) + cens.values <- all.values[1L] + event.values <- setdiff(all.values, cens.values) + + if (is.numeric(l$y$status) && all(unique(l$y$status) %in% 0:1)) { + ## this should apply to situations where status coded 0/1 + ## and both 0/1 present or only 1 present + if (all(unique(l$y$status) %in% 0L)) { + stop("All status values were zero, i.e. all obs were censored. ", + "Check that you passed the correct status variable or --- if this ", + "was intended --- code the status variable to 0/1 so that 1 ", + "corresponds to the event taking place and 0 not.") + } + cens.values <- 0L + event.values <- 1L + } + + x[, lex.Cst := NULL] + x[, lex.Cst := TF$cens.values] + x[, lex.Xst := NULL] + x[, lex.Xst := l$y$status] + harmonizeStatuses(x, C = "lex.Cst", X = "lex.Xst") + + if (!surv.type %in% c("cif.obs", "surv.cause")) { + ## this simplifies computations + + x[, lex.Cst := NULL] + x[, lex.Cst := 0L] + setcolorder(x, c(intersect(names(data), names(x)), + setdiff(names(x), names(data)))) + + x[, lex.Xst := as.integer(lex.Xst %in% TF$event.values)] + cens.values <- 0L + event.values <- 1L + if (x[, sum(lex.Xst)] == 0L) { + stop("There are no events in the data. Ensure that the event argument ", + "used in Surv() makes sense.") + } + } + + ## detect which time scale used ---------------------------------------------- + + survScale <- detectSurvivalTimeScale(lex = x, values = l$y$time) + + + ## crop data to speed up computations ---------------------------------------- + cropBreaks <- breaks + if (surv.type == "surv.rel" && relsurv.method == "pp") { + ## pp-weights have to be computed from entry to follow-up till roof of breaks; + ## can only crop along the survival time scale + cropBreaks <- breaks[1L] + cb <- protectFromDrop(cropBreaks[[1L]], lower = TRUE) + cb <- c(min(cb), max(cropBreaks[[1L]])) + cropBreaks[[1L]] <- cb + } + + + intelliCrop(x = x, breaks = cropBreaks, allScales = allScales, cropStatuses = TRUE) + x <- intelliDrop(x, breaks = cropBreaks, dropNegDur = TRUE, check = TRUE) + setDT(x) + forceLexisDT(x, breaks = oldBreaks, allScales = allScales, key = TRUE) + + ## splitting ----------------------------------------------------------------- + + splitTime <- proc.time() + setDT(x) + forceLexisDT(x, breaks = oldBreaks, allScales = allScales, key = TRUE) + x <- splitMulti(x, breaks = breaks, drop = FALSE, merge = TRUE) + setDT(x) + forceLexisDT(x, breaks = breaks, allScales = allScales, key = TRUE) + if (verbose) cat("Time taken by splitting Lexis data: ", timetaken(splitTime), "\n") + + ## pophaz merge -------------------------------------------------------------- + if (!is.null(pophaz)) { + hazTime <- proc.time() + haz <- NULL ## appease R CMD CHECK + x <- cutLowMerge(x, pophaz, by = pophazVars, + mid.scales = intersect(pophazVars, allScales)) + setDT(x) + forceLexisDT(x, breaks = breaks, allScales =allScales, key = TRUE) + if (verbose) cat("Time taken by merging population hazards with split Lexis data: ", timetaken(hazTime), "\n") + } + + ## pp computation ------------------------------------------------------------ + ppNames <- d.pp <- d.pp.2 <- d.exp.pp <- ptime.pp <- + at.risk.pp <- n.cens.pp <- NULL + + if (comp_pp) { + ppTime <- proc.time() + setkeyv(x, c("lex.id", survScale)) + comp_pp_weights(x, surv.scale = survScale, + breaks = breaks[[survScale]], haz = "haz", + style = "delta", verbose = FALSE) + setDT(x) + forceLexisDT(x, breaks = breaks, allScales = allScales, key = TRUE) + if (verbose) cat("Time taken by computing Pohar-Perme weights: ", timetaken(ppTime), "\n") + + intelliCrop(x = x, breaks = breaks, allScales = allScales, cropStatuses = TRUE) + x <- intelliDrop(x, breaks = breaks, dropNegDur = TRUE, check = TRUE) + forceLexisDT(x, breaks = breaks, allScales = allScales, key = TRUE) + + ppTime <- proc.time() + pp <- comp_pp_weighted_figures(x, haz = "haz", pp = "pp", by = "lex.id") + ppNames <- makeTempVarName(x, pre = names(pp)) + x[, c(TF$ppNames) := TF$pp] ## note: TF$pp avoids conflicts + rm(pp) + + d.pp.2 <- ppNames[substr(ppNames, 1, 13) == "from0to1.pp.2"] + d.pp <- ppNames[substr(ppNames, 1, 11) == "from0to1.pp"] + d.pp <- setdiff(d.pp, d.pp.2) + d.exp.pp <- ppNames[substr(ppNames, 1, 8) == "d.exp.pp"] + ptime.pp <- ppNames[substr(ppNames, 1, 8) == "ptime.pp"] + n.cens.pp <- ppNames[substr(ppNames, 1, 11) == "from0to0.pp"] + n.cens.pp <- n.cens.pp[substr(n.cens.pp, 1,13) != "from0to0.pp.2"] + at.risk.pp <- ppNames[substr(ppNames, 1, 10) == "at.risk.pp"] + d.exp.pp <- ppNames[substr(ppNames, 1, 8) == "d.exp.pp"] + + if (verbose) cat("Time taken by computing Pohar-Perme weighted ", + "counts and person-times: ", timetaken(ppTime), "\n") + } + + d.exp <- NULL + if (surv.type %in% c("surv.rel", "cif.rel") && "haz" %in% names(x)) { + d.exp <- makeTempVarName(x, pre = "d.exp_") + x[, c(TF$d.exp) := lex.dur * haz] + } + + ## aggregation --------------------------------------------------------------- + aggreTime <- proc.time() + + ## this includes time scale to compute survivals over + aggreVars <- c(prVars, adVars, survScale) + + setDT(x) + forceLexisDT(x, breaks = breaks, allScales = allScales, key = TRUE) + if (verbose) cat("** verbose messages from aggre(): \n") + x <- aggre(x, by = aggreVars, verbose = verbose, + sum.values = c(d.exp, ppNames)) + if (verbose) cat("** end of verbose messages from aggre() \n") + setDT(x) + setattr(x, "class", c("aggre", "data.table", "data.frame")) + if (verbose) cat("Time taken by aggregating split Lexis data: ", + timetaken(aggreTime), "\n") + + ## neater column names ------------------------------------------------------- + ## in case there are zero obs that are censored + censCols <- paste0("from", cens.values, "to", cens.values) + if (all(!censCols %in% names(x))) { + x[, c(censCols) := 0L] + } + + ## e.g. fromAlivetoDead -> Dead; looks better in survtab_ag output + evCols <- paste0("from", cens.values, "to", c(cens.values, event.values)) + whEC <- which(evCols %in% names(x)) + + if (sum(whEC)) { + setnames(x, evCols[whEC], + as.character(c(cens.values, event.values)[whEC])) + } + + ## survtab_ag ---------------------------------------------------------------- + dn <- intersect(event.values, names(x)) + if (length(dn) == 0L) { + stop("Internal error: no event variables in work data. Complain to the ", + "package maintainer if you see this - unless there are no events ", + "in the data?") + } + n.cens <- intersect(cens.values, names(x)) + + if (length(prVars) == 0L) { + prVars <- "1" + } + + form <- as.formula(paste0(survScale, " ~ ", paste0(prVars, collapse = " + "))) + + if (verbose) cat("** verbose messages from survtab_ag(): \n") + + st <- survtab_ag(data = x, + formula = TF$form, + adjust = TF$adVars, + + weights = TF$weights, + + d = TF$dn, pyrs = "pyrs", n = "at.risk", + d.exp = TF$d.exp, n.cens = TF$n.cens, + + n.pp = TF$at.risk.pp, + d.pp = TF$d.pp, d.exp.pp = TF$d.exp.pp, d.pp.2 = TF$d.pp.2, + n.cens.pp = TF$n.cens.pp, pyrs.pp = TF$ptime.pp, + + surv.type = surv.type, + surv.method = surv.method, + relsurv.method = relsurv.method, + + conf.type = conf.type, + conf.level = conf.level, + + verbose = verbose) + if (verbose) cat("** end of verbose messages from survtab_ag() \n") + ## attributes ---------------------------------------------------------------- + + attributes(st)$survtab.meta$call <- this_call + attributes(st)$survtab.meta$arguments$adjust <- adjust + attributes(st)$survtab.meta$arguments$conf.type <- conf.type + attributes(st)$survtab.meta$arguments$conf.level <- conf.level + + attributes(st)$survtab.meta$arguments$surv.type <- surv.type + attributes(st)$survtab.meta$arguments$surv.method <- surv.method + attributes(st)$survtab.meta$arguments$relsurv.method <- relsurv.method + + if (verbose) cat("Total time taken by survtab: ", timetaken(startTime), "\n") + st +} +# library(Epi) +# library(popEpi) +# dt <- copy(sire)[dg_date < ex_date,] +# dt[, agegr := cut(dg_age, c(0,50,75,Inf))] +# dt[, sex := rbinom(n = .N, size = 1, prob = 0.5)] +# dt <- Lexis(data = dt, entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), +# exit = list(CAL = get.yrs(ex_date)), entry.status = 0L, exit.status = status, merge = TRUE) +# pm <- copy(popEpi::popmort) +# setnames(pm, c("agegroup", "year"), c("AGE", "CAL")) +# st <- survtab(data = dt, formula = Surv(FUT, lex.Xst) ~ 1, #adjust = "agegr", +# # pophaz = pm, +# surv.type = "surv.obs", +# # weights = list(agegr = c(0.2,0.4,0.4)), +# breaks = list(FUT = seq(0,5,1/12))) +# st <- survtab(dt, print = NULL, #adjust = "agegr", +# # pophaz = pm, +# surv.type = "surv.obs", +# # weights = list(agegr = c(0.2,0.4,0.4)), +# breaks = list(AGE = seq(0,100, 1))) +# st <- survtab(dt, print = NULL, #adjust = "agegr", +# pophaz = pm, +# surv.type = "surv.rel", +# relsurv.method = "pp", +# # weights = list(agegr = c(0.2,0.4,0.4)), +# breaks = list(FUT = seq(0,5,1/12))) +# st <- survtab(dt, print = NULL, adjust = c("sex","agegr"), +# pophaz = pm, +# surv.type = "surv.rel", +# relsurv.method = "pp", +# weights = list(sex = c(0.5, 0.5), agegr = c(0.2,0.4,0.4)), +# breaks = list(FUT = seq(0,5,1/12))) + + +# ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", +# status = status %in% 1:2, pophaz = popmort, pp = TRUE, +# fot = seq(0, 5, 1/12)) +# pm2 <- copy(popEpi::popmort) +# setnames(pm2, c("year", "agegroup"), c("per", "age")) +# st <- survtab(ag, print = NULL, #adjust = c("sex","agegr"), +# pophaz = pm2, +# surv.type = "surv.rel", +# relsurv.method = "pp", +# #weights = list(sex = c(0.5, 0.5), agegr = c(0.2,0.4,0.4)), +# breaks = list(fot = seq(0,5,1/12))) +detectEvents <- function(x, breaks, tol = .Machine$double.eps^0.5, by = "lex.id") { + ## INTENTION: given a Lexis object, determines which rows + ## have an event (a transition or end-point) within the window + ## determined by breaks (a list of breaks as supplied to e.g. splitMulti). + ## Usable with split and unsplit data, though it is best to do this + ## before splitting for efficiency. + ## NOTE: by should be a character vector specifying variables that identify + ## unique subjects; the idea is that each subject only has one end-point + ## and some transitions + ## NOTE: breaks should be a list of breaks or NULL; if it is NULL, + ## it is NOT checked whether observations were cut short by the breaks used. + ## observations cut short are not any kind of events. + ## OUTPUT: an integer vector coding events as follows: + ## 0: no event within breaks (row cut short by breaks or subject + ## has multiple rows, of which this is not an event) + ## 1: transition within breaks + ## 2: original end-point within breaks and no transition occured (i.e. censoring) + if (!is.data.table(x)) stop("x must be a data.table; if you see this, send the package maintainer an email") + # checkLexisData(x) + if (!inherits(x, "Lexis")) stop("data not a Lexis object") + if (!is.null(breaks)) { + checkBreaksList(x, breaks) + breaks[unlist(lapply(breaks, length)) == 0L] <- NULL + } + + ## R CMD CHECK appeasement + lex.Cst <- lex.Xst <- NULL + + tmp <- list() + oldKey <- key(x) + if (length(oldKey) == 0L) { + tmp$order <- makeTempVarName(x, pre = "order_") + on.exit(if (tmp$order %in% names(x)) setorderv(x, tmp$order), add = TRUE) + on.exit(setcolsnull(x, tmp$order, soft = TRUE), add = TRUE) + set(x, j = tmp$order, value = 1:nrow(x)) + } else on.exit(setkeyv(x, oldKey), add = TRUE) + + + setkeyv(x, c(by, names(breaks)[1L])) + setkeyv(x, by) + ## rows that actually can be events: transitions and last rows by subject + whTr <- x[, lex.Cst != lex.Xst] + whLa <- !duplicated(x, fromLast = TRUE, by=key(x)) + whEv <- whTr | whLa + + if (!is.null(breaks)) { + + splitScales <- names(breaks) + if (any(!splitScales %in% names(x))) stop("Following time scales missing from data that data was split by: ", paste0("'", setdiff(splitScales, names(x)), "'", collapse = ", ")) + + brmax <- lapply(breaks, max) + brmin <- lapply(breaks, min) + + ## detect rows residing within breaks window + for (sc in splitScales) { + z <- (x$lex.dur + x[[sc]])[whEv] + tol_sc <- if (is.double(z)) tol else 0L + + ## NOTE: if max of orig values within breaks window, then all may be events + if (!(max(z) + tol_sc < brmax[[sc]])) whEv[whEv] <- z < brmax[[sc]] - tol_sc + if (!(min(z) - tol_sc > brmin[[sc]])) whEv[whEv] <- z > brmin[[sc]] + tol_sc + + } + ## whEv now indicates rows that may be events AND which reside within breaks window. + } + + ## censored events are not transitions, but must reside within breaks window. + whCe <- whLa & !whTr & whEv + + ## need to add event indicator to data since it has been reordered, + ## reorder back old order, and return the event indicator. + tmp$ind <- makeTempVarName(x, pre = "event_indicator_") + on.exit(setcolsnull(x, delete = tmp$ind, soft = TRUE), add = TRUE) + evInd <- as.integer(whEv) + evInd <- ifelse(whCe, 2L, evInd) + set(x, j = tmp$ind, value = evInd) + + if (length(oldKey) == 0L) { + setkeyv(x, NULL) + setorderv(x, tmp$order) + set(x, j = tmp$order, value = NULL) + } else setkeyv(x, oldKey) + + + evInd <- x[[tmp$ind]] + set(x, j = tmp$ind, value = NULL) + on.exit(expr = {}, add = FALSE) ## removes on.exit expressions from earlier + + + if (!identical(oldKey, key(x))) stop("keys do not match at function end; send an email to package maintainer if you see this") + + evInd +} + + + + + + + + + diff --git a/R/survival_utility_functions.R b/R/survival_utility_functions.R index c1ad901..d10deed 100644 --- a/R/survival_utility_functions.R +++ b/R/survival_utility_functions.R @@ -1,638 +1,638 @@ -globalVariables(c("tab", "SE.A", "pp.table")) - -comp.st.surv <- function(surv.var = "p", surv.expr = "1-d/(n.eff-n.cens/2)", - SE.expr = "sqrt(p*(1-p)/(n.eff))", cumu = TRUE) { - - function(surv.table = tab, surv.by.vars = NULL) { - - tabname <- deparse(substitute(surv.table)) - # tabname <- "surv.table" - by.expr <- NULL - if (!is.null(surv.by.vars)) { - surv.by.vars <- paste0("'", surv.by.vars, "'", collapse = " ,") - by.expr <- paste0(", by = c( ", surv.by.vars, " )") - } - - surv.expr <- paste0(surv.var, " := ", surv.expr) - tab.surv.expr <- paste0(tabname, "[, ", surv.expr, by.expr, "]") - # surv.expr <- parse(text=surv.expr) - # surv.var <- parse(text=surv.var) - SE.var <- paste0("SE.", surv.var) - SE.expr <- paste0(SE.var, " := ", SE.expr) - tab.SE.expr <- paste0(tabname, "[, ", SE.expr, ", ", by.expr, "]") - # SE.expr <- parse(text=SE.expr) - # SE.var <- parse(text=SE.var) - cumuexpr <- paste0(tabname, "[, ", surv.var, " := cumprod(", surv.var, ")", by.expr, "]" ) - - - ## parent.frame(2): two steps upstream in environments - pe <- function(obj, ..., env=parent.frame(2)) { - eval(parse(text=obj), ..., envir=env) - } - - pe(tab.surv.expr) - - - if (cumu) pe(cumuexpr) - - ## standard error - pe(tab.SE.expr) - - ## zero survival leads to zero SE - minexpr <- paste0(tabname, "[",surv.var, "== 0, ", SE.var, " := 0]") - pe(minexpr) - } -} - - -comp.st.surv.obs.lif <- comp.st.surv(surv.var= "surv.obs", - surv.expr= "1-d/n.eff", - SE.expr= "surv.obs*sqrt( cumsum( d/(n.eff*(n.eff-d)) ) )", ## old : sqrt(p*(1-p)/(n.eff))" - cumu = TRUE) - -comp.st.surv.obs.haz <- comp.st.surv(surv.var= "surv.obs", - surv.expr= "exp(-delta*d/pyrs)", - SE.expr= "surv.obs*sqrt( cumsum( delta^2*d/pyrs^2 ) )", ## old: sqrt(p*(1-p)/(n.eff)) - cumu = TRUE) - - -comp.st.r.e2.haz <- comp.st.surv(surv.var = "r.e2", - surv.expr = "exp(-delta*(d-d.exp)/pyrs)", - SE.expr = "SE.surv.obs/surv.exp", - cumu=TRUE) - -comp.st.r.e2.lif <- comp.st.surv(surv.var = "r.e2", - surv.expr = "1-(d-d.exp)/(n.eff)", - SE.expr = "SE.surv.obs/surv.exp", - cumu=TRUE) - -comp.st.r.pp.haz <- comp.st.surv(surv.var = "r.pp", - surv.expr= "exp(-delta*(d.pp - d.exp.pp)/pyrs.pp)", - SE.expr = "r.pp*sqrt( cumsum( delta^2*( d.pp.2)/ pyrs.pp^2 ) )", - cumu=TRUE) - -comp.st.r.pp.lif <- comp.st.surv(surv.var = "r.pp", - surv.expr = "1-(d.pp-d.exp.pp)/(n.eff.pp)", - SE.expr = "r.pp*sqrt( cumsum( d.pp.2/(n.eff^2) ) )", - cumu=TRUE) - - -## this function will calculate confidence intervals -## for obs & rel & net survivals -#' @import stats -comp.st.conf.ints <- function(tab = pp.table, al=0.05, surv="r.pp", transform ="log-log") { - zlo <- as.character(qnorm(al/2)) - zhi <- as.character(qnorm(1-al/2)) - SE.surv <- paste0("SE.",surv) - surv.hi <- paste0(surv, ".hi") - surv.lo <- paste0(surv, ".lo") - - tmp_se <- makeTempVarName(names = names(tab), pre = "delta_method_SE_", length = 15L) - - pe <- function(...) { - eval(parse(text=paste0(...)), envir=tab) - } - - if (transform =="plain") { - ## assume S(t)~N(mu, sigma) - - ex <- paste0(surv, " ", zlo, "*", SE.surv) - tab[, (surv.lo) := pe(ex)] - ex <- paste0(surv, " +", zhi, "*", SE.surv) - tab[, (surv.hi) := pe(ex)] - - } else if (transform =="log-log") { - ## assume log(H(t))~N(mu, sigma) - ## -> delta method SE: sqrt( SE^2 * (1/(log(SURV)*SURV))^2 ) - - ex <- paste0(SE.surv,"/(abs(log(",surv,"))*",surv,")") - tab[, (tmp_se) := pe(ex)] - - ex <- paste0(surv, "^exp(", zhi, "*", tmp_se,")") - tab[, (surv.lo) := pe(ex)] - ex <- paste0(surv, "^exp(", zlo, "*", tmp_se,")") - tab[, (surv.hi) := pe(ex)] - - } else if (transform =="log") { - ## assume log(S(t))~N(mu, sigma) - ## -> delta method SE: SE/SURV - - ex <- paste0(SE.surv,"/",surv) - tab[, (tmp_se) := pe(ex)] - - ex <- paste0(surv, "*exp(", zlo, "*", tmp_se,")") - tab[, (surv.lo) := pe(ex)] - ex <- paste0(surv, "*exp(", zhi, "*", tmp_se,")") - tab[, (surv.hi) := pe(ex)] - } - if (tmp_se %in% names(tab)) tab[, (tmp_se) := NULL] - - ## zero SE means zero uncertainty means lo=hi=estimate - tab[tab[[SE.surv]] == 0, c(surv.lo, surv.hi) := .SD, .SDcols = surv] - - tab[] -} - - - -# x <- Lexis(data=sire[1,], entry = list(fot=0, per=get.yrs(dg_date), age=dg_age), -# exit=list(per=get.yrs(ex_date)), exit.status=status) -# x <- splitMulti(x, breaks = list(fot=seq(0, 5, by = 1/12), per=1994:2013, age = 0:150)) -# x[, surv.int := cut(fot, seq(0, 5, 1/12) - .Machine$double.eps^0.5, labels = FALSE)] -# x <- cutLowMerge(x, popmort, by.x = c("sex","per", "age"), -# by.y = c("sex", "year", "agegroup"), -# mid.scales = c("per", "age"), all.x = TRUE, all.y = FALSE) -# comp_pp_weights(x, surv.scale = "fot", breaks = seq(0, 5, 1/12), haz = "haz", style = "delta") - -comp_pp_weights <- function(lex, surv.scale = "fot", breaks = NULL, haz = "haz", style = "delta", verbose = FALSE) { - ppTime <- proc.time() - ## input: a split Lexis object (data.table) and the time scale to compute - ## pp weights over; lex must also contain 'haz', the population - ## (expected) hazard level for each row - TF <- environment() - - lex.dur <- lex.id <- pp <- NULL ## APPEASE R CMD CHECK - - style <- match.arg(style, c("delta", "actual")) - if (!is.data.table(lex)) stop("lex must be a data.table") - - all_names_present(lex, c(haz, surv.scale, "lex.id", "lex.dur")) - - if ("pp" %in% names(lex)) stop("Variable named 'pp' existed in data when attempting to compute pohar-perme weights; 'pp' is reserved so you should delete or rename that variable") - - - if (!identical(key(lex), c("lex.id", surv.scale))) stop("lex must be a data.table keyed by lex.id and the survival time scale") - - - breaks <- sort(unique(breaks)) - .Machine$double.eps^0.5 - - ## need a bunch of temporary variable names to compute pp weights - ## inside the data set without overwriting anything existing. - ## this will take about 0.1 secs. - tmpSI <- makeTempVarName(data = lex, pre = "surv.int_") - tmpSIstart <- makeTempVarName(data = lex, pre = "surv.int.start_") - tmpSIstop <- makeTempVarName(data = lex, pre = "surv.int.stop_") - tmpSIlength <- makeTempVarName(data = lex, pre = "surv.int.length_") - on.exit(setcolsnull(lex, delete = c(tmpSI, tmpSIstart, tmpSIstop, tmpSIlength, tmpPS, tmpPCS, tmpPCSM)), add = TRUE) - - set(lex, j = tmpSI, value = cut(lex[[surv.scale]], breaks, labels = FALSE)) - set(lex, j = tmpSIstop, value = breaks[-1][lex[[tmpSI]]]) - set(lex, j = tmpSIstart, value = breaks[-length(breaks)][lex[[tmpSI]]]) - set(lex, j = tmpSIlength, value = lex[[tmpSIstop]] - lex[[tmpSIstart]]) - - tmpPS <- makeTempVarName(data = lex, pre = "pop.surv_") - tmpPCS <- makeTempVarName(data = lex, pre = "pop.cumsurv_") - tmpPCSM <- makeTempVarName(data = lex, pre = "pop.cumsurv.mid_") - - ## conditional survs - if (verbose) condSurvTime <- proc.time() - - set(lex, j = tmpPS, value = exp(-lex[[haz]]*lex$lex.dur)) - ## till end of each interval... - lex[, c(tmpPCS) := list(exp(-cumsum(.SD[[1L]]*lex.dur))), - by = lex.id, .SDcols = c(haz, "lex.dur")] - ## till start of each interval - set(lex, j = tmpPCS, value = lex[[tmpPCS]] / lex[[tmpPS]]) - if (verbose) cat("Time taken by computing expected survivals up to start of each interval for each lex.id: ", timetaken(condSurvTime), "\n") - - ## pohar-perme weighting by expected cumulative survival. approximation: - ## cumulative survival up to either middle of remaining surv.int (not individual-specific) - ## or up to middle of subject's follow-up in each row (individual-specific) - ## difference: e.g. 2 rows within a surv.int have either the same or different pp-weights - if (style == "actual") { - set(lex, j = tmpPCSM, value = lex[[tmpPCS]] * (lex[[tmpPS]])^0.5) - } - if (style == "delta") { - if (verbose) deltaTime <- proc.time() - - setkeyv(lex, c("lex.id", tmpSI)) - ## expected survival up to middle of remaining time in surv.int - ## cumulation starting from first record for subject in each surv.int - - ## some records are the only one for a lex.id in a surv.int; these are easy - first_in_surv.int <- !duplicated(lex, fromLast = FALSE, by = key(lex)) - last_in_surv.int <- !duplicated(lex, fromLast = TRUE, by = key(lex)) - only_in_surv.int <- first_in_surv.int & last_in_surv.int - - lex[only_in_surv.int, c(tmpPCSM) := .SD[[1L]] * exp(.SD[[2L]] * (.SD[[3L]] - .SD[[4L]])/2L), - .SDcols = c(tmpPCS, haz, tmpSIstop, surv.scale)] - - ## more complicated with many records in a surv.int per lex.id - if (any(!only_in_surv.int)) { - - tmpSImid <- makeTempVarName(lex, pre = "surv.int.mid_") - dist <- makeTempVarName(lex, pre = "dist_") - on.exit(setcolsnull(lex, delete = c(dist, tmpSImid)), add = TRUE) - - ## middle point of survival interval - set(lex, j = tmpSImid, value = (lex[[tmpSIstop]] - lex[[tmpSIstart]])/2L) - - ## distance from remaining surv.int mid-point starting from start of - ## record; or at most lex.dur; for integration - set(lex, j = dist, value = pmin(lex$lex.dur, lex[[tmpSImid]])) - ## some records after mid-point can have negative fot.dist at this point - set(lex, j = dist, value = pmax(lex[[dist]], 0)) - - ## some lex.id are censored / die before mid of surv.int; last record - ## must reach its fot.dist at least up to the mid (or be zero due to above) - lex[last_in_surv.int, c(dist) := pmax((.SD[[1L]] - .SD[[2L]])/2L, 0), - .SDcols = c(tmpSIstop, surv.scale)] - - byTime <- proc.time() - - ## from start of first in surv.int till mid point - ## step by step for efficiency... - lex[!only_in_surv.int, c(tmpPCSM) := .SD[[1L]] * .SD[[2L]], .SDcols = c(haz, dist)] - lex[!only_in_surv.int, c(tmpPCSM) := lapply(.SD, sum), - .SDcols = c(tmpPCSM), by = c("lex.id", tmpSI)] - lex[!only_in_surv.int, c(tmpPCS) := .SD[1L], - by = c("lex.id", tmpSI), .SDcols = c(tmpPCS)] - lex[!only_in_surv.int, c(tmpPCSM) := .SD[[1L]] * exp(-.SD[[2L]]), - .SDcols = c(tmpPCS, tmpPCSM)] - - ## todo: alternate faster method for integration! - setcolsnull(lex, delete = c(dist)) - if (verbose) cat("Time taken by extra computation due to style 'delta': ", timetaken(deltaTime), "\n") - } - - rm(first_in_surv.int, last_in_surv.int, only_in_surv.int) - if (verbose) cat("Time taken by computation of Pohar-Perme weights: ", timetaken(ppTime), "\n") - } - - setkeyv(lex, c("lex.id", surv.scale)) - - lex[, pp := 1/.SD, .SDcols = tmpPCSM] - - invisible(lex[]) -} - -comp_pp_weighted_figures <- function(lex, haz = "haz", pp = "pp", event.ind = NULL, by = "lex.id") { - ## PURPOSE: given a split Lexis object with a column of pre-computed - ## pohar-perme weights, computes pp-weighted: - ## * person-time (lex.dur*pp) - ## * at-risk indicator - ## * event counts - ## * event counts multiplied with pp^2 - ## * expected event counts - ## events are transitions and end points as detected by detectEvents, - ## and include censorings. - ## OUTPUT: a DT of pp-weighted things. - - ## appease R CMD CHECK - lex.Cst <- lex.Xst <- NULL - - checkLexisData(lex, check.breaks = TRUE) - if (!is.data.table(lex)) stop("lex must be a data.table") - - all_names_present(lex, c(pp, haz, event.ind, by)) - - ## NOTE: we want to avoid conflicts with possible variable names in lex - ## (e.g. an existing column named ptime.pp might conflict with ptime.pp char vec) - e <- environment() - - if (is.null(event.ind)) { - event.ind <- makeTempVarName(lex, pre = "event_indicator_") - on.exit(setcolsnull(lex, delete = event.ind, soft = TRUE), add = TRUE) - set(lex, j = event.ind, value = detectEvents(lex, breaks = attr(lex, "breaks"), by = by)) - all_names_present(lex, event.ind) - } - ## data.table is probably faster in this than simply using vectors - idt <- data.table(1:2) - names(idt) <- event.ind - haveEvents <- sort(lex[idt, on = event.ind, which = TRUE]) - evtab <- lex[haveEvents, .(obs = .N), by = list(lex.Cst, lex.Xst)] - set(evtab, j = "obs", value = NULL) - - events <- paste0("from", evtab$lex.Cst, "to", evtab$lex.Xst) - ppVars <- c(paste0(events, ".pp"), "ptime.pp", "d.exp.pp") - - -# ## build table to join with lex to limit to rows with events ----------------- -# ## non-event-rows have zero valued pp-weighted event counts, naturally -# set(evtab, j = event.ind, value = NULL) -# evtab <- rbindlist(list(evtab, evtab)) -# set(evtab, j = event.ind, value = rep(1:2, each = nrow(evtab)/2L)) -# setkeyv(evtab, c("lex.Cst", "lex.Xst", event.ind)) - - ## pp-weighted events -------------------------------------------------------- - ## NOTE: a pp-weighted event is simply the pp weight where the event occurs - ## and zero otherwise (0L/1L times pp-weight) - - evN <- length(events) - evdt <- data.table(rn = rep(1:nrow(lex), times = evN)) - set(evdt, j = "eventType", value = factor(rep(1:evN, each = nrow(lex)), levels = 1:evN, labels = events)) - set(evdt, j = "pp", value = lex[[pp]]) - - - ## need to still determine which rows are not their eventType's events ------- - - rnVar <- makeTempVarName(lex, pre = "rowNum_") - on.exit(setcolsnull(lex, delete = rnVar, soft = TRUE), add = TRUE) - set(lex, j = rnVar, value = 1:nrow(lex)) - - ## row numbers in lex by event type - rowNums <- lex[haveEvents][evtab, .(rowNum = .SD[[e$rnVar]]), by = .EACHI, on = c("lex.Cst", "lex.Xst"), .SDcols = rnVar] - ## multiply to accommodate expanded evdt data - - rowNum <- NULL ## appease R CMD CHECK - rowNums[, rowNum := rowNum+(.GRP-1L)*nrow(lex), by = list(lex.Cst, lex.Xst)] - noEvents <- setdiff(1:nrow(evdt), rowNums$rowNum) - - set(evdt, i = noEvents, j = "pp", value = 0) - - evdt <- dcast.data.table(evdt, rn ~ eventType, value.var = "pp") - - setorderv(evdt, "rn") - set(evdt, j = "rn", value = NULL) - - ## ptime.pp & d.exp.pp ------------------------------------------------------- - set(evdt, j = "ptime.pp", value = lex$lex.dur * lex[[pp]]) - set(evdt, j = "d.exp.pp", value = lex$lex.dur * lex[[haz]] * lex[[pp]]) - - setcolorder(evdt, c("ptime.pp", "d.exp.pp", sort(events))) - setnames(evdt, events, paste0(events, ".pp")) - - ## pp-weighted at-risk indicators -------------------------------------------- - ## these will be n.pp at the aggregate level. - set(evdt, j = "at.risk.pp", value = lex[[pp]]*1L) - - ## events multiplied with pp-weight again ------------------------------------ - ## (i.e. event times pp squared) - - set(evdt, j = paste0(events, ".pp.2"), - value = evdt[, .SD, .SDcols = paste0(events, ".pp")]*lex[[pp]]) - - return(evdt[]) - -} - - - -test_empty_surv_ints <- function(x, by = NULL, sum.over = NULL, test.var = "pyrs", show.by = NULL) { - - x <- copy(x) - oc <- class(x) - setDT(x) - setattr(x, "class", oc) - - surv.int <- NULL ## APPEASE R CMD CHECK - - all_names_present(x, by, msg = "Missing variable(s) %%VARS%% from data when inspected for empty survival intervals. If you see this, send a message to the package maintainer.") - all_names_present(x, sum.over, msg = "Missing variable(s) %%VARS%% from data when inspected for empty survival intervals. If you see this, send a message to the package maintainer.") - all_names_present(x, test.var, msg = "Missing variable(s) %%VARS%% from data when inspected for empty survival intervals. If you see this, send a message to the package maintainer.") - - if (any(!sum.over %in% by)) stop("sum.over must be a subset of by.") - if (length(show.by) == 0L) show.by <- by - if (length(by) != length(show.by)) { - stop("Internal error: length(sum.over) != length(show.sum.over). ", - "If you see this, complain to the package maintainer.") - } - - wh_sum.to <- !by %in% sum.over - sum.to <- by[wh_sum.to] - show.sum.to <- show.by[wh_sum.to] - - if (length(sum.to) == 0L) sum.to <- show.sum.to <- NULL - - tmpTV <- makeTempVarName(x, pre = "testValues_") - tmpDiff <- makeTempVarName(x, pre = "diff_") - on.exit(setcolsnull(x, delete = c(tmpTV, tmpDiff))) - - ## first check empty surv.ints are all consecutive... - ## consecutiveness: e.g. out of 10 surv ints, 6-10 are empty. - ## non-consecutiveness: e.g. out of 10 surv ints, 6-8 are empty. - ## (then 9-10 will have NA estimates as well.) - setkeyv(x, c(by, "surv.int")) - - ct <- x[, lapply(.SD, sum), keyby = c(sum.to, "surv.int"), .SDcols = test.var] - setnames(ct, length(ct), tmpTV) - ct <- ct[ct[[tmpTV]] > 0L, diff(surv.int), keyby = eval(sum.to)] - setnames(ct, length(ct), tmpDiff) - ct <- ct[ct[[tmpDiff]] > 1L] - ## THE IDEA: if the difference in the number of the survival interval - ## is > 1, it means there is at least one row between two non-empty - ## intervals, i.e. non-consecutively. - - ## we keep non-consecutively bad surv.int stratas in entirety for inspection - if (nrow(ct) > 0L) { - msg <- paste0("The total person-time was zero in some survival intervals") - - if (!is.null(sum.to)) { - msg <- paste0(msg, ", when summed to the variable(s) ", - paste0("'", show.sum.to, "'", collapse = ", "), - " (i.e. over all other variables, if any)") - } else { - msg <- paste0(msg, " summed to the margins (over any stratifying ", - "/ adjusting variables)") - } - - msg <- paste0(msg, " _non-consecutively_, i.e. some intervals after an ", - "empty interval had person-time in them. ", - "Keeping all survival ", - "intervals with some estimates as NA for inspection.") - message(msg) - } else { - ## we leave out intervals that are empty consecutively (e.g. all from 5-10) - x[, c(tmpTV) := lapply(.SD, sum), by=c(sum.to, "surv.int"), .SDcols = test.var] - x <- x[x[[tmpTV]] > 0L] - setcolsnull(x, tmpTV) - } - - x -} - - - - -comp_e1 <- function(x, breaks, pophaz, survScale, by = NULL, id = "lex.id", immortal = TRUE, verbose = FALSE) { - ## INTENTION: given a Lexis data set x, - ## computes Ederer I expected survival curves till end of follow-up - ## by 'by' unless individual = TRUE. - TF <- environment() - - lex.dur <- haz <- surv.exp <- NULL # R CMD CHECK appeasement - ## check --------------------------------------------------------------------- - checkLexisData(x) - checkBreaksList(x, breaks) - ph <- data.table(pophaz) - tmpHaz <- makeTempVarName(x, pre = "pop_haz_") - setnames(ph, "haz", tmpHaz) - checkPophaz(x, ph, haz.name = tmpHaz) - - byErr <- paste0("Internal error (probably): work data did not have ", - "variable(s) %%VARS%%. If your supplied data has them, ", - "complain to the package maintainer.") - all_names_present(x, c(by, id, survScale), msg = byErr) - - if (length(id) != 1L) { - stop("Argument id must be of length 1.") - } - ## split --------------------------------------------------------------------- - pt <- proc.time() - oldBreaks <- attr(x, "breaks") - allScales <- attr(x, "time.scales") - if (!survScale %in% allScales) { - stop("survScale '", survScale, "' not a time scale in the Lexis object. ", - "(Possibly internal error - ensure you have that time scale in data. ", - "If not, complain to the package maintainer.") - } - keepVars <- c(allScales, "lex.dur", "lex.id", "lex.Cst", - "lex.Xst", by, id, setdiff(names(ph), tmpHaz)) - keepVars <- unique(keepVars) - y <- subsetDTorDF(x, select = keepVars) - y <- setDT(copy(y)) - forceLexisDT(y, breaks = oldBreaks, allScales = allScales, key = TRUE) - - ## won't use statuses for anything - y[, c("lex.Cst", "lex.Xst") := NULL] - y[, c("lex.Cst", "lex.Xst") := 0L] - - if (immortal) { - ## set lex.dur to infinite. this assumes that the subjects never leave - ## follow-up (which Ederer I assumes) - storage.mode(y$lex.dur) <- "double" - id_last <- !duplicated(y, by = "lex.id", fromLast = TRUE) - y[TF$id_last, lex.dur := Inf] - } - - y <- intelliCrop(y, breaks = breaks, allScales = allScales) - y <- intelliDrop(y, breaks = breaks) - setDT(y) - forceLexisDT(y, breaks = oldBreaks, allScales = allScales, key = TRUE) - setkeyv(y, c(id, survScale)) - - y <- splitMulti(y, breaks = breaks, drop = FALSE, merge = TRUE) - - if (verbose) cat("Time taken by splitting: ", timetaken(pt), ".\n", sep = "") - - ## merge pop haz ------------------------------------------------------------- - pt <- proc.time() - mergeVars <- intersect(names(y), names(ph)) - mergeScales <- intersect(allScales, mergeVars) - if (length(mergeScales) == 0L) mergeScales <- NULL - mergeCats <- setdiff(mergeVars, mergeScales) - if (length(mergeCats) == 0L) mergeCats <- NULL - - y <- cutLowMerge(y, ph, by = mergeVars, - mid.scales = mergeScales, old.nums = TRUE, - all.x = TRUE, all.y = FALSE) - setDT(y) - if (verbose) cat("Time taken by merging pophaz: ", timetaken(pt), ".\n", sep = "") - - ## ederer I computation ------------------------------------------------------ - ## prep temp surv.int var - tmpSI <- makeTempVarName(x, pre = "surv_int_") - setkeyv(y, c(by, id, allScales[1L])) - y[, c(tmpSI) := cut(y[[survScale]], breaks[[survScale]], - right=FALSE,labels=FALSE)] - setkeyv(y, c(by, tmpSI)) - - ## EDERER I: integral of the weighted average expected hazard, - ## where the weights are the subject-specific expected survival - ## probabilities. - ## 1) compute integral of hazard over an interval t_i by id - ## (NOT cumulative hazard from zero till end of interval t_i) - ## This sums over multiple rows a subject may have within one - ## survival interval due to splitting by multiple time scales. - pt <- proc.time() - set(y, j = tmpHaz, value = y[[tmpHaz]]*y$lex.dur) - y <- y[, lapply(.SD, sum), keyby = eval(unique(c(by, id, tmpSI))), - .SDcols = c(tmpHaz)] - setnames(y, ncol(y), tmpHaz) - - ## reverse temp names - need to be able to refer to haz without temp var - ## to enable correct cumsum() below - avoid <- c(names(y), tmpHaz, "surv.exp") - tmpBy <- makeTempVarName(names = avoid, pre = by) - tmpID <- makeTempVarName(names = c(avoid, tmpBy), pre = id) - if (length(by)) { - setnames(y, by, tmpBy) - if (id %in% by) tmpID <- id <- tmpBy[by == id] - } else { - tmpBy <- by <- NULL - } - setnames(y, id, tmpID) - setnames(y, tmpHaz, "haz") - if (verbose) cat("Time taken by 1): ", timetaken(pt), ".\n", sep = "") - - ## 2) expected cum.haz. over intervals t_1 -> t_i by id... - ## (no cumulative exp.surv yet) - pt <- proc.time() - y[, surv.exp := cumsum(haz), by = eval(tmpID)] - - if (verbose) cat("Time taken by 2): ", timetaken(pt), ".\n", sep = "") - ## 3) cumulative surv.exp till end of interval t_i by id... - pt <- proc.time() - y[, surv.exp := exp(-surv.exp)] - - if (verbose) cat("Time taken by 3): ", timetaken(pt), ".\n", sep = "") - - ## 4) The Ederer I expected (marginal) survivals for intervals t_i - pt <- proc.time() - y <- y[, .(surv.exp = mean(surv.exp)), by = eval(c(tmpBy, tmpSI))] - if (verbose) cat("Time taken by 4): ", timetaken(pt), ".\n", sep = "") - - if ("surv.exp" %in% by) { - by[by == "surv.exp"] <- makeTempVarName(y, pre = "surv.exp") - } - if (length(by)) setnames(y, tmpBy, by) - - setcolorder(y, c(by, tmpSI, "surv.exp")) - setkeyv(y, c(by, tmpSI)) - setnames(y, tmpSI, survScale) - br <- breaks[[survScale]] - br <- br[-1] - y[, c(survScale) := br[y[[survScale]]]] - - - y[] -} - - - - -detectSurvivalTimeScale <- function(lex, values) { - - checkLexisData(lex) - - allScales <- attr(lex, "time.scales") - - allScVals <- lapply(allScales, function(ch) lex[[ch]]) - names(allScVals) <- allScales - - whSurvScale <- lapply(allScVals, function(col) { - identical(col, values) - }) - whSurvScale <- unlist(whSurvScale) - - if (sum(whSurvScale) == 0L) { - whSurvScale <- lapply(allScVals, function(col) { - isTRUE({ - all.equal(col, values, scale = 1L, - check.attributes = FALSE, - tolerance = .Machine$double.eps ^ 0.5) - }) - }) - whSurvScale <- unlist(whSurvScale) - } - if (sum(whSurvScale) == 0L) { - - dt <- as.data.table(allScVals) - dt <- cbind(dt, data.table(testValues = values)) - on.exit(print(dt)) - - stop("Could not determine which time scale was used. The formula MUST ", - "include the time scale used within a Surv() call (or a Surv object),", - " e.g. Surv(FUT, lex.Xst) ~ sex. Note that the 'time' argument is ", - "effectively (and exceptionally) used here to denote the times at ", - "the beginning of follow-up to identify the time scale existing in ", - "the supplied data to use. If you are sure you are mentioning a ", - "time scale in the formula in this manner, complain to the ", - "package maintainer. The table printed below contains the time ", - "scales tested against and the values that were supplied as the last ", - "column.") - } - survScale <- allScales[whSurvScale] - survScale - -} +globalVariables(c("tab", "SE.A", "pp.table")) + +comp.st.surv <- function(surv.var = "p", surv.expr = "1-d/(n.eff-n.cens/2)", + SE.expr = "sqrt(p*(1-p)/(n.eff))", cumu = TRUE) { + + function(surv.table = tab, surv.by.vars = NULL) { + + tabname <- deparse(substitute(surv.table)) + # tabname <- "surv.table" + by.expr <- NULL + if (!is.null(surv.by.vars)) { + surv.by.vars <- paste0("'", surv.by.vars, "'", collapse = " ,") + by.expr <- paste0(", by = c( ", surv.by.vars, " )") + } + + surv.expr <- paste0(surv.var, " := ", surv.expr) + tab.surv.expr <- paste0(tabname, "[, ", surv.expr, by.expr, "]") + # surv.expr <- parse(text=surv.expr) + # surv.var <- parse(text=surv.var) + SE.var <- paste0("SE.", surv.var) + SE.expr <- paste0(SE.var, " := ", SE.expr) + tab.SE.expr <- paste0(tabname, "[, ", SE.expr, ", ", by.expr, "]") + # SE.expr <- parse(text=SE.expr) + # SE.var <- parse(text=SE.var) + cumuexpr <- paste0(tabname, "[, ", surv.var, " := cumprod(", surv.var, ")", by.expr, "]" ) + + + ## parent.frame(2): two steps upstream in environments + pe <- function(obj, ..., env=parent.frame(2)) { + eval(parse(text=obj), ..., envir=env) + } + + pe(tab.surv.expr) + + + if (cumu) pe(cumuexpr) + + ## standard error + pe(tab.SE.expr) + + ## zero survival leads to zero SE + minexpr <- paste0(tabname, "[",surv.var, "== 0, ", SE.var, " := 0]") + pe(minexpr) + } +} + + +comp.st.surv.obs.lif <- comp.st.surv(surv.var= "surv.obs", + surv.expr= "1-d/n.eff", + SE.expr= "surv.obs*sqrt( cumsum( d/(n.eff*(n.eff-d)) ) )", ## old : sqrt(p*(1-p)/(n.eff))" + cumu = TRUE) + +comp.st.surv.obs.haz <- comp.st.surv(surv.var= "surv.obs", + surv.expr= "exp(-delta*d/pyrs)", + SE.expr= "surv.obs*sqrt( cumsum( delta^2*d/pyrs^2 ) )", ## old: sqrt(p*(1-p)/(n.eff)) + cumu = TRUE) + + +comp.st.r.e2.haz <- comp.st.surv(surv.var = "r.e2", + surv.expr = "exp(-delta*(d-d.exp)/pyrs)", + SE.expr = "SE.surv.obs/surv.exp", + cumu=TRUE) + +comp.st.r.e2.lif <- comp.st.surv(surv.var = "r.e2", + surv.expr = "1-(d-d.exp)/(n.eff)", + SE.expr = "SE.surv.obs/surv.exp", + cumu=TRUE) + +comp.st.r.pp.haz <- comp.st.surv(surv.var = "r.pp", + surv.expr= "exp(-delta*(d.pp - d.exp.pp)/pyrs.pp)", + SE.expr = "r.pp*sqrt( cumsum( delta^2*( d.pp.2)/ pyrs.pp^2 ) )", + cumu=TRUE) + +comp.st.r.pp.lif <- comp.st.surv(surv.var = "r.pp", + surv.expr = "1-(d.pp-d.exp.pp)/(n.eff.pp)", + SE.expr = "r.pp*sqrt( cumsum( d.pp.2/(n.eff^2) ) )", + cumu=TRUE) + + +## this function will calculate confidence intervals +## for obs & rel & net survivals +#' @import stats +comp.st.conf.ints <- function(tab = pp.table, al=0.05, surv="r.pp", transform ="log-log") { + zlo <- as.character(qnorm(al/2)) + zhi <- as.character(qnorm(1-al/2)) + SE.surv <- paste0("SE.",surv) + surv.hi <- paste0(surv, ".hi") + surv.lo <- paste0(surv, ".lo") + + tmp_se <- makeTempVarName(names = names(tab), pre = "delta_method_SE_", length = 15L) + + pe <- function(...) { + eval(parse(text=paste0(...)), envir=tab) + } + + if (transform =="plain") { + ## assume S(t)~N(mu, sigma) + + ex <- paste0(surv, " ", zlo, "*", SE.surv) + tab[, (surv.lo) := pe(ex)] + ex <- paste0(surv, " +", zhi, "*", SE.surv) + tab[, (surv.hi) := pe(ex)] + + } else if (transform =="log-log") { + ## assume log(H(t))~N(mu, sigma) + ## -> delta method SE: sqrt( SE^2 * (1/(log(SURV)*SURV))^2 ) + + ex <- paste0(SE.surv,"/(abs(log(",surv,"))*",surv,")") + tab[, (tmp_se) := pe(ex)] + + ex <- paste0(surv, "^exp(", zhi, "*", tmp_se,")") + tab[, (surv.lo) := pe(ex)] + ex <- paste0(surv, "^exp(", zlo, "*", tmp_se,")") + tab[, (surv.hi) := pe(ex)] + + } else if (transform =="log") { + ## assume log(S(t))~N(mu, sigma) + ## -> delta method SE: SE/SURV + + ex <- paste0(SE.surv,"/",surv) + tab[, (tmp_se) := pe(ex)] + + ex <- paste0(surv, "*exp(", zlo, "*", tmp_se,")") + tab[, (surv.lo) := pe(ex)] + ex <- paste0(surv, "*exp(", zhi, "*", tmp_se,")") + tab[, (surv.hi) := pe(ex)] + } + if (tmp_se %in% names(tab)) tab[, (tmp_se) := NULL] + + ## zero SE means zero uncertainty means lo=hi=estimate + tab[tab[[SE.surv]] == 0, c(surv.lo, surv.hi) := .SD, .SDcols = surv] + + tab[] +} + + + +# x <- Lexis(data=sire[1,], entry = list(fot=0, per=get.yrs(dg_date), age=dg_age), +# exit=list(per=get.yrs(ex_date)), exit.status=status) +# x <- splitMulti(x, breaks = list(fot=seq(0, 5, by = 1/12), per=1994:2013, age = 0:150)) +# x[, surv.int := cut(fot, seq(0, 5, 1/12) - .Machine$double.eps^0.5, labels = FALSE)] +# x <- cutLowMerge(x, popmort, by.x = c("sex","per", "age"), +# by.y = c("sex", "year", "agegroup"), +# mid.scales = c("per", "age"), all.x = TRUE, all.y = FALSE) +# comp_pp_weights(x, surv.scale = "fot", breaks = seq(0, 5, 1/12), haz = "haz", style = "delta") + +comp_pp_weights <- function(lex, surv.scale = "fot", breaks = NULL, haz = "haz", style = "delta", verbose = FALSE) { + ppTime <- proc.time() + ## input: a split Lexis object (data.table) and the time scale to compute + ## pp weights over; lex must also contain 'haz', the population + ## (expected) hazard level for each row + TF <- environment() + + lex.dur <- lex.id <- pp <- NULL ## APPEASE R CMD CHECK + + style <- match.arg(style, c("delta", "actual")) + if (!is.data.table(lex)) stop("lex must be a data.table") + + all_names_present(lex, c(haz, surv.scale, "lex.id", "lex.dur")) + + if ("pp" %in% names(lex)) stop("Variable named 'pp' existed in data when attempting to compute pohar-perme weights; 'pp' is reserved so you should delete or rename that variable") + + + if (!identical(key(lex), c("lex.id", surv.scale))) stop("lex must be a data.table keyed by lex.id and the survival time scale") + + + breaks <- sort(unique(breaks)) - .Machine$double.eps^0.5 + + ## need a bunch of temporary variable names to compute pp weights + ## inside the data set without overwriting anything existing. + ## this will take about 0.1 secs. + tmpSI <- makeTempVarName(data = lex, pre = "surv.int_") + tmpSIstart <- makeTempVarName(data = lex, pre = "surv.int.start_") + tmpSIstop <- makeTempVarName(data = lex, pre = "surv.int.stop_") + tmpSIlength <- makeTempVarName(data = lex, pre = "surv.int.length_") + on.exit(setcolsnull(lex, delete = c(tmpSI, tmpSIstart, tmpSIstop, tmpSIlength, tmpPS, tmpPCS, tmpPCSM)), add = TRUE) + + set(lex, j = tmpSI, value = cut(lex[[surv.scale]], breaks, labels = FALSE)) + set(lex, j = tmpSIstop, value = breaks[-1][lex[[tmpSI]]]) + set(lex, j = tmpSIstart, value = breaks[-length(breaks)][lex[[tmpSI]]]) + set(lex, j = tmpSIlength, value = lex[[tmpSIstop]] - lex[[tmpSIstart]]) + + tmpPS <- makeTempVarName(data = lex, pre = "pop.surv_") + tmpPCS <- makeTempVarName(data = lex, pre = "pop.cumsurv_") + tmpPCSM <- makeTempVarName(data = lex, pre = "pop.cumsurv.mid_") + + ## conditional survs + if (verbose) condSurvTime <- proc.time() + + set(lex, j = tmpPS, value = exp(-lex[[haz]]*lex$lex.dur)) + ## till end of each interval... + lex[, c(tmpPCS) := list(exp(-cumsum(.SD[[1L]]*lex.dur))), + by = lex.id, .SDcols = c(haz, "lex.dur")] + ## till start of each interval + set(lex, j = tmpPCS, value = lex[[tmpPCS]] / lex[[tmpPS]]) + if (verbose) cat("Time taken by computing expected survivals up to start of each interval for each lex.id: ", timetaken(condSurvTime), "\n") + + ## pohar-perme weighting by expected cumulative survival. approximation: + ## cumulative survival up to either middle of remaining surv.int (not individual-specific) + ## or up to middle of subject's follow-up in each row (individual-specific) + ## difference: e.g. 2 rows within a surv.int have either the same or different pp-weights + if (style == "actual") { + set(lex, j = tmpPCSM, value = lex[[tmpPCS]] * (lex[[tmpPS]])^0.5) + } + if (style == "delta") { + if (verbose) deltaTime <- proc.time() + + setkeyv(lex, c("lex.id", tmpSI)) + ## expected survival up to middle of remaining time in surv.int + ## cumulation starting from first record for subject in each surv.int + + ## some records are the only one for a lex.id in a surv.int; these are easy + first_in_surv.int <- !duplicated(lex, fromLast = FALSE, by = key(lex)) + last_in_surv.int <- !duplicated(lex, fromLast = TRUE, by = key(lex)) + only_in_surv.int <- first_in_surv.int & last_in_surv.int + + lex[only_in_surv.int, c(tmpPCSM) := .SD[[1L]] * exp(.SD[[2L]] * (.SD[[3L]] - .SD[[4L]])/2L), + .SDcols = c(tmpPCS, haz, tmpSIstop, surv.scale)] + + ## more complicated with many records in a surv.int per lex.id + if (any(!only_in_surv.int)) { + + tmpSImid <- makeTempVarName(lex, pre = "surv.int.mid_") + dist <- makeTempVarName(lex, pre = "dist_") + on.exit(setcolsnull(lex, delete = c(dist, tmpSImid)), add = TRUE) + + ## middle point of survival interval + set(lex, j = tmpSImid, value = (lex[[tmpSIstop]] - lex[[tmpSIstart]])/2L) + + ## distance from remaining surv.int mid-point starting from start of + ## record; or at most lex.dur; for integration + set(lex, j = dist, value = pmin(lex$lex.dur, lex[[tmpSImid]])) + ## some records after mid-point can have negative fot.dist at this point + set(lex, j = dist, value = pmax(lex[[dist]], 0)) + + ## some lex.id are censored / die before mid of surv.int; last record + ## must reach its fot.dist at least up to the mid (or be zero due to above) + lex[last_in_surv.int, c(dist) := pmax((.SD[[1L]] - .SD[[2L]])/2L, 0), + .SDcols = c(tmpSIstop, surv.scale)] + + byTime <- proc.time() + + ## from start of first in surv.int till mid point + ## step by step for efficiency... + lex[!only_in_surv.int, c(tmpPCSM) := .SD[[1L]] * .SD[[2L]], .SDcols = c(haz, dist)] + lex[!only_in_surv.int, c(tmpPCSM) := lapply(.SD, sum), + .SDcols = c(tmpPCSM), by = c("lex.id", tmpSI)] + lex[!only_in_surv.int, c(tmpPCS) := .SD[1L], + by = c("lex.id", tmpSI), .SDcols = c(tmpPCS)] + lex[!only_in_surv.int, c(tmpPCSM) := .SD[[1L]] * exp(-.SD[[2L]]), + .SDcols = c(tmpPCS, tmpPCSM)] + + ## todo: alternate faster method for integration! + setcolsnull(lex, delete = c(dist)) + if (verbose) cat("Time taken by extra computation due to style 'delta': ", timetaken(deltaTime), "\n") + } + + rm(first_in_surv.int, last_in_surv.int, only_in_surv.int) + if (verbose) cat("Time taken by computation of Pohar-Perme weights: ", timetaken(ppTime), "\n") + } + + setkeyv(lex, c("lex.id", surv.scale)) + + lex[, pp := 1/.SD, .SDcols = tmpPCSM] + + invisible(lex[]) +} + +comp_pp_weighted_figures <- function(lex, haz = "haz", pp = "pp", event.ind = NULL, by = "lex.id") { + ## PURPOSE: given a split Lexis object with a column of pre-computed + ## pohar-perme weights, computes pp-weighted: + ## * person-time (lex.dur*pp) + ## * at-risk indicator + ## * event counts + ## * event counts multiplied with pp^2 + ## * expected event counts + ## events are transitions and end points as detected by detectEvents, + ## and include censorings. + ## OUTPUT: a DT of pp-weighted things. + + ## appease R CMD CHECK + lex.Cst <- lex.Xst <- NULL + + checkLexisData(lex, check.breaks = TRUE) + if (!is.data.table(lex)) stop("lex must be a data.table") + + all_names_present(lex, c(pp, haz, event.ind, by)) + + ## NOTE: we want to avoid conflicts with possible variable names in lex + ## (e.g. an existing column named ptime.pp might conflict with ptime.pp char vec) + e <- environment() + + if (is.null(event.ind)) { + event.ind <- makeTempVarName(lex, pre = "event_indicator_") + on.exit(setcolsnull(lex, delete = event.ind, soft = TRUE), add = TRUE) + set(lex, j = event.ind, value = detectEvents(lex, breaks = attr(lex, "breaks"), by = by)) + all_names_present(lex, event.ind) + } + ## data.table is probably faster in this than simply using vectors + idt <- data.table(1:2) + names(idt) <- event.ind + haveEvents <- sort(lex[idt, on = event.ind, which = TRUE]) + evtab <- lex[haveEvents, .(obs = .N), by = list(lex.Cst, lex.Xst)] + set(evtab, j = "obs", value = NULL) + + events <- paste0("from", evtab$lex.Cst, "to", evtab$lex.Xst) + ppVars <- c(paste0(events, ".pp"), "ptime.pp", "d.exp.pp") + + +# ## build table to join with lex to limit to rows with events ----------------- +# ## non-event-rows have zero valued pp-weighted event counts, naturally +# set(evtab, j = event.ind, value = NULL) +# evtab <- rbindlist(list(evtab, evtab)) +# set(evtab, j = event.ind, value = rep(1:2, each = nrow(evtab)/2L)) +# setkeyv(evtab, c("lex.Cst", "lex.Xst", event.ind)) + + ## pp-weighted events -------------------------------------------------------- + ## NOTE: a pp-weighted event is simply the pp weight where the event occurs + ## and zero otherwise (0L/1L times pp-weight) + + evN <- length(events) + evdt <- data.table(rn = rep(1:nrow(lex), times = evN)) + set(evdt, j = "eventType", value = factor(rep(1:evN, each = nrow(lex)), levels = 1:evN, labels = events)) + set(evdt, j = "pp", value = lex[[pp]]) + + + ## need to still determine which rows are not their eventType's events ------- + + rnVar <- makeTempVarName(lex, pre = "rowNum_") + on.exit(setcolsnull(lex, delete = rnVar, soft = TRUE), add = TRUE) + set(lex, j = rnVar, value = 1:nrow(lex)) + + ## row numbers in lex by event type + rowNums <- lex[haveEvents][evtab, .(rowNum = .SD[[e$rnVar]]), by = .EACHI, on = c("lex.Cst", "lex.Xst"), .SDcols = rnVar] + ## multiply to accommodate expanded evdt data + + rowNum <- NULL ## appease R CMD CHECK + rowNums[, rowNum := rowNum+(.GRP-1L)*nrow(lex), by = list(lex.Cst, lex.Xst)] + noEvents <- setdiff(1:nrow(evdt), rowNums$rowNum) + + set(evdt, i = noEvents, j = "pp", value = 0) + + evdt <- dcast.data.table(evdt, rn ~ eventType, value.var = "pp") + + setorderv(evdt, "rn") + set(evdt, j = "rn", value = NULL) + + ## ptime.pp & d.exp.pp ------------------------------------------------------- + set(evdt, j = "ptime.pp", value = lex$lex.dur * lex[[pp]]) + set(evdt, j = "d.exp.pp", value = lex$lex.dur * lex[[haz]] * lex[[pp]]) + + setcolorder(evdt, c("ptime.pp", "d.exp.pp", sort(events))) + setnames(evdt, events, paste0(events, ".pp")) + + ## pp-weighted at-risk indicators -------------------------------------------- + ## these will be n.pp at the aggregate level. + set(evdt, j = "at.risk.pp", value = lex[[pp]]*1L) + + ## events multiplied with pp-weight again ------------------------------------ + ## (i.e. event times pp squared) + + set(evdt, j = paste0(events, ".pp.2"), + value = evdt[, .SD, .SDcols = paste0(events, ".pp")]*lex[[pp]]) + + return(evdt[]) + +} + + + +test_empty_surv_ints <- function(x, by = NULL, sum.over = NULL, test.var = "pyrs", show.by = NULL) { + + x <- copy(x) + oc <- class(x) + setDT(x) + setattr(x, "class", oc) + + surv.int <- NULL ## APPEASE R CMD CHECK + + all_names_present(x, by, msg = "Missing variable(s) %%VARS%% from data when inspected for empty survival intervals. If you see this, send a message to the package maintainer.") + all_names_present(x, sum.over, msg = "Missing variable(s) %%VARS%% from data when inspected for empty survival intervals. If you see this, send a message to the package maintainer.") + all_names_present(x, test.var, msg = "Missing variable(s) %%VARS%% from data when inspected for empty survival intervals. If you see this, send a message to the package maintainer.") + + if (any(!sum.over %in% by)) stop("sum.over must be a subset of by.") + if (length(show.by) == 0L) show.by <- by + if (length(by) != length(show.by)) { + stop("Internal error: length(sum.over) != length(show.sum.over). ", + "If you see this, complain to the package maintainer.") + } + + wh_sum.to <- !by %in% sum.over + sum.to <- by[wh_sum.to] + show.sum.to <- show.by[wh_sum.to] + + if (length(sum.to) == 0L) sum.to <- show.sum.to <- NULL + + tmpTV <- makeTempVarName(x, pre = "testValues_") + tmpDiff <- makeTempVarName(x, pre = "diff_") + on.exit(setcolsnull(x, delete = c(tmpTV, tmpDiff))) + + ## first check empty surv.ints are all consecutive... + ## consecutiveness: e.g. out of 10 surv ints, 6-10 are empty. + ## non-consecutiveness: e.g. out of 10 surv ints, 6-8 are empty. + ## (then 9-10 will have NA estimates as well.) + setkeyv(x, c(by, "surv.int")) + + ct <- x[, lapply(.SD, sum), keyby = c(sum.to, "surv.int"), .SDcols = test.var] + setnames(ct, length(ct), tmpTV) + ct <- ct[ct[[tmpTV]] > 0L, diff(surv.int), keyby = eval(sum.to)] + setnames(ct, length(ct), tmpDiff) + ct <- ct[ct[[tmpDiff]] > 1L] + ## THE IDEA: if the difference in the number of the survival interval + ## is > 1, it means there is at least one row between two non-empty + ## intervals, i.e. non-consecutively. + + ## we keep non-consecutively bad surv.int stratas in entirety for inspection + if (nrow(ct) > 0L) { + msg <- paste0("The total person-time was zero in some survival intervals") + + if (!is.null(sum.to)) { + msg <- paste0(msg, ", when summed to the variable(s) ", + paste0("'", show.sum.to, "'", collapse = ", "), + " (i.e. over all other variables, if any)") + } else { + msg <- paste0(msg, " summed to the margins (over any stratifying ", + "/ adjusting variables)") + } + + msg <- paste0(msg, " _non-consecutively_, i.e. some intervals after an ", + "empty interval had person-time in them. ", + "Keeping all survival ", + "intervals with some estimates as NA for inspection.") + message(msg) + } else { + ## we leave out intervals that are empty consecutively (e.g. all from 5-10) + x[, c(tmpTV) := lapply(.SD, sum), by=c(sum.to, "surv.int"), .SDcols = test.var] + x <- x[x[[tmpTV]] > 0L] + setcolsnull(x, tmpTV) + } + + x +} + + + + +comp_e1 <- function(x, breaks, pophaz, survScale, by = NULL, id = "lex.id", immortal = TRUE, verbose = FALSE) { + ## INTENTION: given a Lexis data set x, + ## computes Ederer I expected survival curves till end of follow-up + ## by 'by' unless individual = TRUE. + TF <- environment() + + lex.dur <- haz <- surv.exp <- NULL # R CMD CHECK appeasement + ## check --------------------------------------------------------------------- + checkLexisData(x) + checkBreaksList(x, breaks) + ph <- data.table(pophaz) + tmpHaz <- makeTempVarName(x, pre = "pop_haz_") + setnames(ph, "haz", tmpHaz) + checkPophaz(x, ph, haz.name = tmpHaz) + + byErr <- paste0("Internal error (probably): work data did not have ", + "variable(s) %%VARS%%. If your supplied data has them, ", + "complain to the package maintainer.") + all_names_present(x, c(by, id, survScale), msg = byErr) + + if (length(id) != 1L) { + stop("Argument id must be of length 1.") + } + ## split --------------------------------------------------------------------- + pt <- proc.time() + oldBreaks <- attr(x, "breaks") + allScales <- attr(x, "time.scales") + if (!survScale %in% allScales) { + stop("survScale '", survScale, "' not a time scale in the Lexis object. ", + "(Possibly internal error - ensure you have that time scale in data. ", + "If not, complain to the package maintainer.") + } + keepVars <- c(allScales, "lex.dur", "lex.id", "lex.Cst", + "lex.Xst", by, id, setdiff(names(ph), tmpHaz)) + keepVars <- unique(keepVars) + y <- subsetDTorDF(x, select = keepVars) + y <- setDT(copy(y)) + forceLexisDT(y, breaks = oldBreaks, allScales = allScales, key = TRUE) + + ## won't use statuses for anything + y[, c("lex.Cst", "lex.Xst") := NULL] + y[, c("lex.Cst", "lex.Xst") := 0L] + + if (immortal) { + ## set lex.dur to infinite. this assumes that the subjects never leave + ## follow-up (which Ederer I assumes) + storage.mode(y$lex.dur) <- "double" + id_last <- !duplicated(y, by = "lex.id", fromLast = TRUE) + y[TF$id_last, lex.dur := Inf] + } + + y <- intelliCrop(y, breaks = breaks, allScales = allScales) + y <- intelliDrop(y, breaks = breaks) + setDT(y) + forceLexisDT(y, breaks = oldBreaks, allScales = allScales, key = TRUE) + setkeyv(y, c(id, survScale)) + + y <- splitMulti(y, breaks = breaks, drop = FALSE, merge = TRUE) + + if (verbose) cat("Time taken by splitting: ", timetaken(pt), ".\n", sep = "") + + ## merge pop haz ------------------------------------------------------------- + pt <- proc.time() + mergeVars <- intersect(names(y), names(ph)) + mergeScales <- intersect(allScales, mergeVars) + if (length(mergeScales) == 0L) mergeScales <- NULL + mergeCats <- setdiff(mergeVars, mergeScales) + if (length(mergeCats) == 0L) mergeCats <- NULL + + y <- cutLowMerge(y, ph, by = mergeVars, + mid.scales = mergeScales, old.nums = TRUE, + all.x = TRUE, all.y = FALSE) + setDT(y) + if (verbose) cat("Time taken by merging pophaz: ", timetaken(pt), ".\n", sep = "") + + ## ederer I computation ------------------------------------------------------ + ## prep temp surv.int var + tmpSI <- makeTempVarName(x, pre = "surv_int_") + setkeyv(y, c(by, id, allScales[1L])) + y[, c(tmpSI) := cut(y[[survScale]], breaks[[survScale]], + right=FALSE,labels=FALSE)] + setkeyv(y, c(by, tmpSI)) + + ## EDERER I: integral of the weighted average expected hazard, + ## where the weights are the subject-specific expected survival + ## probabilities. + ## 1) compute integral of hazard over an interval t_i by id + ## (NOT cumulative hazard from zero till end of interval t_i) + ## This sums over multiple rows a subject may have within one + ## survival interval due to splitting by multiple time scales. + pt <- proc.time() + set(y, j = tmpHaz, value = y[[tmpHaz]]*y$lex.dur) + y <- y[, lapply(.SD, sum), keyby = eval(unique(c(by, id, tmpSI))), + .SDcols = c(tmpHaz)] + setnames(y, ncol(y), tmpHaz) + + ## reverse temp names - need to be able to refer to haz without temp var + ## to enable correct cumsum() below + avoid <- c(names(y), tmpHaz, "surv.exp") + tmpBy <- makeTempVarName(names = avoid, pre = by) + tmpID <- makeTempVarName(names = c(avoid, tmpBy), pre = id) + if (length(by)) { + setnames(y, by, tmpBy) + if (id %in% by) tmpID <- id <- tmpBy[by == id] + } else { + tmpBy <- by <- NULL + } + setnames(y, id, tmpID) + setnames(y, tmpHaz, "haz") + if (verbose) cat("Time taken by 1): ", timetaken(pt), ".\n", sep = "") + + ## 2) expected cum.haz. over intervals t_1 -> t_i by id... + ## (no cumulative exp.surv yet) + pt <- proc.time() + y[, surv.exp := cumsum(haz), by = eval(tmpID)] + + if (verbose) cat("Time taken by 2): ", timetaken(pt), ".\n", sep = "") + ## 3) cumulative surv.exp till end of interval t_i by id... + pt <- proc.time() + y[, surv.exp := exp(-surv.exp)] + + if (verbose) cat("Time taken by 3): ", timetaken(pt), ".\n", sep = "") + + ## 4) The Ederer I expected (marginal) survivals for intervals t_i + pt <- proc.time() + y <- y[, .(surv.exp = mean(surv.exp)), by = eval(c(tmpBy, tmpSI))] + if (verbose) cat("Time taken by 4): ", timetaken(pt), ".\n", sep = "") + + if ("surv.exp" %in% by) { + by[by == "surv.exp"] <- makeTempVarName(y, pre = "surv.exp") + } + if (length(by)) setnames(y, tmpBy, by) + + setcolorder(y, c(by, tmpSI, "surv.exp")) + setkeyv(y, c(by, tmpSI)) + setnames(y, tmpSI, survScale) + br <- breaks[[survScale]] + br <- br[-1] + y[, c(survScale) := br[y[[survScale]]]] + + + y[] +} + + + + +detectSurvivalTimeScale <- function(lex, values) { + + checkLexisData(lex) + + allScales <- attr(lex, "time.scales") + + allScVals <- lapply(allScales, function(ch) lex[[ch]]) + names(allScVals) <- allScales + + whSurvScale <- lapply(allScVals, function(col) { + identical(col, values) + }) + whSurvScale <- unlist(whSurvScale) + + if (sum(whSurvScale) == 0L) { + whSurvScale <- lapply(allScVals, function(col) { + isTRUE({ + all.equal(col, values, scale = 1L, + check.attributes = FALSE, + tolerance = .Machine$double.eps ^ 0.5) + }) + }) + whSurvScale <- unlist(whSurvScale) + } + if (sum(whSurvScale) == 0L) { + + dt <- as.data.table(allScVals) + dt <- cbind(dt, data.table(testValues = values)) + on.exit(print(dt)) + + stop("Could not determine which time scale was used. The formula MUST ", + "include the time scale used within a Surv() call (or a Surv object),", + " e.g. Surv(FUT, lex.Xst) ~ sex. Note that the 'time' argument is ", + "effectively (and exceptionally) used here to denote the times at ", + "the beginning of follow-up to identify the time scale existing in ", + "the supplied data to use. If you are sure you are mentioning a ", + "time scale in the formula in this manner, complain to the ", + "package maintainer. The table printed below contains the time ", + "scales tested against and the values that were supplied as the last ", + "column.") + } + survScale <- allScales[whSurvScale] + survScale + +} diff --git a/R/utility_functions.R b/R/utility_functions.R index 143973e..b0e39ab 100644 --- a/R/utility_functions.R +++ b/R/utility_functions.R @@ -1,1436 +1,1436 @@ - - -#' @title Cast \code{data.table}/\code{data.frame} from long format to wide format -#' @author Matti Rantanen, Joonas Miettinen -#' -#' @description -#' Convenience function for using \code{\link[data.table]{dcast.data.table}} -#' and \code{\link[reshape2]{dcast}}; -#' inputs are character strings (names of variables) instead of a formula. -#' -#' @param data a \code{data.table} or \code{data.frame} -#' @param columns a character string vector; the (unique combinations of the) -#' levels of these variable will be different rows -#' @param rows a character string vector; the (unique combinations of the) -#' levels of these variable will be different columns -#' @param values a character string; the variable which will be represented -#' on rows and columns as specified by \code{columns} and \code{rows} -#' @import data.table -#' @import stats -#' @export cast_simple -#' @details This function is just a small interface for \code{dcast} / -#' \code{dcast.data.table} and less flexible than the originals. -#' -#' Note that all \code{data.table} objects are also \code{data.frame} -#' objects, but that each have their own \code{dcast} method. -#' \code{\link[data.table]{dcast.data.table}} is faster. -#' -#' If any values in \code{value.vars} need to be -#' aggregated, they are aggregated using \code{sum}. -#' See \code{?dcast}. -#' -#' @examples -#' \dontrun{ -#' ## e.g. silly counts from a long-format table to a wide format -#' test <- copy(sire) -#' test$dg_y <- year(test$dg_date) -#' test$ex_y <- year(test$ex_date) -#' tab <- ltable(test, c("dg_y","ex_y")) -#' cast_simple(tab, columns='dg_y', rows="ex_y", values="obs") -#' } - - -cast_simple <- function(data=NULL, columns=NULL, rows=NULL, values=NULL) { - if (!is.data.frame(data)) stop("data needs be a data.frame or data.table") - if (is.null(data) || nrow(data) == 0L) stop("data NULL or has no rows") - - if (is.null(columns)) stop("columns cannot be NULL") - - msg <- paste0("Missing 'columns' variables: %%VARS%%") - all_names_present(data, columns, msg = msg) - msg <- paste0("Missing 'rows' variables: %%VARS%%") - all_names_present(data, rows, msg = msg) - msg <- paste0("Missing 'values' variables: %%VARS%%") - all_names_present(data, values, msg = msg) - - ## allow rows = NULL - rowsNULL <- FALSE - if (is.null(rows)) rowsNULL <- TRUE - if (rowsNULL) rows <- "1" - - ## sometimes rows names appear to be like expressions, e.g. 'factor(V1)' - ## (and this function only uses string-column-names, so that's fine.) - actualRows <- rows - if (length(rows) > 1L || rows != "1") { - rows <- makeTempVarName(names = c(names(data), columns), - pre = paste0("RN", 1:length(rows))) - on.exit(setnames(data, rows, actualRows), add = TRUE) - setnames(data, actualRows, rows) - } - ## same for cols - actualCols <- columns - columns <- makeTempVarName(names = c(names(data), rows), - pre = paste0("CN", 1:length(columns))) - on.exit(setnames(data, columns, actualCols), add = TRUE) - setnames(data, actualCols, columns) - - form <- paste0(paste0(rows, collapse = " + "), " ~ ", - paste0(columns, collapse = " + ")) - form <- as.formula(form) - - ## note: dcast probably usually finds the methods for data.frame / data.table, - ## but this method is more certain - if (is.data.table(data)) { - d <- dcast.data.table(data, formula = form, value.var=values, - drop=FALSE, fun.aggregate=sum)[] - } else { - d <- dcast(data, formula = form, value.var = values, - drop = FALSE, fun.aggregate = sum)[] - } - if (rowsNULL) set(d, j = names(d)[1L], value = NULL) - wh_rows <- which(rows %in% names(d)) - if (sum(wh_rows, na.rm = TRUE)) setnames(d, rows[wh_rows], actualRows[wh_rows]) - - d -} - - -#' @title Convert NA's to zero in data.table -#' @author Joonas Miettinen -#' @description Given a \code{data.table DT}, replaces any \code{NA} values -#' in the variables given in \code{vars} in \code{DT}. Takes a copy of the -#' original data and returns the modified copy. -#' @import data.table -#' @param DT \code{data.table} object -#' @param vars a character string vector of variables names in \code{DT}; -#' if \code{NULL}, uses all variable names in \code{DT} -#' @export na2zero -#' @details Given a \code{data.table} object, converts \code{NA} values -#' to numeric (double) zeros for all variables named in \code{vars} or -#' all variables if \code{vars = NULL}. -na2zero = function(DT, vars = NULL) { - if (!is.data.table(DT)) stop("DT must be a data.table") - DT <- copy(DT) - - navars <- vars - if (is.null(navars)) navars <- names(DT) - all_names_present(DT, navars) - for (k in navars) { - DT[is.na(get(k)), (k) := 0] - } - - return(DT[]) -} - - -#' @title Convert factor variable to numeric -#' @description Convert factor variable with numbers as levels into a numeric variable -#' @param x a factor variable with numbers as levels -#' @export fac2num -#' @details -#' For example, a factor with levels \code{c("5","7")} is converted into -#' a numeric variable with values \code{c(5,7)}. -#' @seealso -#' \code{\link{robust_values}} -#' @source -#' \href{http://stackoverflow.com/questions/3418128/how-to-convert-a-factor-to-an-integer-numeric-without-a-loss-of-information}{Stackoverflow thread} -#' @examples -#' ## this is often not intended -#' as.numeric(factor(c(5,7))) ## result: c(1,2) -#' ## but this -#' fac2num(factor(c(5,7))) ## result: c(5,7) -#' -#' ## however -#' as.numeric(factor(c("5","7","a"))) ## 1:3 -#' -#' fac2num(factor(c("5","7","a"))) ## result: c(5,7,NA) with warning -#' -#' -fac2num <- function(x) { - as.numeric(levels(x))[x] -} - - - - -#' @title Detect leap years -#' @author Joonas Miettinen -#' @description Given a vector or column of year values (numeric or integer), \code{\link{is_leap_year}} returns a vector of equal length -#' of logical indicators, i.e. a vector where corresponding leap years have value TRUE, and FALSE otherwise. -#' -#' @param years a vector or column of year values (numeric or integer) -#' @examples -#' ## can be used to assign new columns easily, e.g. a dummy indicator column -#' df <- data.frame(yrs=c(1900,1904,2005,1995)) -#' df$lyd <- as.integer(is_leap_year(df$yrs)) -#' -#' ## mostly it is useful as a condition or to indicate which rows have leap years -#' which(is_leap_year(df$yrs)) # 2 -#' df[is_leap_year(df$yrs),] # 2nd row -#' -#' @export is_leap_year -#' -is_leap_year <- function(years) { - if (!is.numeric(years)) { - stop("years must be a numeric vector, preferably integer for speed. Use e.g. as.integer().") - } - - years <- try2int(years) - if (!is.integer(years)) stop("years could not be coerced to integer; don't use fractional years such as 2000.1234 but integers such as 2000") - - # divisible by four - isLeap <- years %% 4L == 0L - # not divisible by 100 - isLeap <- isLeap & years %% 100L != 0L - # unless divisible by 400 also - isLeap <- isLeap | years %% 400L == 0L - isLeap - -} -#' @title Test if object is a \code{Date} object -#' @description Tests if an object is a \code{Date} object and returns -#' a logical vector of length 1. \code{IDate} objects are also -#' \code{Date} objects, but \code{date} objects from package \pkg{date} -#' are not. -#' @author Joonas Miettinen -#' @param obj object to test on -#' @export is.Date -#' @seealso -#' \code{\link{get.yrs}}, \code{\link{is_leap_year}}, \code{\link{as.Date}} -#' @examples -#' ## the base "capital Date" format -#' da <- as.Date("2000-01-01") -#' is.Date(da) ## TRUE -#' date::is.date(da) ## FALSE -#' -#' ## IDate format from data.table -#' library("data.table") -#' da <- as.IDate("2000-01-01") -#' is.Date(da) ## TRUE -#' date::is.date(da) ## FALSE -#' -#' ## from package "date" -#' da <- date::as.date("1jan2000") -#' is.Date(da) ## FALSE -#' date::is.date(da) ## TRUE -#' -is.Date <- function(obj) { - - if (any(c("IDate","Date") %in% class(obj))) { - return(TRUE) - } - - return(FALSE) -} - - -#' @title Convert values to numeric robustly -#' @author Joonas Miettinen -#' -#' @param num.values values to convert to numeric -#' @param force logical; if \code{TRUE}, returns a vector of values where values that cannot be interpreted as numeric are -#' set to \code{NA}; if \code{FALSE}, returns the original vector and gives a warning if any value cannot be interpreted as -#' numeric. -#' @param messages logical; if \code{TRUE}, returns a message of what was done with the \code{num.values} -#' @description Brute force solution for ensuring a variable is numeric by -#' coercing a variable of any type first to factor and then to numeric -#' @export robust_values -#' @import data.table -#' @note -#' Returns \code{NULL} if given \code{num.values} is \code{NULL}. -#' @examples -#' ## this works -#' values <- c("1", "3", "5") -#' values <- robust_values(values) -#' -#' ## this works -#' values <- c("1", "3", "5", NA) -#' values <- robust_values(values) -#' -#' ## this returns originals -#' values <- c("1", "3", "5", "a") -#' values <- robust_values(values) -#' -#' ## this forces "a" to NA and works otherwise -#' values <- c("1", "3", "5", "a") -#' values <- robust_values(values, force=TRUE) -#' - -robust_values <- function(num.values, force = FALSE, messages = TRUE) { - a <- NULL - if (is.null(num.values)) { - return(NULL) - } - dt <- data.table(num.values) - nas <- dt[is.na(num.values), .N] - - suppressWarnings( - dt[,a := fac2num(factor(num.values))] - ) - dt[, a := try2int(a)] - nas2 <- dt[is.na(a), .N] - - if (!force & nas2 > nas) { - if (messages) warning("since force = FALSE and NAs were created, returning original values") - return(dt$num.values) - } - if (force) { - if (nas2 > nas) { - if (messages) warning("some NAs were created") - } - return(dt$a) - } - - - return(dt$a) - - -} - -#' @title Check if all names are present in given data -#' @author Joonas Miettinen -#' @param data dataset where the variable names should be found -#' @param var.names a character vector of variable names, e.g. -#' \code{c("var1", "var2")} -#' @param stops logical, stop returns exception -#' @param msg Custom message to return instead of default message. -#' Special: include \code{\%\%VARS\%\%} in message string and the missing -#' variable names will be inserted there (quoted, separated by comma, e.g. -#' \code{'var1'}, \code{'var2'} --- no leading or tracing white space). -#' @description Given a character vector, checks if all names are present in \code{names(data)}. -#' Throws error if \code{stops=TRUE}, else returns \code{FALSE} if some variable name is not present. -#' @seealso -#' \code{\link{robust_values}} -#' @export all_names_present - -all_names_present <- function(data, var.names, stops = TRUE, msg = NULL) { - - if (!is.null(var.names) && !is.character(var.names)) { - stop("Argument 'var.names' must be NULL or a character vector of ", - "variable names.") - } - if (length(var.names) && any(is.na(var.names))) { - stop("There are ", sum(is.na(var.names)), " missing values in argument ", - "'var.names'. Please only supply non-NA values.") - } - - badNames <- setdiff(var.names, names(data)) - if (length(badNames) == 0L) return(TRUE) - - badNames <- paste0("'", badNames, "'", collapse = ", ") - - if (is.null(msg)) msg <- paste0("Cannot proceed - following given variable name(s) not present in dataset '", - deparse(substitute(data)), "': ", badNames) - if (!is.character(msg) || length(msg) > 1L) stop("Argument 'msg' must be a character string vector of length one.") else - msg <- gsub(pattern = "%%VARS%%", replacement = badNames, x = msg) - if (!is.logical(stops) || length(stops) > 1L) stop("Argument 'stops' must be either TRUE or FALSE.") - - if (stops) stop(msg) - - return(FALSE) -} - - -#' @title Return lower_bound value from char string (20,30] -#' @author Matti Rantanen -#' @description selects lowest values of each factor after cut() based -#' on that the value starts from index 2 and end in comma ",". -#' @param cut is a character vector of elements "(20,60]" -#' @export lower_bound - -lower_bound <- function(cut) { - cut <- as.character(cut) - ind <- gregexpr(pattern=',',cut) - ind <- as.numeric(ind) - 1 - t.sub <- as.numeric(substr(cut,2, ind)) - return(t.sub) -} - - -#' @title Change output values from cut(..., labels = NULL) output -#' @author Matti Rantanen -#' @param t is a character vector of elements, e.g. "(20,60]" -#' @param factor logical; TRUE returns informative character string, FALSE numeric (left value) -#' @description Selects lowest values of each factor after cut() based -#' on the assumption that the value starts from index 2 and end in comma ",". -#' @details type = 'factor': "[50,52)" -> "50-51" OR "[50,51)" -> "50" -#' -#' type = 'numeric': lowest bound in numeric. -#' -#' @export cut_bound -#' @examples -#' cut_bound("[1900, 1910)") ## "1900-1909" - -cut_bound <- function(t, factor=TRUE) { - if (!factor) { - t <- as.character(t) - ind <- gregexpr(pattern=',',t) - ind <- as.numeric(ind) - 1 - t <- as.numeric(substr(t,2, ind)) - return(t) - } - if (factor) { - t <- as.character(t) - t <- gsub(',', '-' , substr(t, 2, nchar(t) - 1) ) - ind <-as.numeric( gregexpr(pattern='-',t) ) - if (any(as.numeric( substr(t,1,ind-1) ) +1 == as.numeric( substr(t,ind+1,nchar(t))) ) ) { - t <- substr(t,1,ind-1) - return(t) - } - t - a <- substr(t, ind+1, nchar(t)) - t <- sub(a, as.character(as.numeric(a)-1), t) - return(t) - } -} - - - - -#' @title Set the class of an object (convenience function for -#' \code{setattr(obj, "class", CLASS)}); can add instead of replace -#' @description Sets the class of an object in place to \code{cl} -#' by replacing or adding -#' @param obj and object for which to set class -#' @param cl class to set -#' @param add if \code{TRUE}, adds \code{cl} to the -#' classes of the \code{obj}; otherwise replaces the class information -#' @param add.place \code{"first"} or \code{"last"}; adds \code{cl} -#' to the front or to the back of the \code{obj}'s class vector -#' @author Joonas Miettinen -setclass <- function(obj, cl, add=FALSE, add.place="first") { - match.arg(add.place, c("first","last")) - cl <- as.character(cl) - - if (add) { - old_classes <- attr(obj, "class") - - if (add.place=="first") { - setattr(obj, "class", c(cl, old_classes)) - } else { - setattr(obj, "class", c(old_classes, cl)) - } - } else { - setattr(obj, "class", cl) - } -} - - - - -#' @title Attempt coercion to integer -#' @author James Arnold -#' @description Attempts to convert a numeric object to integer, -#' but won't if loss of information is imminent (if values after decimal -#' are not zero for even one value in \code{obj}) -#' @param obj a numeric vector -#' @param tol tolerance; if each numeric value in \code{obj} deviate from -#' the corresponding integers at most the value of \code{tol}, they are considered -#' to be integers; e.g. by default \code{1 + .Machine$double.eps} is considered -#' to be an integer but \code{1 + .Machine$double.eps^0.49} is not. -#' @export try2int -#' @source \href{http://stackoverflow.com/questions/3476782/how-to-check-if-the-number-is-integer}{Stackoverflow thread} -try2int <- function(obj, tol = .Machine$double.eps^0.5) { - if (!is.numeric(obj)) stop("obj needs to be integer or double (numeric)") - if (is.integer(obj)) return(obj) - - test <- FALSE - - bad <- if (length(na.omit(obj)) == 0) TRUE else - min(obj, na.rm = TRUE) == -Inf || max(obj, na.rm = TRUE) == Inf - if (bad) { - return(obj) - } else { - test <- max(abs(obj) %% 1, na.rm = TRUE) < tol - } - - if (is.na(test) || is.null(test)) test <- FALSE - - if (test) return(as.integer(obj)) - - return(obj) - -} - - -#' @title Get rate and exact Poisson confidence intervals -#' @author epitools -#' @description Computes confidence intervals for Poisson rates -#' @param x observed -#' @param pt expected -#' @param conf.level alpha level -#' -#' @export poisson.ci -#' -#' -#' @examples -#' -#' poisson.ci(x = 4, pt = 5, conf.level = 0.95) -#' -poisson.ci <- function(x, pt = 1, conf.level = 0.95) { - xc <- cbind(x, conf.level, pt) - pt2 <- xc[, 3] - results <- matrix(NA, nrow(xc), 6) - f1 <- function(x, ans, alpha = alp) { - ppois(x, ans) - alpha/2 - } - f2 <- function(x, ans, alpha = alp) 1 - ppois(x, ans) + dpois(x, ans) - alpha/2 - for (i in 1:nrow(xc)) { - alp <- 1 - xc[i, 2] - interval <- c(0, xc[i, 1] * 5 + 4) - uci <- uniroot(f1, interval = interval, x = xc[i, 1])$root/pt2[i] - if (xc[i, 1] == 0) { - lci <- 0 - } - else { - lci <- uniroot(f2, interval = interval, x = xc[i,1])$root/pt2[i] - } - results[i, ] <- c(xc[i, 1], pt2[i], xc[i, 1]/pt2[i], lci, uci, xc[i, 2]) - } - coln <- c("x", "pt", "rate", "lower", "upper", "conf.level") - colnames(results) <- coln - data.frame(results) -} - - -#' @title Delete \code{data.table} columns if there -#' @author Joonas Miettinen -#' @description Deletes columns in a \code{data.table} conveniently. -#' May only delete columns that are found silently. Sometimes useful in e.g. -#' \code{on.exit} expressions. -#' @param DT a \code{data.table} -#' @param delete a character vector of column names to be deleted -#' @param keep a character vector of column names to keep; -#' the rest will be removed; \code{keep} overrides \code{delete} -#' @param colorder logical; if \code{TRUE}, also does \code{setcolorder} using -#' \code{keep} -#' @param soft logical; if \code{TRUE}, does not cause an error if any variable -#' name in \code{keep} or \code{delete} is missing; \code{soft = FALSE} useful -#' for programming sometimes -#' -#' -#' @export setcolsnull -setcolsnull <- function(DT=NULL, delete=NULL, keep=NULL, colorder=FALSE, soft=TRUE) { - if (!is.data.table(DT)) stop("not a data.table") - if (!soft) { - all_names_present(DT, keep, msg = "Expected") - all_names_present(DT, delete) - } - del_cols <- NULL - del_cols <- intersect(delete, names(DT)) - if (!is.null(keep)) { - del_cols <- setdiff(names(DT), keep) - } - if (length(del_cols) > 0) { - set(DT, j = (del_cols), value = NULL) - } - if (colorder) { - setcolorder(DT, intersect(keep, names(DT))) - } - return(invisible()) -} - - - - - - -#' @title Coerce a \code{ratetable} Object to Class \code{data.frame} -#' @description -#' \code{ratatable} objects used in e.g. \pkg{survival} and \pkg{relsurv} -#' can be conveniently coerced to a long-format \code{data.frame}. -#' However, the names and levels of variables in the result -#' may not match names and levels of variables in your data. -#' @author Joonas Miettinen -#' @param x a \code{ratetable} -#' @param ... unused but added for compatibility with \code{as.data.frame} -#' @examples -#' if (requireNamespace("relsurv", quietly = TRUE)) { -#' data(slopop, package = "relsurv") -#' df <- as.data.frame(slopop) -#' head(df) -#' } - -#' @seealso -#' \code{\link[survival]{ratetable}}, -#' \code{\link{as.data.table.ratetable}} -#' -#' @export -as.data.frame.ratetable <- function(x, ...) { - dimids <- attr(x, "dimid") - x <- as.data.frame.table(as.table(as.array(x))) - names(x) <- c(dimids, "haz") - x[] -} - - -#' @title Coerce a \code{ratetable} Object to Class \code{data.table} -#' @author Joonas Miettinen -#' -#' @description -#' \code{ratatable} objects used in e.g. \pkg{survival} and \pkg{relsurv} -#' can be conveniently coerced to a long-format \code{data.frame}. -#' However, the names and levels of variables in the result -#' may not match names and levels of variables in your data. -#' @param x a \code{ratetable} -#' @param ... other arguments passed on to \code{as.data.table} - -#' @seealso -#' \code{\link[survival]{ratetable}}, -#' \code{\link{as.data.frame.ratetable}} -#' -#' @examples -#' if (requireNamespace("relsurv", quietly = TRUE)) { -#' library("data.table") -#' data(slopop, package = "relsurv") -#' dt <- as.data.table(slopop) -#' dt -#' } - -#' @export -as.data.table.ratetable <- function(x, ...) { - dimids <- attr(x, "dimid") - x <- as.data.table(as.table(as.array(x)), ...) - x[, names(x) := lapply(.SD, robust_values, messages = FALSE, force = FALSE)] - setnames(x, c(dimids, "haz")) - x[] -} - - -#' @title \strong{Experimental}: Coerce a long-format \code{data.frame} to a \code{ratetable} object -#' @author Joonas Miettinen -#' @description Coerces a long-format \code{data.frame} of population hazards -#' to an array, and in turn to a \code{\link[survival]{ratetable}}, -#' which can be used in e.g. \pkg{survival}'s expected survival computations -#' and \pkg{relsurv}'s relative survival computations. -#' @param DF a \code{data.frame} -#' @param value.var name of values variable in quotes -#' @param by.vars names vector of variables by which to create (array) dimensions -#' @seealso -#' \code{\link[survival]{ratetable}}, -#' \code{\link{as.data.table.ratetable}}, -#' \code{\link{as.data.frame.ratetable}} -#' -longDF2ratetable <- function(DF, value.var = "haz", by.vars = setdiff(names(DF), value.var)) { - univals <- lapply(DF[, by.vars], unique) - names(univals) <- NULL - dimvec <- sapply(DF[,by.vars], function(x) {length(unique(x))}, - simplify=TRUE) - ar <- array(DF[, value.var], dim = dimvec) - dimnames(ar) <- univals - attr(ar, "class") <- "ratetable" - attr(ar, "dimid") <- colnames(DF) - ar -} - -temp_var_names <- function(n = 1L, avoid = NULL, length = 10L) { - ## INTENTION: make temporary variable names that don't exist in - ## char vector "avoid", e.g. avoid = names(data). - if (n < 1L || !is.integer(n)) { - stop("n must an integer > 0") - } - if (length < 1L || !is.integer(length)) { - stop("length must an integer > 0") - } - if (!is.null(avoid)) avoid <- as.character(avoid) - - pool <- c(0:9, letters, LETTERS) - - formTemp <- function(int) { - v <- sample(x = pool, size = length, replace = TRUE) - paste0(v, collapse = "") - } - - l <- lapply(1:n, formTemp) - dupll <- duplicated(l) | l %in% avoid - tick <- 1L - while (any(dupll) && tick <= 100L) { - l[dupll] <- lapply(1:sum(dupll), formTemp) - dupll <- duplicated(l) | l %in% avoid - tick <- tick + 1L - } - if (tick >= 100L) { - stop("ran randomization 100 times and could not create unique temporary", - " names. Perhaps increase length?") - } - unlist(l) -} - -#' @import stats -makeTempVarName <- function(data=NULL, names=NULL, - pre=NULL, post=NULL, length = 10L) { - DN <- NULL - DN <- c(DN, names(data)) - DN <- c(DN, names) - DN <- unique(DN) - - if (length(pre) != length(post) && length(post) > 0L && length(pre) > 0L) { - stop("Lengths of arguments 'pre' and 'post' differ (", length(pre), " vs. ", - length(post), "). (Tried to create temporary variables, so this is ", - "most likely an internal error and the pkg maintainer should be ", - "complained to.)") - } - useN <- max(length(pre), length(post), 1L) - useL <- length - tv <- temp_var_names(avoid = DN, n = useN, length = useL) - tv <- paste0(pre, tv, post) - tv -} - - -setDFpe <- function(x) { - ## intended to only be used to set data.table to data.frame in place - ## when option("popEpi.datatable") == FALSE - if (!is.data.table(x)) stop("only accepts data.table as input") - - cl <- class(x) - wh <- which(cl == "data.table") - cl = c(cl[1:(wh-1)], cl[(wh+1):length(cl)]) - setattr(x, "class", cl) - - setattr(x, "sorted", NULL) - setattr(x, ".internal.selfref", NULL) -} - - - -evalLogicalSubset <- function(data, substiset, n = 2, enclos = parent.frame(n)) { - ## NOTE: subset MUST be substitute()'d before using this function! - ## we allow substiset to be a logical condition only - ## ALWAYS returns a logical vector of length nrow(data) - - substiset <- eval(substiset, envir = data, enclos = enclos) - if (!is.null(substiset)) { - if (!is.logical(substiset)) stop("Expression to subset by must be a logical condition, e.g. var1 == 0, var1 %in% 1:2, var1 > 0, etc.") - substiset <- substiset & !is.na(substiset) - if (sum(substiset) == 0) stop("zero rows in data after subset") - } else { - substiset <- rep(TRUE, nrow(data)) - } - substiset -} - - -subsetDTorDF <- function(data, subset=NULL, select=NULL) { - ## INTENTION: subsetting either a data.table or a data.frame - ## and returning only selected variables for lazy people. - if (!is.data.frame(data)) stop("data must be a data.table/data.frame") - if (!is.logical(subset) && !is.null(subset)) stop("subset must be a logical vector or NULL") - - if (is.null(select)) { - select <- names(data) - } else { - all_names_present(data, select) - } - - e <- "data[" - if (!is.null(subset) && !all(subset)) e <- paste0(e, "subset") - if (!is.null(select) && (length(select) < names(data) || any(select != names(data)))) { - e <- paste0(e, ", eval(select)") - if (is.data.table(data)) e <- paste0(e, ", with = FALSE") - } - e <- paste0(e, "]") - - e <- parse(text = e) - - eval(e) - -} - -subsetRolling <- function(data, subset = NULL, select = NULL) { - ## INTENTION: subsets a data.table column by column and by deleting columns - ## in the old data.table. - if (!is.data.table(data)) stop("data must be a data.table") - if (!is.logical(subset)) stop("subset must be a logical vector") - - if (is.null(select)) { - select <- names(data) - } else { - all_names_present(data, select) - } - - if (length(select) == 0L) stop("select is of length zero, which would delete all columns in data") - - setcolsnull(data, keep = select) - - dt <- data[subset, select[1L], with = FALSE] - - setcolsnull(data, delete = select[1L]) - select <- select[-1L] - - for (v in select) { - set(dt, j = v, value = data[[v]][subset]) - set(data, j = v, value = NULL) - } - - rm(list = deparse(substitute(data)), envir = parent.frame(1L)) - - dt -} - - - -setDT2DF <- function(x) { - if (!is.data.table(x)) stop("only accepts data.table as input") - - cl <- class(x) - cl <- setdiff(cl, "data.table") - setattr(x, "class", cl) - setattr(x, "sorted", NULL) - setattr(x, ".internal.selfref", NULL) - invisible(x) -} - -setDF2DT <- function(x) { - if (!is.data.frame(x) || is.data.table(x)) stop("only accepts data.frame as input") - - cl <- class(x) - whDF <- which(cl == "data.frame") - cl <- c(cl[1:(whDF-1)], "data.table", "data.frame", cl[whDF:length(cl)]) - - setattr(x, "class", cl) - alloc.col(x) - - invisible(x) -} - - - - -p.round <- function(p, dec=3) { - th <- eval( parse(text=paste0('1E-', dec ) )) - if( is.null(p)) return( '= NA') - if( is.na(p)) return( '= NA') - if( p < th ){ - p <- paste0('< ', th ) - } else { - p <- paste0('= ', round(p, dec) ) - } - p -} - - -cutLow <- function(x, breaks, tol = .Machine$double.eps^0.5) { - ## a cut function that returns the lower bounds of the cut intervals (as numeric) as levels - - breaks <- sort(breaks) - x <- cut(x + tol, right = FALSE, breaks = breaks, labels = FALSE) - x <- breaks[-length(breaks)][x] - x -} - - - - -setcols <- function(x, j, value) { - ## intention: add new columns to DT via modifying in place, and to DF - ## via DF$var <- value; both conserve memory (don't take copy of whole data) - - if (!is.data.frame(x)) stop("x must be a data.frame") - if (!is.list(value)) stop("value must be a list of values (columns to add)") - if (missing(j)) j <- names(value) - - if (!is.data.table(x)) { - x[j] <- value - } else { - set(x, j = j, value = value) - } - x -} - - - - -cutLowMerge <- function(x, y, by.x = by, by.y = by, by = NULL, all.x = all, all.y = all, all = FALSE, mid.scales = c("per", "age"), old.nums = TRUE) { - ## INTENTION: merges y to x by by.x & by.y after cutLow()'ing appropriate - ## variables in x so that y's values match with x's values - ## requirements; - ## * numeric variables in y correspond to lower limits of some intervals OR - ## are group variables (e.g. sex = c(0,1)) - ## inputs: two datas as in merge, preferably both data.table, and other args - ## to merge() - ## output: a data.table where y has been merged to x after cutLow() - ## example: merging popmort to a split Lexis object, where popmort's variables - ## correspond to at least some Lexis time scales - ## old.nums: return old numeric variable values used in cutLow()'ing? - ## mid.scales: use mid-point of interval when merging by these Lexis time scales - ## computed by adding + 0.5*lex.dur, which must exist - - if (!is.data.table(x)) { - stop("x must be a data.table") - } - - if ((is.null(by.x) && !is.null(by.y)) || (!is.null(by.x) && is.null(by.y))) { - stop("one but not both of by.x / by.y is NULL") - } - if (!is.null(by)) by.x <- by.y <- by - - if (length(by.x) != length(by.y)) stop("lengths differ for by.y & by.x") - all_names_present(x, by.x) - all_names_present(y, by.y) - names(by.x) <- by.y - names(by.y) <- by.x - - if (length(mid.scales)>0) all_names_present(x, c("lex.dur", mid.scales)) - - whScale <- by.x %in% mid.scales - xScales <- by.x[whScale] - yScales <- by.y[whScale] - - if (length(yScales) > 0) { - - oldVals <- copy(with(x, mget(xScales))) - on.exit(set(x, j = xScales, value = oldVals)) - setattr(oldVals, "names", yScales) - - for (yVar in yScales) { - xVar <- xScales[yVar] - xBr <- sort(unique(y[[yVar]])) - xBr <- unique(c(xBr, Inf)) - set(x, j = xVar, value = cutLow(x[[xVar]] + x$lex.dur*0.5, breaks = xBr)) - } - - } - - ## ensure x retains order (no copy taken of it) - xKey <- key(x) - if (length(xKey) == 0) { - xKey <- makeTempVarName(x, pre = "sort_") - on.exit(if ("x" %in% ls()) setcolsnull(x, delete = xKey, soft = TRUE), add = TRUE) - on.exit(if ("z" %in% ls()) setcolsnull(z, delete = xKey, soft = TRUE), add = TRUE) - x[, (xKey) := 1:.N] - } - - if (any(duplicated(y, by = by.y))) { - stop("y is duplicated by the inferred/supplied by.y variables (", - paste0("'", by.y, "'", collapse = ", "), "). ", - "First ensure this is not so before proceeding.") - } - - ## avoid e.g. using merge.Lexis when x inherits Lexis - xClass <- class(x) - on.exit({ - setattr(x, "class", xClass) - }, add = TRUE) - setattr(x, "class", c("data.table", "data.frame")) - - ## return old numeric values of variables that were cutLow()'d - ## by keeping them - if (old.nums && length(xScales)) { - tmpXScales <- makeTempVarName(names = c(names(x), names(y)), pre = xScales) - set(x, j = tmpXScales, value = oldVals) - on.exit({ - xOrder <- setdiff(names(x), tmpXScales) - setcolsnull(x, delete = xScales, soft = TRUE) - setnames(x, tmpXScales, xScales) - setcolorder(x, xOrder) - - }, add = TRUE) - } - - ## merge - z <- merge(x, y, by.x = by.x, by.y = by.y, - all.x = all.x, all.y = all.y, all = all, - sort = FALSE) - - setDT(z) - if (old.nums && length(xScales)) { - ## avoid warning due to coercing double to integer - set(z, j = xScales, value = NULL) - setnames(z, tmpXScales, xScales) - } - - zOrder <- intersect(names(x), names(z)) - zOrder <- c(zOrder, setdiff(names(z), names(x))) - setcolorder(z, zOrder) - if (length(xKey) > 0) setkeyv(z, xKey) - z[] - -} - - -getOrigin <- function(x) { - ## input: Date, IDate, or date variable - ## output: the origin date in Date format, - ## the origin date being the date where the underlying index is zero. - if (inherits(x, "Date") || inherits(x, "IDate")) { - as.Date("1970-01-01") - } else if (inherits(x, "date")) { - as.Date("1960-01-01") - } else if (inherits(x, "dates")) { - as.Date(paste0(attr(x, "origin"), collapse = "-"), format = "%d-%m-%Y") - } else { - stop("class '", class(x), "' not supported; usage of Date recommended - see ?as.Date") - } - -} - -promptYN <- function(q) { - - rl <- readline(prompt = paste0(q, " (Y/N) ::: ")) - y <- c("y", "Y") - n <- c( "n", "N") - if (!rl %in% c(y,n)) { - cat("Answer must be one of the following (without ticks):", paste0("'",c(y, n),"'", collapse = ", ")) - promptYN(q = q) - } - - if (rl %in% y) TRUE else FALSE - -} - - - -oneWhitespace <- function(x) { - if (!is.character(x)) stop("x not a character") - x <- paste0(x, collapse = " ") - while(sum(grep(pattern = " ", x = x))) { - x <- gsub(pattern = " ", replacement = " ", x = x) - } - x -} - - -aliased_cols <- function(data, cols) { - - if (missing(cols)) cols <- names(data) - all_names_present(data, cols) - - if (length(cols) < 2L) return(invisible()) - - x <- with(data, mget(cols)) - x <- lapply(x, duplicated) - - sub_cols <- cols - tl <- list() - ## loop: each step reduce vector of names by one - ## to avoid testing the same variables twice (in both directions) - tick <- 0L - aliased <- FALSE - while (!aliased && length(sub_cols) > 1L && tick <= length(cols)) { - - currVar <- sub_cols[1L] - sub_cols <- setdiff(sub_cols, currVar) - tl[[currVar]] <- unlist(lapply(x[sub_cols], function(j) identical(x[[currVar]], j))) - aliased <- sum(tl[[currVar]]) - - tick <- tick + 1L - } - - if (tick == length(cols)) warning("while loop went over the number of columns argument cols") - - ## result: list of logical vectors indicating if a column is aliased - ## with other columns - tl[vapply(tl, function(j) sum(j) == 0L, logical(1))] <- NULL - - if (length(tl) == 0L) return(invisible()) - - ## take first vector for reporting - var <- names(tl)[1L] - aliases <- names(tl[[1L]])[tl[[1]]] - aliases <- paste0("'", aliases, "'", collapse = ", ") - stop("Variable '", var, "' is aliased with following variable(s): ", aliases, ".") - - invisible() -} - - - - - - - -return_DT <- function() { - - x <- getOption("popEpi.datatable") - if (!is.null(x) && !is.logical(x)) { - stop("the option 'popEpi.datatable' must be either NULL or a logical ", - "value (TRUE / FALSE).") - } - if (is.null(x) || isTRUE(x)) { - return(TRUE) - } - return(FALSE) - -} - - - - -#' @title Create a Lexis Object with Follow-up Time, Period, and Age -#' Time Scales -#' @description -#' This is a simple wrapper around \code{\link[Epi]{Lexis}} for creating -#' a \code{Lexis} object with the time scales \code{fot}, \code{per}, -#' and \code{age}. -#' @param data a \code{data.frame}; mandatory -#' @param birth the time of birth; A character string naming the variable in -#' data or an expression to evaluate - see -#' \link[=flexible_argument]{Flexible input} -#' @param entry the time at entry to follow-up; supplied the -#' same way as \code{birth} -#' @param exit the time at exit from follow-up; supplied the -#' same way as \code{birth} -#' @param entry.status passed on to \code{\link[Epi]{Lexis}} if not \code{NULL}; -#' supplied the same way as \code{birth} -#' @param exit.status passed on to \code{\link[Epi]{Lexis}} if not \code{NULL}; -#' supplied the same way as \code{birth} -#' @param subset a logical condition to subset by before passing data -#' and arguments to \code{\link[Epi]{Lexis}} -#' @param ... additional optional arguments passed on to -#' \code{\link[Epi]{Lexis}} -#' @return -#' A \code{Lexis} object with the usual columns that \code{Lexis} objects -#' have, with time scale columns \code{fot}, \code{per}, and \code{age}. -#' They are calculated as -#' -#' \code{fot = entry - entry} (to ensure correct format, e.g. difftime) -#' -#' \code{per = entry} -#' -#' and -#' -#' \code{age = entry - birth} -#' -#' @examples -#' -#' data("sire", package = "popEpi") -#' -#' lex <- Lexis_fpa(sire, -#' birth = "bi_date", -#' entry = dg_date, -#' exit = ex_date + 1L, -#' exit.status = "status") -#' -#' ## some special cases -#' myVar <- "bi_date" -#' l <- list(myVar = "bi_date") -#' sire$l <- sire$myVar <- 1 -#' -#' ## conflict: myVar taken from data when "bi_date" was intended -#' lex <- Lexis_fpa(sire, -#' birth = myVar, -#' entry = dg_date, -#' exit = ex_date + 1L, -#' exit.status = "status") -#' -#' ## no conflict with names in data -#' lex <- Lexis_fpa(sire, -#' birth = l$myVar, -#' entry = dg_date, -#' exit = ex_date + 1L, -#' exit.status = "status") -#' @export -Lexis_fpa <- function(data, - birth = NULL, - entry = NULL, - exit = NULL, - entry.status = NULL, - exit.status = NULL, - subset = NULL, - ...) { - if (!requireNamespace("Epi", quietly = TRUE)) { - stop("Install package Epi before using this function.") - } - TF <- environment() - PF <- parent.frame(1L) - - checkVars <- c("fot", "per", "age", - paste0("lex.", c("dur", "Xst", "Cst", "id"))) - checkVars <- intersect(names(data), checkVars) - if (length(checkVars)) { - stop("Following variable name(s) reserved but exist in data: ", - paste0(checkVars, collapse = ", ")) - } - - - sb <- substitute(subset) - subset <- evalLogicalSubset(data, sb, enclos = PF) - if (all(subset)) subset <- NULL - x <- subsetDTorDF(data = data, subset = subset) - setDT(x) - - an <- c("birth", "entry", "exit", "entry.status", "exit.status") - - l <- vector("list", length(an)) - names(l) <- an - for (stri in an) { - e <- paste0("substitute(", stri, ", env = TF)") - e <- parse(text = e)[[1]] - e <- eval(e, envir = TF) ## e.g. result of substitute(birth) - e <- evalPopArg(data = x, arg = e, enclos = PF)[[1]] - l[[stri]] <- e - } - - l[sapply(l, is.null)] <- NULL - - missVars <- setdiff(c("birth", "entry", "exit"), names(l)) - if (length(missVars)) { - stop("Following mandatory arguments were NULL: ", - paste0(missVars, collapse = ", ")) - } - - fot <- l$entry - l$entry - per <- l$entry - age <- l$entry - l$birth - per_exit <- l$exit - - en <- list(fot = fot, per = per, age = age) - ex <- list(per = per_exit) - - al <- list(entry = en, exit = ex, entry.status = l$entry.status, - exit.status = l$exit.status, data = x) - al[sapply(al, is.null)] <- NULL - - do.call(Epi::Lexis, args = c(al, ...)) -} - - - - - - - -get_breaks <- function(x) { - UseMethod("get_breaks") -} - -get_breaks.survtab <- function(x) { - - ss <- attributes(x)$survtab.meta$surv.scale - sb <- attributes(x)$survtab.meta$surv.breaks - - l <- list(sb) - names(l) <- ss - as.list(l) - -} - - -get_breaks.aggre <- function(x) { - - as.list(attributes(x)$aggre.meta$breaks) - -} - -get_breaks.Lexis <- function(x) { - as.list(attributes(x)$breaks) -} - -get_breaks.default <- function(x) { - NULL -} - - -select_breaks <- function(data, ...) { - UseMethod("select_breaks") -} - -select_breaks.default <- function(data, ts, br = NULL, ...) { - br <- do_select_breaks(data = data, ts = ts, br = br) - if (is.null(br)) { - stop("Data did not contain breaks and no breaks were supplied ", - "by hand.") - } - br -} - -select_breaks.aggre <- function(data, ts, br = NULL, ...) { - - - br <- do_select_breaks(data = data, ts = ts, br = br) - - select_breaks_subcheck(br, get_breaks(data)[[ts]], - "Manually supplied breaks were not a ", - "subset of the breaks in aggre data. ", - "Data has breaks as a result of being split and ", - "aggregated; see ?as.aggre and ?aggre") - - if (is.null(br)) { - stop("aggre object did not contain breaks and no breaks were supplied ", - "by hand.") - } - - br -} - -select_breaks.Lexis <- function(data, ts, br = NULL, ...) { - - checkLexisData(data) - - br <- do_select_breaks(data = data, ts = ts, br = br) - - select_breaks_subcheck(br, get_breaks(data)[[ts]], - "Manually supplied breaks were not a ", - "subset of the breaks in Lexis data. ", - "Data has breaks as a result of being a split Lexis ", - "object; see ?Lexis and e.g. ?splitMulti") - - if (is.null(br)) { - stop("Lexis object did not contain breaks and no breaks were supplied ", - "by hand.") - } - bl <- list(br) - names(bl) <- ts - checkBreaksList(data, breaks = bl) - - br -} - - -select_breaks_subcheck <- function(b1, b2, ...) { - l1 <- list(b1) - l2 <- list(b2) - names(l1) <- names(l2) <- "TS" - - if (!is.null(b1) && !is.null(b2) && !all_breaks_in(l1, l2)) { - stop(...) - } -} - -do_select_breaks <- function(data, ts, br = NULL) { - # @description selects breaks from data or from br depending on - # which one is NULL. If both exist, br must be a subset of the breaks - # in data. - - stopifnot(is.data.frame(data)) - stopifnot(is.character(ts) && length(ts) == 1L && ts %in% names(data)) - - dbr <- get_breaks(data)[[ts]] - - dbl <- list(dbr) - bl <- list(br) - names(dbl) <- names(bl) <- "TS" - - - - if (is.null(br)) br <- dbr - - br -} - - - - -breaks_in_data <- function(br, ts, data) { - ## note: last break does not usually appear in data, unless intentionally - ## limiting from e.g. 0:5 to 0:4 - stopifnot(length(ts) == 1 && ts %in% names(data)) - u <- unique(data[[ts]]) - - br <- sort(unique(br)) - if (length(br)<2) stop("There must be at least two breaks to form intervals") - - br <- if (max(br) <= max(u)) br else br[-length(br)] - all(br %in% u) - -} - - - - - -is_named_list <- function(x) is.list(x) && length(unique(names(x))) == length(x) - - - - -fuse_breakslists <- function(bl.old, bl.new, drop) { - # @description given two lists of breaks, uses all timescales found - # in both lists to fuse into one list. For common timescales an - # interval-based subset is taken, so that the new always limits the old - # when drop = TRUE. - - stopifnot( - is_named_list(bl.old), is_named_list(bl.new) - ) - - bl <- bl.old - new_scales <- setdiff(names(bl.old), names(bl.new)) - if (length(new_scales)) { - bl[new_scales] <- bl.new[new_scales] - } - common_scales <- intersect(names(bl.old), names(bl.new)) - if (length(common_scales)) { - - bl[common_scales] <- lapply(common_scales, function(time_scale) { - new <- bl.new[[time_scale]] - old <- bl.old[time_scale] - fuse <- sort(union(old, new)) - if (drop) { - r.new <- range(new) - r.old <- range(old) - r <- c(max(r.new[1], r.old[1]), min(r.new[2], r.old[2])) - fuse <- fuse[between(fuse, r[1], r[2], incbounds = TRUE)] - } - fuse - }) - - } - - bl - -} - - - - - - -set2 <- function(x, j, ...) { - cols_exst <- intersect(names(x), j) - old_order <- copy(names(x)) - if (length(cols_exst)) { - set(x, j = cols_exst, value = NULL) - } - set(x = x, j = j, ...) - new_cols <- setdiff(names(x), old_order) - setcolorder(x, c(old_order, new_cols)) - invisible(x) -} - - - - - -mget_cols <- function(cols, data) { - - stopifnot(all(cols %in% names(data))) - - setDT(mget(x = cols, envir = as.environment(data), inherits = FALSE)) -} - - - - - -get_random_seed <- function() { - t <- Sys.time() - s <- as.numeric(t) %% as.integer(t) - nc <- nchar(s) - s <- as.integer(substr(s, nc-8, nc)) - s -} - - - - - -skip_usually <- function() { - requireNamespace("testthat") - testthat::skip_on_cran() - testthat::skip_on_travis() - testthat::skip_on_appveyor() -} - - - - - - - - - - - - + + +#' @title Cast \code{data.table}/\code{data.frame} from long format to wide format +#' @author Matti Rantanen, Joonas Miettinen +#' +#' @description +#' Convenience function for using \code{\link[data.table]{dcast.data.table}} +#' and \code{\link[reshape2]{dcast}}; +#' inputs are character strings (names of variables) instead of a formula. +#' +#' @param data a \code{data.table} or \code{data.frame} +#' @param columns a character string vector; the (unique combinations of the) +#' levels of these variable will be different rows +#' @param rows a character string vector; the (unique combinations of the) +#' levels of these variable will be different columns +#' @param values a character string; the variable which will be represented +#' on rows and columns as specified by \code{columns} and \code{rows} +#' @import data.table +#' @import stats +#' @export cast_simple +#' @details This function is just a small interface for \code{dcast} / +#' \code{dcast.data.table} and less flexible than the originals. +#' +#' Note that all \code{data.table} objects are also \code{data.frame} +#' objects, but that each have their own \code{dcast} method. +#' \code{\link[data.table]{dcast.data.table}} is faster. +#' +#' If any values in \code{value.vars} need to be +#' aggregated, they are aggregated using \code{sum}. +#' See \code{?dcast}. +#' +#' @examples +#' \dontrun{ +#' ## e.g. silly counts from a long-format table to a wide format +#' test <- copy(sire) +#' test$dg_y <- year(test$dg_date) +#' test$ex_y <- year(test$ex_date) +#' tab <- ltable(test, c("dg_y","ex_y")) +#' cast_simple(tab, columns='dg_y', rows="ex_y", values="obs") +#' } + + +cast_simple <- function(data=NULL, columns=NULL, rows=NULL, values=NULL) { + if (!is.data.frame(data)) stop("data needs be a data.frame or data.table") + if (is.null(data) || nrow(data) == 0L) stop("data NULL or has no rows") + + if (is.null(columns)) stop("columns cannot be NULL") + + msg <- paste0("Missing 'columns' variables: %%VARS%%") + all_names_present(data, columns, msg = msg) + msg <- paste0("Missing 'rows' variables: %%VARS%%") + all_names_present(data, rows, msg = msg) + msg <- paste0("Missing 'values' variables: %%VARS%%") + all_names_present(data, values, msg = msg) + + ## allow rows = NULL + rowsNULL <- FALSE + if (is.null(rows)) rowsNULL <- TRUE + if (rowsNULL) rows <- "1" + + ## sometimes rows names appear to be like expressions, e.g. 'factor(V1)' + ## (and this function only uses string-column-names, so that's fine.) + actualRows <- rows + if (length(rows) > 1L || rows != "1") { + rows <- makeTempVarName(names = c(names(data), columns), + pre = paste0("RN", 1:length(rows))) + on.exit(setnames(data, rows, actualRows), add = TRUE) + setnames(data, actualRows, rows) + } + ## same for cols + actualCols <- columns + columns <- makeTempVarName(names = c(names(data), rows), + pre = paste0("CN", 1:length(columns))) + on.exit(setnames(data, columns, actualCols), add = TRUE) + setnames(data, actualCols, columns) + + form <- paste0(paste0(rows, collapse = " + "), " ~ ", + paste0(columns, collapse = " + ")) + form <- as.formula(form) + + ## note: dcast probably usually finds the methods for data.frame / data.table, + ## but this method is more certain + if (is.data.table(data)) { + d <- dcast.data.table(data, formula = form, value.var=values, + drop=FALSE, fun.aggregate=sum)[] + } else { + d <- dcast(data, formula = form, value.var = values, + drop = FALSE, fun.aggregate = sum)[] + } + if (rowsNULL) set(d, j = names(d)[1L], value = NULL) + wh_rows <- which(rows %in% names(d)) + if (sum(wh_rows, na.rm = TRUE)) setnames(d, rows[wh_rows], actualRows[wh_rows]) + + d +} + + +#' @title Convert NA's to zero in data.table +#' @author Joonas Miettinen +#' @description Given a \code{data.table DT}, replaces any \code{NA} values +#' in the variables given in \code{vars} in \code{DT}. Takes a copy of the +#' original data and returns the modified copy. +#' @import data.table +#' @param DT \code{data.table} object +#' @param vars a character string vector of variables names in \code{DT}; +#' if \code{NULL}, uses all variable names in \code{DT} +#' @export na2zero +#' @details Given a \code{data.table} object, converts \code{NA} values +#' to numeric (double) zeros for all variables named in \code{vars} or +#' all variables if \code{vars = NULL}. +na2zero = function(DT, vars = NULL) { + if (!is.data.table(DT)) stop("DT must be a data.table") + DT <- copy(DT) + + navars <- vars + if (is.null(navars)) navars <- names(DT) + all_names_present(DT, navars) + for (k in navars) { + DT[is.na(get(k)), (k) := 0] + } + + return(DT[]) +} + + +#' @title Convert factor variable to numeric +#' @description Convert factor variable with numbers as levels into a numeric variable +#' @param x a factor variable with numbers as levels +#' @export fac2num +#' @details +#' For example, a factor with levels \code{c("5","7")} is converted into +#' a numeric variable with values \code{c(5,7)}. +#' @seealso +#' \code{\link{robust_values}} +#' @source +#' \href{http://stackoverflow.com/questions/3418128/how-to-convert-a-factor-to-an-integer-numeric-without-a-loss-of-information}{Stackoverflow thread} +#' @examples +#' ## this is often not intended +#' as.numeric(factor(c(5,7))) ## result: c(1,2) +#' ## but this +#' fac2num(factor(c(5,7))) ## result: c(5,7) +#' +#' ## however +#' as.numeric(factor(c("5","7","a"))) ## 1:3 +#' +#' fac2num(factor(c("5","7","a"))) ## result: c(5,7,NA) with warning +#' +#' +fac2num <- function(x) { + as.numeric(levels(x))[x] +} + + + + +#' @title Detect leap years +#' @author Joonas Miettinen +#' @description Given a vector or column of year values (numeric or integer), \code{\link{is_leap_year}} returns a vector of equal length +#' of logical indicators, i.e. a vector where corresponding leap years have value TRUE, and FALSE otherwise. +#' +#' @param years a vector or column of year values (numeric or integer) +#' @examples +#' ## can be used to assign new columns easily, e.g. a dummy indicator column +#' df <- data.frame(yrs=c(1900,1904,2005,1995)) +#' df$lyd <- as.integer(is_leap_year(df$yrs)) +#' +#' ## mostly it is useful as a condition or to indicate which rows have leap years +#' which(is_leap_year(df$yrs)) # 2 +#' df[is_leap_year(df$yrs),] # 2nd row +#' +#' @export is_leap_year +#' +is_leap_year <- function(years) { + if (!is.numeric(years)) { + stop("years must be a numeric vector, preferably integer for speed. Use e.g. as.integer().") + } + + years <- try2int(years) + if (!is.integer(years)) stop("years could not be coerced to integer; don't use fractional years such as 2000.1234 but integers such as 2000") + + # divisible by four + isLeap <- years %% 4L == 0L + # not divisible by 100 + isLeap <- isLeap & years %% 100L != 0L + # unless divisible by 400 also + isLeap <- isLeap | years %% 400L == 0L + isLeap + +} +#' @title Test if object is a \code{Date} object +#' @description Tests if an object is a \code{Date} object and returns +#' a logical vector of length 1. \code{IDate} objects are also +#' \code{Date} objects, but \code{date} objects from package \pkg{date} +#' are not. +#' @author Joonas Miettinen +#' @param obj object to test on +#' @export is.Date +#' @seealso +#' \code{\link{get.yrs}}, \code{\link{is_leap_year}}, \code{\link{as.Date}} +#' @examples +#' ## the base "capital Date" format +#' da <- as.Date("2000-01-01") +#' is.Date(da) ## TRUE +#' date::is.date(da) ## FALSE +#' +#' ## IDate format from data.table +#' library("data.table") +#' da <- as.IDate("2000-01-01") +#' is.Date(da) ## TRUE +#' date::is.date(da) ## FALSE +#' +#' ## from package "date" +#' da <- date::as.date("1jan2000") +#' is.Date(da) ## FALSE +#' date::is.date(da) ## TRUE +#' +is.Date <- function(obj) { + + if (any(c("IDate","Date") %in% class(obj))) { + return(TRUE) + } + + return(FALSE) +} + + +#' @title Convert values to numeric robustly +#' @author Joonas Miettinen +#' +#' @param num.values values to convert to numeric +#' @param force logical; if \code{TRUE}, returns a vector of values where values that cannot be interpreted as numeric are +#' set to \code{NA}; if \code{FALSE}, returns the original vector and gives a warning if any value cannot be interpreted as +#' numeric. +#' @param messages logical; if \code{TRUE}, returns a message of what was done with the \code{num.values} +#' @description Brute force solution for ensuring a variable is numeric by +#' coercing a variable of any type first to factor and then to numeric +#' @export robust_values +#' @import data.table +#' @note +#' Returns \code{NULL} if given \code{num.values} is \code{NULL}. +#' @examples +#' ## this works +#' values <- c("1", "3", "5") +#' values <- robust_values(values) +#' +#' ## this works +#' values <- c("1", "3", "5", NA) +#' values <- robust_values(values) +#' +#' ## this returns originals +#' values <- c("1", "3", "5", "a") +#' values <- robust_values(values) +#' +#' ## this forces "a" to NA and works otherwise +#' values <- c("1", "3", "5", "a") +#' values <- robust_values(values, force=TRUE) +#' + +robust_values <- function(num.values, force = FALSE, messages = TRUE) { + a <- NULL + if (is.null(num.values)) { + return(NULL) + } + dt <- data.table(num.values) + nas <- dt[is.na(num.values), .N] + + suppressWarnings( + dt[,a := fac2num(factor(num.values))] + ) + dt[, a := try2int(a)] + nas2 <- dt[is.na(a), .N] + + if (!force & nas2 > nas) { + if (messages) warning("since force = FALSE and NAs were created, returning original values") + return(dt$num.values) + } + if (force) { + if (nas2 > nas) { + if (messages) warning("some NAs were created") + } + return(dt$a) + } + + + return(dt$a) + + +} + +#' @title Check if all names are present in given data +#' @author Joonas Miettinen +#' @param data dataset where the variable names should be found +#' @param var.names a character vector of variable names, e.g. +#' \code{c("var1", "var2")} +#' @param stops logical, stop returns exception +#' @param msg Custom message to return instead of default message. +#' Special: include \code{\%\%VARS\%\%} in message string and the missing +#' variable names will be inserted there (quoted, separated by comma, e.g. +#' \code{'var1'}, \code{'var2'} --- no leading or tracing white space). +#' @description Given a character vector, checks if all names are present in \code{names(data)}. +#' Throws error if \code{stops=TRUE}, else returns \code{FALSE} if some variable name is not present. +#' @seealso +#' \code{\link{robust_values}} +#' @export all_names_present + +all_names_present <- function(data, var.names, stops = TRUE, msg = NULL) { + + if (!is.null(var.names) && !is.character(var.names)) { + stop("Argument 'var.names' must be NULL or a character vector of ", + "variable names.") + } + if (length(var.names) && any(is.na(var.names))) { + stop("There are ", sum(is.na(var.names)), " missing values in argument ", + "'var.names'. Please only supply non-NA values.") + } + + badNames <- setdiff(var.names, names(data)) + if (length(badNames) == 0L) return(TRUE) + + badNames <- paste0("'", badNames, "'", collapse = ", ") + + if (is.null(msg)) msg <- paste0("Cannot proceed - following given variable name(s) not present in dataset '", + deparse(substitute(data)), "': ", badNames) + if (!is.character(msg) || length(msg) > 1L) stop("Argument 'msg' must be a character string vector of length one.") else + msg <- gsub(pattern = "%%VARS%%", replacement = badNames, x = msg) + if (!is.logical(stops) || length(stops) > 1L) stop("Argument 'stops' must be either TRUE or FALSE.") + + if (stops) stop(msg) + + return(FALSE) +} + + +#' @title Return lower_bound value from char string (20,30] +#' @author Matti Rantanen +#' @description selects lowest values of each factor after cut() based +#' on that the value starts from index 2 and end in comma ",". +#' @param cut is a character vector of elements "(20,60]" +#' @export lower_bound + +lower_bound <- function(cut) { + cut <- as.character(cut) + ind <- gregexpr(pattern=',',cut) + ind <- as.numeric(ind) - 1 + t.sub <- as.numeric(substr(cut,2, ind)) + return(t.sub) +} + + +#' @title Change output values from cut(..., labels = NULL) output +#' @author Matti Rantanen +#' @param t is a character vector of elements, e.g. "(20,60]" +#' @param factor logical; TRUE returns informative character string, FALSE numeric (left value) +#' @description Selects lowest values of each factor after cut() based +#' on the assumption that the value starts from index 2 and end in comma ",". +#' @details type = 'factor': "[50,52)" -> "50-51" OR "[50,51)" -> "50" +#' +#' type = 'numeric': lowest bound in numeric. +#' +#' @export cut_bound +#' @examples +#' cut_bound("[1900, 1910)") ## "1900-1909" + +cut_bound <- function(t, factor=TRUE) { + if (!factor) { + t <- as.character(t) + ind <- gregexpr(pattern=',',t) + ind <- as.numeric(ind) - 1 + t <- as.numeric(substr(t,2, ind)) + return(t) + } + if (factor) { + t <- as.character(t) + t <- gsub(',', '-' , substr(t, 2, nchar(t) - 1) ) + ind <-as.numeric( gregexpr(pattern='-',t) ) + if (any(as.numeric( substr(t,1,ind-1) ) +1 == as.numeric( substr(t,ind+1,nchar(t))) ) ) { + t <- substr(t,1,ind-1) + return(t) + } + t + a <- substr(t, ind+1, nchar(t)) + t <- sub(a, as.character(as.numeric(a)-1), t) + return(t) + } +} + + + + +#' @title Set the class of an object (convenience function for +#' \code{setattr(obj, "class", CLASS)}); can add instead of replace +#' @description Sets the class of an object in place to \code{cl} +#' by replacing or adding +#' @param obj and object for which to set class +#' @param cl class to set +#' @param add if \code{TRUE}, adds \code{cl} to the +#' classes of the \code{obj}; otherwise replaces the class information +#' @param add.place \code{"first"} or \code{"last"}; adds \code{cl} +#' to the front or to the back of the \code{obj}'s class vector +#' @author Joonas Miettinen +setclass <- function(obj, cl, add=FALSE, add.place="first") { + match.arg(add.place, c("first","last")) + cl <- as.character(cl) + + if (add) { + old_classes <- attr(obj, "class") + + if (add.place=="first") { + setattr(obj, "class", c(cl, old_classes)) + } else { + setattr(obj, "class", c(old_classes, cl)) + } + } else { + setattr(obj, "class", cl) + } +} + + + + +#' @title Attempt coercion to integer +#' @author James Arnold +#' @description Attempts to convert a numeric object to integer, +#' but won't if loss of information is imminent (if values after decimal +#' are not zero for even one value in \code{obj}) +#' @param obj a numeric vector +#' @param tol tolerance; if each numeric value in \code{obj} deviate from +#' the corresponding integers at most the value of \code{tol}, they are considered +#' to be integers; e.g. by default \code{1 + .Machine$double.eps} is considered +#' to be an integer but \code{1 + .Machine$double.eps^0.49} is not. +#' @export try2int +#' @source \href{http://stackoverflow.com/questions/3476782/how-to-check-if-the-number-is-integer}{Stackoverflow thread} +try2int <- function(obj, tol = .Machine$double.eps^0.5) { + if (!is.numeric(obj)) stop("obj needs to be integer or double (numeric)") + if (is.integer(obj)) return(obj) + + test <- FALSE + + bad <- if (length(na.omit(obj)) == 0) TRUE else + min(obj, na.rm = TRUE) == -Inf || max(obj, na.rm = TRUE) == Inf + if (bad) { + return(obj) + } else { + test <- max(abs(obj) %% 1, na.rm = TRUE) < tol + } + + if (is.na(test) || is.null(test)) test <- FALSE + + if (test) return(as.integer(obj)) + + return(obj) + +} + + +#' @title Get rate and exact Poisson confidence intervals +#' @author epitools +#' @description Computes confidence intervals for Poisson rates +#' @param x observed +#' @param pt expected +#' @param conf.level alpha level +#' +#' @export poisson.ci +#' +#' +#' @examples +#' +#' poisson.ci(x = 4, pt = 5, conf.level = 0.95) +#' +poisson.ci <- function(x, pt = 1, conf.level = 0.95) { + xc <- cbind(x, conf.level, pt) + pt2 <- xc[, 3] + results <- matrix(NA, nrow(xc), 6) + f1 <- function(x, ans, alpha = alp) { + ppois(x, ans) - alpha/2 + } + f2 <- function(x, ans, alpha = alp) 1 - ppois(x, ans) + dpois(x, ans) - alpha/2 + for (i in 1:nrow(xc)) { + alp <- 1 - xc[i, 2] + interval <- c(0, xc[i, 1] * 5 + 4) + uci <- uniroot(f1, interval = interval, x = xc[i, 1])$root/pt2[i] + if (xc[i, 1] == 0) { + lci <- 0 + } + else { + lci <- uniroot(f2, interval = interval, x = xc[i,1])$root/pt2[i] + } + results[i, ] <- c(xc[i, 1], pt2[i], xc[i, 1]/pt2[i], lci, uci, xc[i, 2]) + } + coln <- c("x", "pt", "rate", "lower", "upper", "conf.level") + colnames(results) <- coln + data.frame(results) +} + + +#' @title Delete \code{data.table} columns if there +#' @author Joonas Miettinen +#' @description Deletes columns in a \code{data.table} conveniently. +#' May only delete columns that are found silently. Sometimes useful in e.g. +#' \code{on.exit} expressions. +#' @param DT a \code{data.table} +#' @param delete a character vector of column names to be deleted +#' @param keep a character vector of column names to keep; +#' the rest will be removed; \code{keep} overrides \code{delete} +#' @param colorder logical; if \code{TRUE}, also does \code{setcolorder} using +#' \code{keep} +#' @param soft logical; if \code{TRUE}, does not cause an error if any variable +#' name in \code{keep} or \code{delete} is missing; \code{soft = FALSE} useful +#' for programming sometimes +#' +#' +#' @export setcolsnull +setcolsnull <- function(DT=NULL, delete=NULL, keep=NULL, colorder=FALSE, soft=TRUE) { + if (!is.data.table(DT)) stop("not a data.table") + if (!soft) { + all_names_present(DT, keep, msg = "Expected") + all_names_present(DT, delete) + } + del_cols <- NULL + del_cols <- intersect(delete, names(DT)) + if (!is.null(keep)) { + del_cols <- setdiff(names(DT), keep) + } + if (length(del_cols) > 0) { + set(DT, j = (del_cols), value = NULL) + } + if (colorder) { + setcolorder(DT, intersect(keep, names(DT))) + } + return(invisible()) +} + + + + + + +#' @title Coerce a \code{ratetable} Object to Class \code{data.frame} +#' @description +#' \code{ratatable} objects used in e.g. \pkg{survival} and \pkg{relsurv} +#' can be conveniently coerced to a long-format \code{data.frame}. +#' However, the names and levels of variables in the result +#' may not match names and levels of variables in your data. +#' @author Joonas Miettinen +#' @param x a \code{ratetable} +#' @param ... unused but added for compatibility with \code{as.data.frame} +#' @examples +#' if (requireNamespace("relsurv", quietly = TRUE)) { +#' data(slopop, package = "relsurv") +#' df <- as.data.frame(slopop) +#' head(df) +#' } + +#' @seealso +#' \code{\link[survival]{ratetable}}, +#' \code{\link{as.data.table.ratetable}} +#' +#' @export +as.data.frame.ratetable <- function(x, ...) { + dimids <- attr(x, "dimid") + x <- as.data.frame.table(as.table(as.array(x))) + names(x) <- c(dimids, "haz") + x[] +} + + +#' @title Coerce a \code{ratetable} Object to Class \code{data.table} +#' @author Joonas Miettinen +#' +#' @description +#' \code{ratatable} objects used in e.g. \pkg{survival} and \pkg{relsurv} +#' can be conveniently coerced to a long-format \code{data.frame}. +#' However, the names and levels of variables in the result +#' may not match names and levels of variables in your data. +#' @param x a \code{ratetable} +#' @param ... other arguments passed on to \code{as.data.table} + +#' @seealso +#' \code{\link[survival]{ratetable}}, +#' \code{\link{as.data.frame.ratetable}} +#' +#' @examples +#' if (requireNamespace("relsurv", quietly = TRUE)) { +#' library("data.table") +#' data(slopop, package = "relsurv") +#' dt <- as.data.table(slopop) +#' dt +#' } + +#' @export +as.data.table.ratetable <- function(x, ...) { + dimids <- attr(x, "dimid") + x <- as.data.table(as.table(as.array(x)), ...) + x[, names(x) := lapply(.SD, robust_values, messages = FALSE, force = FALSE)] + setnames(x, c(dimids, "haz")) + x[] +} + + +#' @title \strong{Experimental}: Coerce a long-format \code{data.frame} to a \code{ratetable} object +#' @author Joonas Miettinen +#' @description Coerces a long-format \code{data.frame} of population hazards +#' to an array, and in turn to a \code{\link[survival]{ratetable}}, +#' which can be used in e.g. \pkg{survival}'s expected survival computations +#' and \pkg{relsurv}'s relative survival computations. +#' @param DF a \code{data.frame} +#' @param value.var name of values variable in quotes +#' @param by.vars names vector of variables by which to create (array) dimensions +#' @seealso +#' \code{\link[survival]{ratetable}}, +#' \code{\link{as.data.table.ratetable}}, +#' \code{\link{as.data.frame.ratetable}} +#' +longDF2ratetable <- function(DF, value.var = "haz", by.vars = setdiff(names(DF), value.var)) { + univals <- lapply(DF[, by.vars], unique) + names(univals) <- NULL + dimvec <- sapply(DF[,by.vars], function(x) {length(unique(x))}, + simplify=TRUE) + ar <- array(DF[, value.var], dim = dimvec) + dimnames(ar) <- univals + attr(ar, "class") <- "ratetable" + attr(ar, "dimid") <- colnames(DF) + ar +} + +temp_var_names <- function(n = 1L, avoid = NULL, length = 10L) { + ## INTENTION: make temporary variable names that don't exist in + ## char vector "avoid", e.g. avoid = names(data). + if (n < 1L || !is.integer(n)) { + stop("n must an integer > 0") + } + if (length < 1L || !is.integer(length)) { + stop("length must an integer > 0") + } + if (!is.null(avoid)) avoid <- as.character(avoid) + + pool <- c(0:9, letters, LETTERS) + + formTemp <- function(int) { + v <- sample(x = pool, size = length, replace = TRUE) + paste0(v, collapse = "") + } + + l <- lapply(1:n, formTemp) + dupll <- duplicated(l) | l %in% avoid + tick <- 1L + while (any(dupll) && tick <= 100L) { + l[dupll] <- lapply(1:sum(dupll), formTemp) + dupll <- duplicated(l) | l %in% avoid + tick <- tick + 1L + } + if (tick >= 100L) { + stop("ran randomization 100 times and could not create unique temporary", + " names. Perhaps increase length?") + } + unlist(l) +} + +#' @import stats +makeTempVarName <- function(data=NULL, names=NULL, + pre=NULL, post=NULL, length = 10L) { + DN <- NULL + DN <- c(DN, names(data)) + DN <- c(DN, names) + DN <- unique(DN) + + if (length(pre) != length(post) && length(post) > 0L && length(pre) > 0L) { + stop("Lengths of arguments 'pre' and 'post' differ (", length(pre), " vs. ", + length(post), "). (Tried to create temporary variables, so this is ", + "most likely an internal error and the pkg maintainer should be ", + "complained to.)") + } + useN <- max(length(pre), length(post), 1L) + useL <- length + tv <- temp_var_names(avoid = DN, n = useN, length = useL) + tv <- paste0(pre, tv, post) + tv +} + + +setDFpe <- function(x) { + ## intended to only be used to set data.table to data.frame in place + ## when option("popEpi.datatable") == FALSE + if (!is.data.table(x)) stop("only accepts data.table as input") + + cl <- class(x) + wh <- which(cl == "data.table") + cl = c(cl[1:(wh-1)], cl[(wh+1):length(cl)]) + setattr(x, "class", cl) + + setattr(x, "sorted", NULL) + setattr(x, ".internal.selfref", NULL) +} + + + +evalLogicalSubset <- function(data, substiset, n = 2, enclos = parent.frame(n)) { + ## NOTE: subset MUST be substitute()'d before using this function! + ## we allow substiset to be a logical condition only + ## ALWAYS returns a logical vector of length nrow(data) + + substiset <- eval(substiset, envir = data, enclos = enclos) + if (!is.null(substiset)) { + if (!is.logical(substiset)) stop("Expression to subset by must be a logical condition, e.g. var1 == 0, var1 %in% 1:2, var1 > 0, etc.") + substiset <- substiset & !is.na(substiset) + if (sum(substiset) == 0) stop("zero rows in data after subset") + } else { + substiset <- rep(TRUE, nrow(data)) + } + substiset +} + + +subsetDTorDF <- function(data, subset=NULL, select=NULL) { + ## INTENTION: subsetting either a data.table or a data.frame + ## and returning only selected variables for lazy people. + if (!is.data.frame(data)) stop("data must be a data.table/data.frame") + if (!is.logical(subset) && !is.null(subset)) stop("subset must be a logical vector or NULL") + + if (is.null(select)) { + select <- names(data) + } else { + all_names_present(data, select) + } + + e <- "data[" + if (!is.null(subset) && !all(subset)) e <- paste0(e, "subset") + if (!is.null(select) && (length(select) < names(data) || any(select != names(data)))) { + e <- paste0(e, ", eval(select)") + if (is.data.table(data)) e <- paste0(e, ", with = FALSE") + } + e <- paste0(e, "]") + + e <- parse(text = e) + + eval(e) + +} + +subsetRolling <- function(data, subset = NULL, select = NULL) { + ## INTENTION: subsets a data.table column by column and by deleting columns + ## in the old data.table. + if (!is.data.table(data)) stop("data must be a data.table") + if (!is.logical(subset)) stop("subset must be a logical vector") + + if (is.null(select)) { + select <- names(data) + } else { + all_names_present(data, select) + } + + if (length(select) == 0L) stop("select is of length zero, which would delete all columns in data") + + setcolsnull(data, keep = select) + + dt <- data[subset, select[1L], with = FALSE] + + setcolsnull(data, delete = select[1L]) + select <- select[-1L] + + for (v in select) { + set(dt, j = v, value = data[[v]][subset]) + set(data, j = v, value = NULL) + } + + rm(list = deparse(substitute(data)), envir = parent.frame(1L)) + + dt +} + + + +setDT2DF <- function(x) { + if (!is.data.table(x)) stop("only accepts data.table as input") + + cl <- class(x) + cl <- setdiff(cl, "data.table") + setattr(x, "class", cl) + setattr(x, "sorted", NULL) + setattr(x, ".internal.selfref", NULL) + invisible(x) +} + +setDF2DT <- function(x) { + if (!is.data.frame(x) || is.data.table(x)) stop("only accepts data.frame as input") + + cl <- class(x) + whDF <- which(cl == "data.frame") + cl <- c(cl[1:(whDF-1)], "data.table", "data.frame", cl[whDF:length(cl)]) + + setattr(x, "class", cl) + alloc.col(x) + + invisible(x) +} + + + + +p.round <- function(p, dec=3) { + th <- eval( parse(text=paste0('1E-', dec ) )) + if( is.null(p)) return( '= NA') + if( is.na(p)) return( '= NA') + if( p < th ){ + p <- paste0('< ', th ) + } else { + p <- paste0('= ', round(p, dec) ) + } + p +} + + +cutLow <- function(x, breaks, tol = .Machine$double.eps^0.5) { + ## a cut function that returns the lower bounds of the cut intervals (as numeric) as levels + + breaks <- sort(breaks) + x <- cut(x + tol, right = FALSE, breaks = breaks, labels = FALSE) + x <- breaks[-length(breaks)][x] + x +} + + + + +setcols <- function(x, j, value) { + ## intention: add new columns to DT via modifying in place, and to DF + ## via DF$var <- value; both conserve memory (don't take copy of whole data) + + if (!is.data.frame(x)) stop("x must be a data.frame") + if (!is.list(value)) stop("value must be a list of values (columns to add)") + if (missing(j)) j <- names(value) + + if (!is.data.table(x)) { + x[j] <- value + } else { + set(x, j = j, value = value) + } + x +} + + + + +cutLowMerge <- function(x, y, by.x = by, by.y = by, by = NULL, all.x = all, all.y = all, all = FALSE, mid.scales = c("per", "age"), old.nums = TRUE) { + ## INTENTION: merges y to x by by.x & by.y after cutLow()'ing appropriate + ## variables in x so that y's values match with x's values + ## requirements; + ## * numeric variables in y correspond to lower limits of some intervals OR + ## are group variables (e.g. sex = c(0,1)) + ## inputs: two datas as in merge, preferably both data.table, and other args + ## to merge() + ## output: a data.table where y has been merged to x after cutLow() + ## example: merging popmort to a split Lexis object, where popmort's variables + ## correspond to at least some Lexis time scales + ## old.nums: return old numeric variable values used in cutLow()'ing? + ## mid.scales: use mid-point of interval when merging by these Lexis time scales + ## computed by adding + 0.5*lex.dur, which must exist + + if (!is.data.table(x)) { + stop("x must be a data.table") + } + + if ((is.null(by.x) && !is.null(by.y)) || (!is.null(by.x) && is.null(by.y))) { + stop("one but not both of by.x / by.y is NULL") + } + if (!is.null(by)) by.x <- by.y <- by + + if (length(by.x) != length(by.y)) stop("lengths differ for by.y & by.x") + all_names_present(x, by.x) + all_names_present(y, by.y) + names(by.x) <- by.y + names(by.y) <- by.x + + if (length(mid.scales)>0) all_names_present(x, c("lex.dur", mid.scales)) + + whScale <- by.x %in% mid.scales + xScales <- by.x[whScale] + yScales <- by.y[whScale] + + if (length(yScales) > 0) { + + oldVals <- copy(with(x, mget(xScales))) + on.exit(set(x, j = xScales, value = oldVals)) + setattr(oldVals, "names", yScales) + + for (yVar in yScales) { + xVar <- xScales[yVar] + xBr <- sort(unique(y[[yVar]])) + xBr <- unique(c(xBr, Inf)) + set(x, j = xVar, value = cutLow(x[[xVar]] + x$lex.dur*0.5, breaks = xBr)) + } + + } + + ## ensure x retains order (no copy taken of it) + xKey <- key(x) + if (length(xKey) == 0) { + xKey <- makeTempVarName(x, pre = "sort_") + on.exit(if ("x" %in% ls()) setcolsnull(x, delete = xKey, soft = TRUE), add = TRUE) + on.exit(if ("z" %in% ls()) setcolsnull(z, delete = xKey, soft = TRUE), add = TRUE) + x[, (xKey) := 1:.N] + } + + if (any(duplicated(y, by = by.y))) { + stop("y is duplicated by the inferred/supplied by.y variables (", + paste0("'", by.y, "'", collapse = ", "), "). ", + "First ensure this is not so before proceeding.") + } + + ## avoid e.g. using merge.Lexis when x inherits Lexis + xClass <- class(x) + on.exit({ + setattr(x, "class", xClass) + }, add = TRUE) + setattr(x, "class", c("data.table", "data.frame")) + + ## return old numeric values of variables that were cutLow()'d + ## by keeping them + if (old.nums && length(xScales)) { + tmpXScales <- makeTempVarName(names = c(names(x), names(y)), pre = xScales) + set(x, j = tmpXScales, value = oldVals) + on.exit({ + xOrder <- setdiff(names(x), tmpXScales) + setcolsnull(x, delete = xScales, soft = TRUE) + setnames(x, tmpXScales, xScales) + setcolorder(x, xOrder) + + }, add = TRUE) + } + + ## merge + z <- merge(x, y, by.x = by.x, by.y = by.y, + all.x = all.x, all.y = all.y, all = all, + sort = FALSE) + + setDT(z) + if (old.nums && length(xScales)) { + ## avoid warning due to coercing double to integer + set(z, j = xScales, value = NULL) + setnames(z, tmpXScales, xScales) + } + + zOrder <- intersect(names(x), names(z)) + zOrder <- c(zOrder, setdiff(names(z), names(x))) + setcolorder(z, zOrder) + if (length(xKey) > 0) setkeyv(z, xKey) + z[] + +} + + +getOrigin <- function(x) { + ## input: Date, IDate, or date variable + ## output: the origin date in Date format, + ## the origin date being the date where the underlying index is zero. + if (inherits(x, "Date") || inherits(x, "IDate")) { + as.Date("1970-01-01") + } else if (inherits(x, "date")) { + as.Date("1960-01-01") + } else if (inherits(x, "dates")) { + as.Date(paste0(attr(x, "origin"), collapse = "-"), format = "%d-%m-%Y") + } else { + stop("class '", class(x), "' not supported; usage of Date recommended - see ?as.Date") + } + +} + +promptYN <- function(q) { + + rl <- readline(prompt = paste0(q, " (Y/N) ::: ")) + y <- c("y", "Y") + n <- c( "n", "N") + if (!rl %in% c(y,n)) { + cat("Answer must be one of the following (without ticks):", paste0("'",c(y, n),"'", collapse = ", ")) + promptYN(q = q) + } + + if (rl %in% y) TRUE else FALSE + +} + + + +oneWhitespace <- function(x) { + if (!is.character(x)) stop("x not a character") + x <- paste0(x, collapse = " ") + while(sum(grep(pattern = " ", x = x))) { + x <- gsub(pattern = " ", replacement = " ", x = x) + } + x +} + + +aliased_cols <- function(data, cols) { + + if (missing(cols)) cols <- names(data) + all_names_present(data, cols) + + if (length(cols) < 2L) return(invisible()) + + x <- with(data, mget(cols)) + x <- lapply(x, duplicated) + + sub_cols <- cols + tl <- list() + ## loop: each step reduce vector of names by one + ## to avoid testing the same variables twice (in both directions) + tick <- 0L + aliased <- FALSE + while (!aliased && length(sub_cols) > 1L && tick <= length(cols)) { + + currVar <- sub_cols[1L] + sub_cols <- setdiff(sub_cols, currVar) + tl[[currVar]] <- unlist(lapply(x[sub_cols], function(j) identical(x[[currVar]], j))) + aliased <- sum(tl[[currVar]]) + + tick <- tick + 1L + } + + if (tick == length(cols)) warning("while loop went over the number of columns argument cols") + + ## result: list of logical vectors indicating if a column is aliased + ## with other columns + tl[vapply(tl, function(j) sum(j) == 0L, logical(1))] <- NULL + + if (length(tl) == 0L) return(invisible()) + + ## take first vector for reporting + var <- names(tl)[1L] + aliases <- names(tl[[1L]])[tl[[1]]] + aliases <- paste0("'", aliases, "'", collapse = ", ") + stop("Variable '", var, "' is aliased with following variable(s): ", aliases, ".") + + invisible() +} + + + + + + + +return_DT <- function() { + + x <- getOption("popEpi.datatable") + if (!is.null(x) && !is.logical(x)) { + stop("the option 'popEpi.datatable' must be either NULL or a logical ", + "value (TRUE / FALSE).") + } + if (is.null(x) || isTRUE(x)) { + return(TRUE) + } + return(FALSE) + +} + + + + +#' @title Create a Lexis Object with Follow-up Time, Period, and Age +#' Time Scales +#' @description +#' This is a simple wrapper around \code{\link[Epi]{Lexis}} for creating +#' a \code{Lexis} object with the time scales \code{fot}, \code{per}, +#' and \code{age}. +#' @param data a \code{data.frame}; mandatory +#' @param birth the time of birth; A character string naming the variable in +#' data or an expression to evaluate - see +#' \link[=flexible_argument]{Flexible input} +#' @param entry the time at entry to follow-up; supplied the +#' same way as \code{birth} +#' @param exit the time at exit from follow-up; supplied the +#' same way as \code{birth} +#' @param entry.status passed on to \code{\link[Epi]{Lexis}} if not \code{NULL}; +#' supplied the same way as \code{birth} +#' @param exit.status passed on to \code{\link[Epi]{Lexis}} if not \code{NULL}; +#' supplied the same way as \code{birth} +#' @param subset a logical condition to subset by before passing data +#' and arguments to \code{\link[Epi]{Lexis}} +#' @param ... additional optional arguments passed on to +#' \code{\link[Epi]{Lexis}} +#' @return +#' A \code{Lexis} object with the usual columns that \code{Lexis} objects +#' have, with time scale columns \code{fot}, \code{per}, and \code{age}. +#' They are calculated as +#' +#' \code{fot = entry - entry} (to ensure correct format, e.g. difftime) +#' +#' \code{per = entry} +#' +#' and +#' +#' \code{age = entry - birth} +#' +#' @examples +#' +#' data("sire", package = "popEpi") +#' +#' lex <- Lexis_fpa(sire, +#' birth = "bi_date", +#' entry = dg_date, +#' exit = ex_date + 1L, +#' exit.status = "status") +#' +#' ## some special cases +#' myVar <- "bi_date" +#' l <- list(myVar = "bi_date") +#' sire$l <- sire$myVar <- 1 +#' +#' ## conflict: myVar taken from data when "bi_date" was intended +#' lex <- Lexis_fpa(sire, +#' birth = myVar, +#' entry = dg_date, +#' exit = ex_date + 1L, +#' exit.status = "status") +#' +#' ## no conflict with names in data +#' lex <- Lexis_fpa(sire, +#' birth = l$myVar, +#' entry = dg_date, +#' exit = ex_date + 1L, +#' exit.status = "status") +#' @export +Lexis_fpa <- function(data, + birth = NULL, + entry = NULL, + exit = NULL, + entry.status = NULL, + exit.status = NULL, + subset = NULL, + ...) { + if (!requireNamespace("Epi", quietly = TRUE)) { + stop("Install package Epi before using this function.") + } + TF <- environment() + PF <- parent.frame(1L) + + checkVars <- c("fot", "per", "age", + paste0("lex.", c("dur", "Xst", "Cst", "id"))) + checkVars <- intersect(names(data), checkVars) + if (length(checkVars)) { + stop("Following variable name(s) reserved but exist in data: ", + paste0(checkVars, collapse = ", ")) + } + + + sb <- substitute(subset) + subset <- evalLogicalSubset(data, sb, enclos = PF) + if (all(subset)) subset <- NULL + x <- subsetDTorDF(data = data, subset = subset) + setDT(x) + + an <- c("birth", "entry", "exit", "entry.status", "exit.status") + + l <- vector("list", length(an)) + names(l) <- an + for (stri in an) { + e <- paste0("substitute(", stri, ", env = TF)") + e <- parse(text = e)[[1]] + e <- eval(e, envir = TF) ## e.g. result of substitute(birth) + e <- evalPopArg(data = x, arg = e, enclos = PF)[[1]] + l[[stri]] <- e + } + + l[sapply(l, is.null)] <- NULL + + missVars <- setdiff(c("birth", "entry", "exit"), names(l)) + if (length(missVars)) { + stop("Following mandatory arguments were NULL: ", + paste0(missVars, collapse = ", ")) + } + + fot <- l$entry - l$entry + per <- l$entry + age <- l$entry - l$birth + per_exit <- l$exit + + en <- list(fot = fot, per = per, age = age) + ex <- list(per = per_exit) + + al <- list(entry = en, exit = ex, entry.status = l$entry.status, + exit.status = l$exit.status, data = x) + al[sapply(al, is.null)] <- NULL + + do.call(Epi::Lexis, args = c(al, ...)) +} + + + + + + + +get_breaks <- function(x) { + UseMethod("get_breaks") +} + +get_breaks.survtab <- function(x) { + + ss <- attributes(x)$survtab.meta$surv.scale + sb <- attributes(x)$survtab.meta$surv.breaks + + l <- list(sb) + names(l) <- ss + as.list(l) + +} + + +get_breaks.aggre <- function(x) { + + as.list(attributes(x)$aggre.meta$breaks) + +} + +get_breaks.Lexis <- function(x) { + as.list(attributes(x)$breaks) +} + +get_breaks.default <- function(x) { + NULL +} + + +select_breaks <- function(data, ...) { + UseMethod("select_breaks") +} + +select_breaks.default <- function(data, ts, br = NULL, ...) { + br <- do_select_breaks(data = data, ts = ts, br = br) + if (is.null(br)) { + stop("Data did not contain breaks and no breaks were supplied ", + "by hand.") + } + br +} + +select_breaks.aggre <- function(data, ts, br = NULL, ...) { + + + br <- do_select_breaks(data = data, ts = ts, br = br) + + select_breaks_subcheck(br, get_breaks(data)[[ts]], + "Manually supplied breaks were not a ", + "subset of the breaks in aggre data. ", + "Data has breaks as a result of being split and ", + "aggregated; see ?as.aggre and ?aggre") + + if (is.null(br)) { + stop("aggre object did not contain breaks and no breaks were supplied ", + "by hand.") + } + + br +} + +select_breaks.Lexis <- function(data, ts, br = NULL, ...) { + + checkLexisData(data) + + br <- do_select_breaks(data = data, ts = ts, br = br) + + select_breaks_subcheck(br, get_breaks(data)[[ts]], + "Manually supplied breaks were not a ", + "subset of the breaks in Lexis data. ", + "Data has breaks as a result of being a split Lexis ", + "object; see ?Lexis and e.g. ?splitMulti") + + if (is.null(br)) { + stop("Lexis object did not contain breaks and no breaks were supplied ", + "by hand.") + } + bl <- list(br) + names(bl) <- ts + checkBreaksList(data, breaks = bl) + + br +} + + +select_breaks_subcheck <- function(b1, b2, ...) { + l1 <- list(b1) + l2 <- list(b2) + names(l1) <- names(l2) <- "TS" + + if (!is.null(b1) && !is.null(b2) && !all_breaks_in(l1, l2)) { + stop(...) + } +} + +do_select_breaks <- function(data, ts, br = NULL) { + # @description selects breaks from data or from br depending on + # which one is NULL. If both exist, br must be a subset of the breaks + # in data. + + stopifnot(is.data.frame(data)) + stopifnot(is.character(ts) && length(ts) == 1L && ts %in% names(data)) + + dbr <- get_breaks(data)[[ts]] + + dbl <- list(dbr) + bl <- list(br) + names(dbl) <- names(bl) <- "TS" + + + + if (is.null(br)) br <- dbr + + br +} + + + + +breaks_in_data <- function(br, ts, data) { + ## note: last break does not usually appear in data, unless intentionally + ## limiting from e.g. 0:5 to 0:4 + stopifnot(length(ts) == 1 && ts %in% names(data)) + u <- unique(data[[ts]]) + + br <- sort(unique(br)) + if (length(br)<2) stop("There must be at least two breaks to form intervals") + + br <- if (max(br) <= max(u)) br else br[-length(br)] + all(br %in% u) + +} + + + + + +is_named_list <- function(x) is.list(x) && length(unique(names(x))) == length(x) + + + + +fuse_breakslists <- function(bl.old, bl.new, drop) { + # @description given two lists of breaks, uses all timescales found + # in both lists to fuse into one list. For common timescales an + # interval-based subset is taken, so that the new always limits the old + # when drop = TRUE. + + stopifnot( + is_named_list(bl.old), is_named_list(bl.new) + ) + + bl <- bl.old + new_scales <- setdiff(names(bl.old), names(bl.new)) + if (length(new_scales)) { + bl[new_scales] <- bl.new[new_scales] + } + common_scales <- intersect(names(bl.old), names(bl.new)) + if (length(common_scales)) { + + bl[common_scales] <- lapply(common_scales, function(time_scale) { + new <- bl.new[[time_scale]] + old <- bl.old[time_scale] + fuse <- sort(union(old, new)) + if (drop) { + r.new <- range(new) + r.old <- range(old) + r <- c(max(r.new[1], r.old[1]), min(r.new[2], r.old[2])) + fuse <- fuse[between(fuse, r[1], r[2], incbounds = TRUE)] + } + fuse + }) + + } + + bl + +} + + + + + + +set2 <- function(x, j, ...) { + cols_exst <- intersect(names(x), j) + old_order <- copy(names(x)) + if (length(cols_exst)) { + set(x, j = cols_exst, value = NULL) + } + set(x = x, j = j, ...) + new_cols <- setdiff(names(x), old_order) + setcolorder(x, c(old_order, new_cols)) + invisible(x) +} + + + + + +mget_cols <- function(cols, data) { + + stopifnot(all(cols %in% names(data))) + + setDT(mget(x = cols, envir = as.environment(data), inherits = FALSE)) +} + + + + + +get_random_seed <- function() { + t <- Sys.time() + s <- as.numeric(t) %% as.integer(t) + nc <- nchar(s) + s <- as.integer(substr(s, nc-8, nc)) + s +} + + + + + +skip_usually <- function() { + requireNamespace("testthat") + testthat::skip_on_cran() + testthat::skip_on_travis() + testthat::skip_on_appveyor() +} + + + + + + + + + + + + diff --git a/R/weighted_table.R b/R/weighted_table.R index 4f4d76e..63d1d9d 100644 --- a/R/weighted_table.R +++ b/R/weighted_table.R @@ -1,664 +1,662 @@ -#' @title Make a \code{data.table} of Tabulated, Aggregated Values and Weights -#' @description An internal function that aggregates a table -#' and merges in weights. -#' @param data DF/DT; passed to \code{envir} in \code{eval} -#' @param values values to tabulate. Anything \code{evalPopArg} can evaluate. -#' @param print variables to tabulate by and include in \code{prVars} in attributes -#' @param adjust variables to tabulate by and include in \code{adVars} in attributes -#' @param formula a formula such as \code{fot ~ sex} or \code{Surv(fot, lex.Xst) ~ sex} -#' @param Surv.response logical, if \code{TRUE} throws error if response in -#' \code{formula} is not a \code{Surv} object and vice versa -#' @param by.other other variables to tabulate by and include -#' in \code{boVars} in attributes -#' @param custom.levels a named list of values. When "inflating" the data -#' in the cross-join / cartesian join sense (try e.g. \code{merge(1:5, 1:2)}), -#' one can supply the levels to inflate by using this to ensure inflation is full. -#' E.g. data might only have levels present to do inflation analogous to -#' \code{merge(2:5, 1:2)} although \code{merge(1:5, 1:2)} is intended and -#' needed. -#' @param custom.levels.cut.low a character string vector of variable names. -#' These variables mentioned in \code{custom.levels} and existing in data -#' or first modified (in data) using \code{cutLow()} (essentially -#' \code{cut()} with \code{right = FALSE} and returning the lower bounds -#' as values). Handy for aggregating data e.g. to survival intervals. -#' \strong{NOTE}: the appropriate elements in \code{custom.levels} for these -#' variables must exceptionally contain an extra value as the roof used in -#' cutting, which will not be used in "inflating" the table using a merge. -#' See Examples. -#' @param weights a named list or long-form data.frame of weights. See Examples. -#' @param internal.weights.values the variable to use to compute internal -#' weights; only used if \code{weights = "internal"}. -#' @param enclos the enclosing environment passed on to \code{eval}. Variables -#' not found in \code{data} or searched for here. -#' @param NA.text a character string to display in a \code{warning} -#' if there are any rows with missing \code{values} or \code{adjust} values. -#' \strong{special:} key phrase \code{\%\%NA_COUNT\%\%} in text is replaced -#' with the count of missing observations. -#' E.g. \code{"Missing \%\%NA_COUNTS\%\% observations due to derpness."} -#' @examples -#' library(survival) -#' library(data.table) -#' -#' makeWeightsDT <- popEpi:::makeWeightsDT ## this avoids errors during tests -#' -#' sire <- copy(popEpi::sire) -#' set.seed(1L) -#' sire$sex <- rbinom(nrow(sire), 1, 0.5) -#' ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", -#' status = status %in% 1:2, pophaz = popmort, pp = FALSE, -#' aggre = list(sex, agegr = cut(dg_age, c(0,50,75,Inf)), fot), -#' fot = seq(0, 5, 1/12)) -#' ps <- quote(list(sex, fot)) -#' as <- quote(list(agegr)) -#' vs <- list(quote(list(pyrs, at.risk))) -#' ws <- list(agegr = c(0.2,0.4,0.4)) -#' -#' #### custom.levels usage -#' fb <- seq(0, 5-1/12, 1/12) ## exclude 5 as no row has that value -#' ag2 <- ag[fot > 0.5,] -#' # repeats fot intervals < 0.5 as empty rows -#' # may be the safest way to do this -#' dt <- makeWeightsDT(ag2, print = ps, adjust = as, -#' values = vs, weights = ws, -#' custom.levels = list(fot = fb)) -#' ## aggregate from intervals seq(0, 5, 1/12) to 0:5 -#' fb2 <- 0:5 ## (this time we include 5 as the roof) -#' dt <- makeWeightsDT(ag2, print = ps, adjust = as, -#' values = vs, weights = ws, -#' custom.levels = list(fot = fb2), -#' custom.levels.cut.low = "fot") -#' -#' -#' #### use of enclos -#' TF <- environment() -#' gender <- factor(ag$sex) -#' dt <- makeWeightsDT(ag, print = quote(gender), adjust = as, -#' values = vs, weights = ws, enclos = TF) -#' ## or NULL: uses calling frame by default. -#' dt <- makeWeightsDT(ag, print = quote(gender), adjust = as, -#' values = vs, weights = ws, -#' enclos = NULL) -#' ## passing parent.fram(1) is the same thing (as below), -#' ## but won't pass in testing these examples somehow (but work in real life) -#' # dt <- makeWeightsDT(ag, print = quote(gender), adjust = as, -#' # values = vs, weights = ws, -#' # enclos = NULL) -#' -#' #### formula usage -#' form <- Surv(fot, factor(from0to1))~gender -#' dt <- makeWeightsDT(ag, formula = form, Surv.response = TRUE, -#' adjust = as, values = vs, weights = ws, -#' enclos = NULL) -#' -#' ## or -#' form <- Surv(fot, factor(from0to1))~gender + adjust(agegr) -#' dt <- makeWeightsDT(ag, formula = form, Surv.response = TRUE, -#' adjust = NULL, values = vs, weights = ws, -#' enclos = NULL) -#' -#' ## or -#' form <- from0to1 ~ fot + gender + adjust(agegr) -#' dt <- makeWeightsDT(ag, formula = form, Surv.response = FALSE, -#' adjust = NULL, values = vs, weights = ws, -#' enclos = NULL) -#' -#' form <- from0to1 ~ fot + adjust(agegr) + adjust(sex) -#' ws2 <- list(agegr = c(0.33, 0.33, 0.33), sex = c(0.5, 0.5)) -#' dt <- makeWeightsDT(ag, formula = form, Surv.response = FALSE, -#' adjust = NULL, values = vs, weights = ws2, -#' enclos = NULL) -#' -#' ## international standard pops -#' ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", -#' status = status %in% 1:2, pophaz = popmort, pp = FALSE, -#' aggre = list(sex, agegr = cut(dg_age, c(seq(0, 85, 5), Inf)), fot), -#' fot = seq(0, 5, 1/12)) -#' -#' form <- from0to1 ~ fot + adjust(agegr) -#' dt <- makeWeightsDT(ag, formula = form, Surv.response = FALSE, -#' adjust = NULL, values = vs, weights = "world_1966_18of5", -#' enclos = NULL) -#' -#' form <- from0to1 ~ fot + adjust(agegr, sex) -#' dt <- makeWeightsDT(ag, formula = form, Surv.response = FALSE, -#' adjust = NULL, values = vs, -#' weights = list(agegr = "nordic_2000_18of5", sex=c(1,1)), -#' enclos = NULL) -makeWeightsDT <- function(data, values = NULL, - print = NULL, adjust = NULL, - formula = NULL, Surv.response = TRUE, - by.other = NULL, custom.levels = NULL, - custom.levels.cut.low = NULL, weights = NULL, - internal.weights.values = NULL, - enclos = NULL, NA.text = NULL) { - - # environmentalism ----------------------------------------------------------- - TF <- environment() - PF <- parent.frame(1L) - if (missing(enclos) || is.null(enclos)) { - enclos <- PF - } - - enclos <- eval(enclos, envir = TF) - - if (!is.environment(enclos)) { - stop("Argument 'enclos' is not an environment. (Probably internal error, ", - "meaning you should complain to the package maintainer if you are not", - "doing something silly.)") - } - - THIS_CALL <- match.call() - - ## dataism ------------------------------------------------------------------- - if (!is.data.frame(data)) stop("data must be a data.frame") - ## tmpDum for convenience. will be deleted in the end. (if no tabulating vars) - origData <- data - tmpDum <- makeTempVarName(origData, pre = "dummy_") - data <- data.table(rep(1L, nrow(origData))) - setnames(data, 1, tmpDum) - - - # formula: vars to print and adjust by --------------------------------------- - - adSub <- adjust - adjust <- evalPopArg(data = origData, arg = adSub, DT = TRUE, enclos = enclos) - - if (!is.null(formula)) { - - foList <- usePopFormula(formula, adjust = adjust, - data = origData, enclos = enclos, - Surv.response = Surv.response) - print <- foList$print - adjust <- foList$adjust - } else { - - prSub <- substitute(print) - print <- evalPopArg(data = origData, arg = prSub, DT = TRUE, enclos = enclos) - - - } - - if (length(weights) && length(adjust)) { - checkWeights(weights, adjust = adjust) - } - - # variables to print by ---------------------------------------------------- - prVars <- tmpDum - if (length(print) > 0) { - prVars <- names(print) - data[, c(prVars) := TF$print] - data[, c(tmpDum) := NULL] - } - rm(print) - - # standardization ---------------------------------------------------------- - ## note: adjust evaluated above with formula - adVars <- NULL - if (length(adjust) > 0) { - adVars <- names(adjust) - data[, c(adVars) := TF$adjust] - } - rm(adjust) - - if (is.null(weights) && length(adVars)) { - stop("Variables to adjust by were defined but no weights were supplied.") - } - - if (!length(adVars)) { - - if (!is.null(weights)) { - message("NOTE: Weights ignored since no adjusting variables given") - } - - weights <- NULL - } - - # variables to sum ----------------------------------------------------------- - if (!is.list(values)) stop("Argument 'values' must be a list ", - "(internal error: complain to the package", - "maintainer if you see this)") - values <- lapply(values, function(x) { - evalPopArg(data = origData, arg = x, DT = TRUE, enclos = enclos) - }) - for (dt in setdiff(seq_along(values), 1L)) { - values[[1L]] <- cbind(values[[1L]], values[[dt]]) - } - values <- values[[1L]] - vaVars <- NULL - if (nrow(values) != nrow(data)) { - stop("mismatch in numers of rows in data (", nrow(data), - ") and 'values' (", nrow(values), "). If you see this message, ", - "complain to the package maintainer.") - } - - if (length(values) > 0) { - vaVars <- names(values) - data[, c(vaVars) := TF$values] - } else { - stop("no values given to sum!") - } - rm(values) - - # additionally, values to compute internal weights by: ----------------------- - iwVar <- NULL - if (is.character(weights) && - pmatch(weights, c("internal", "cohort"), nomatch = 0L)) { - iw <- substitute(internal.weights.values) - iw <- evalPopArg(data = origData, iw, DT = TRUE, - enclos = PF, recursive = TRUE, - types = c("character", "expression", "list", "NULL")) - - if (length(iw) > 1L) stop("Argument 'internal.weights.values' ", - "must produce only one column.") - if (length(iw) == 1L && is.character(weights) && - pmatch(weights, c("internal", "cohort"), nomatch = 0L)) { - iwVar <- makeTempVarName(names=c(names(data), names(origData)), pre = "iw_") - data[, c(iwVar) := TF$iw] - } - - if (length(iwVar) == 0L) { - stop("Requested computing internal weights, but no values to compute ", - "internals weights with were supplied (internal error: If you see ", - "this, complain to the package maintainer).") - } - rm(iw) - } - - - # other category vars to keep ------------------------------------------------ - boSub <- by.other - by.other <- evalPopArg(data = origData, arg = boSub, DT = TRUE, enclos = enclos) - boVars <- NULL - if (length(by.other) > 0) { - boVars <- names(by.other) - data[, c(boVars) := TF$by.other] - } - rm(by.other) - - # check for aliased columns -------------------------------------------------- - aliased_cols(data, cols = c(prVars, adVars, boVars)) - - # check for conflicting column names ----------------------------------------- - dupCols <- c(prVars, adVars, boVars, vaVars, iwVar) - dupCols <- unique(dupCols[duplicated(dupCols)]) - if (length(dupCols) > 0L) { - dupCols <- paste0("'", dupCols, "'", collapse = ", ") - stop("Following column names duplicated (columns created by arguments ", - "print, adjust, etc.): ", dupCols, ". If you see this, please ensure ", - "you are not passing e.g. the same column to both for adjusting ", - "and stratification (printing).") - } - - # check for NA values -------------------------------------------------------- - ## NOTE: NA values of print/by.other are OK. values/adjust are NOT. - - NAs <- data[, lapply(.SD, function(x) is.na(x)), .SDcols = c(vaVars, iwVar, adVars)] - NAs <- rowSums(NAs) > 0L - if (sum(NAs)) { - if (!is.null(NA.text)) { - NA.text <- gsub(x = NA.text, pattern = "%%NA_COUNT%%", - replacement = sum(NAs)) - warning(NA.text) - } - data <- data[!NAs] - } - - # inflate data --------------------------------------------------------------- - ## on the other hand we aggregate data to levels of print, adjust and - ## by.other; on the other hand the data will always have tabulating variables - ## represented as cross-joined, e.g. merge(1:5, 1:2). - ## this means some rows might have zeros as values in the 'values' - ## columns. - ## (necessary for correct standardization with weights) - - ## NOTE: have to do CJ by hand: some levels of adjust or something may not - ## have each level of e.g. fot repeated! - - sortedLevs <- function(x) { - if (!is.factor(x)) return(sort(unique(x))) - - factor(levels(x), levels(x), levels(x)) - } - cj <- list() - cj <- lapply(data[, .SD, .SDcols = c(prVars, adVars, boVars)], sortedLevs) - - ## e.g. if data only has fot = seq(0, 4, 1/12), but want to display table - ## with fot = seq(0, 5, 1/12). Very important sometimes for handy usage - ## of weights. - if (length(custom.levels) > 0) cj[names(custom.levels)] <- custom.levels - - - ## SPECIAL: if e.g. a survival time scale with breaks seq(0, 5, 1/12) - ## is to be "compressed" to breaks 0:5, and the latter breaks were passed - ## via custom.levels, the following ensures e.g. intervals between 0 and 1 - ## are aggregated to the same row in what follows after. - if (!is.null(custom.levels.cut.low)) { - cl_msg <- paste0("Internal error: tried to cut() variables in ", - "internally used work data that did not exist. ", - "If you see this, complain to the ", - "package maintainer. Bad variables: %%VARS%%.") - all_names_present(data, custom.levels.cut.low, msg = cl_msg) - all_names_present(cj, custom.levels.cut.low, msg = cl_msg) - - for (var in custom.levels.cut.low) { - set(data, j = var, value = cutLow(data[[var]], breaks = cj[[var]])) - } - - ## NOTE: if used cutlow(), then assume passed values via custom.levels - ## also contained the roof of the values which should not be repeated. - cj[custom.levels.cut.low] <- lapply(cj[custom.levels.cut.low], - function(elem) elem[-length(elem)]) - } - - ## form data.table to merge by - the merge will inflate the data. - cj <- do.call(function(...) CJ(..., unique = FALSE, sorted = FALSE), cj) - - ## inflate & aggregate. - setkeyv(data, c(prVars, adVars, boVars)) - data <- data[cj, lapply(.SD, sum), .SDcols = c(vaVars, iwVar), by = .EACHI] - - for (k in c(vaVars, iwVar)) { - data[is.na(get(k)), (k) := 0] - } - - setcolsnull(data, tmpDum) - prVars <- setdiff(prVars, tmpDum); if (length(prVars) == 0) prVars <- NULL - - ## merge in weights ---------------------------------------------------------- - - if (!is.null(weights)) { - - if (is.list(weights) && !is.data.frame(weights)) { - ## in case one of the elements is a standardization scheme string, - ## such as "world_x_y". - whChar <- which(unlist(lapply(weights, is.character))) - if (sum(whChar)) { - - weights[whChar] <- lapply(weights[whChar], function(string) { - ## expected to return data.frame with 1) age groups 2) weights - ## as columns. - stdr.weights(string)[[2]] - }) - - } - - ## now list only contains numeric weights i hope. - - } - - ## NOTE: adjust used here to contain levels of adjust arguments only - adjust <- list() - if (length(adVars) > 0L) { - adjust <- lapply(data[, eval(adVars), with = FALSE], sortedLevs) - } - - if (is.character(weights)) { - - if (pmatch(weights, c("internal", "cohort"), nomatch = 0L)) { - - - all_names_present(data, iwVar, - msg = paste0( - "Internal error: expected to have variable ", - "%%VARS%% in working data but didn't. Complain ", - "to the pkg maintainer if you see this." - )) - - weights <- lapply(seq_along(adjust), function(i) { - cn <- names(adjust)[i] - le <- unique(adjust[[i]]) - le <- structure(list(le), names = cn) - data[le, lapply(.SD, sum), .SDcols = iwVar, by = .EACHI, on = cn][[iwVar]] - }) - names(weights) <- names(adjust) - - - setcolsnull(data, iwVar) - setkeyv(data, c(prVars, adVars, boVars)) - } else { - ## expected to return data.frame with 1) age groups 2) weights - ## as columns. - weights <- stdr.weights(weights) - weights <- weights[[2]] - - } - } - - if (!is.data.frame(weights) && is.vector(weights)) { - ## note: lists are vectors - if (!is.list(weights)) { - weights <- list(weights) ## was a vector of values - setattr(weights, "names", adVars[1]) - } - - weVars <- names(weights) - weights <- weights[names(adjust)] - - adjust <- do.call(function(...) CJ(..., unique = FALSE, sorted = FALSE), adjust) - weights <- do.call(function(...) CJ(..., unique = FALSE, sorted = FALSE), weights) - - weVars <- paste0(weVars, ".w") - setnames(weights, adVars, weVars) - weights[, (adVars) := adjust] - - set(weights, j = "weights", value = 1L) - for (k in weVars) { - set(weights, j = "weights", value = weights$weights * weights[[k]]) - } - setcolsnull(weights, delete = weVars, soft = FALSE) - - ## NOTE: weights will be repeated for each level of print, - ## and for each level of print the weights must sum to one for things - ## to work. - weights[, weights := weights/sum(weights)] - - } - - if (!is.data.frame(weights)) { - stop("Something went wrong: 'weights' was not collated into a ", - "data.frame to merge with data. ", - "Blame the package maintainer please!") - } - ## at this points weights is a data.frame. - weights <- data.table(weights) - weights[, weights := as.double(weights)] - - ## ensure repetition by print levels if some adjust levels - ## that exist in weights do not exist in data. - ## NOTE: weights data.frame has at least as many levels as adjust column - ## in data (or it has more sometimes). - wm <- lapply(adVars, function(chStr) { - col <- weights[[chStr]] - if (is.factor(col)) return(levels(col)) - sort(unique(col)) - }) - names(wm) <- adVars - if (length(prVars)) { - wm[prVars] <- lapply(prVars, function(chStr) { - col <- data[[chStr]] - if (is.factor(col)) return(levels(col)) - sort(unique(col)) - }) - } - wm <- do.call(CJ, wm) - setDT(wm) - - weights <- merge(wm, weights, by = adVars, all.x = TRUE, all.y = TRUE) - - byCols <- subsetDTorDF(weights, select = prVars) - if (!length(prVars)) byCols <- NULL - weights[, weights := weights/sum(weights), by = eval(byCols)] - rm(byCols) - - data <- merge(data, weights, by = c(prVars, adVars), - all.x = TRUE, all.y = FALSE) - - if (any(is.na(data$weights))) { - ## should not be any NAs since we checked for level congruence - ## in checkWeights - stop("Internal error: some weights were NA after merging to working ", - "data. Complain to the package maintainer if you see this.") - } - - - } - - setattr(data, "makeWeightsDT", list(prVars = prVars, adVars = adVars, - boVars = boVars, vaVars = vaVars, - NAs = NAs)) - return(data[]) - -} - -checkCharWeights <- function(w) { - if (is.character(w)) { - if (length(w) != 1L) { - stop("weights supplied as a character string must be of length one.") - } - if (!pmatch(w, c("internal", "cohort"), nomatch = 0L)) { - stdr.weights(w) - } - } -} - -checkWeights <- function(weights, adjust) { - ## INTENTION: given a list/DF/vector/string specifying weights - ## and a data.frame/list of the adjusting variables, - ## checks they are congruent and complains if not. - allowed_classes <- c("list","data.frame","integer","numeric","character", - "NULL") - if (!any(class(weights) %in% allowed_classes)) { - stop("weights must be either a list, a data.frame, a numeric variable, ", - "or a character string specifing the weighting scheme to use. ", - "See ?direct_standardization for more information.") - } - - if (is.list(weights) && !is.data.frame(weights) && - length(adjust) != length(weights)) { - stop("Mismatch in numbers of variables (NOT necessarily in the numbers of ", - "levels/values within the variables) in adjust (", length(adjust), - " variables) and weights (", length(weights)," variables); ", - "make sure each given weights vector has a corresponding ", - "variable in adjust and vice versa. ", - "See ?direct_standardization for more information.") - } - - if (is.list(weights)) { - isChar <- unlist(lapply(weights, is.character)) - if (any(isChar)) { - lapply(weights[isChar], checkCharWeights) - weights[isChar] <- lapply(weights[isChar], function(string) { - if (pmatch(string, c("cohort", "internal"), nomatch = 0L)) { - stop("List of weights had 'cohort' or 'internal' as at least one ", - "element, which is currently not supported. ", - "See ?direct_standardization for more information.") - } - stdr.weights(string)[[2]] - }) - } - } - - if (is.character(weights)) { - checkCharWeights(weights) - if (pmatch(weights, c("internal", "cohort"), nomatch = 0L)) { - ## done checking since internal weights are pretty fool-proof. - return(invisible()) - } - ## if not, pass along as vector of weights. - weights <- stdr.weights(weights)[[2]] - } - - if (is.numeric(weights)) { - if (length(adjust) != 1L) { - stop("Weights is a numeric vector of weights, ", - "but there are more or less than one adjusting variable. ", - "See ?direct_standardization for more information.") - } - weights <- list(weights) - names(weights) <- names(adjust) - } - - ## by now either a list or a data.frame of weights... - adVars <- names(adjust) - weVars <- names(weights) - if (is.data.frame(weights)) { - if (!"weights" %in% weVars) { - stop("data.frame of weights did not have column named 'weights'. ", - "see ?direct_standardization for more information.") - } - weVars <- setdiff(weVars, "weights") - } - - badAdVars <- setdiff(adVars, weVars) - badWeVars <- setdiff(weVars, adVars) - if (length(badAdVars) > 0) { - stop("Mismatch in names of variables in adjust and weights; ", - "following adjust variables not mentioned in weights: ", - paste0("'", badAdVars, "'", collapse = ", ")) - } - - if (length(badWeVars) > 0) { - stop("Mismatch in names of variables in adjust and weights; ", - "following weights variables not mentioned in adjust: ", - paste0("'", badWeVars, "'", collapse = ", ")) - } - - if (is.data.frame(weights)) { - - levDiff <- lapply(names(adjust), function(var) { - !all(adjust[[var]] %in% weights[[var]]) - }) - levDiff <- unlist(levDiff) - if (any(levDiff)) { - ## take only first conflicting variable for brevity of error message- - badVar <- names(adjust)[1] - badLevs <- setdiff(adjust[[badVar]], weights[[badVar]]) - badLevs <- paste0("'", badLevs, "'", collapse = ", ") - stop("Missing levels in weights data.frame in variable '", badVar, "': ", - badLevs, ". These levels were found to exist in the corresponding ", - "adjusting variable. ", - "Usual suspects: adjusting variable is a factor and you ", - "only supplied weights for unique values in your data ", - "as opposed to the levels of the factor, which may contain levels ", - "that no row has. Try table(yourdata$yourvariable).") - } - - } else { - weights <- as.list(weights) - weights <- weights[adVars] - - ## check variable levels - adjust <- lapply(adjust, function(elem) { - if (is.factor(elem)) { - levels(elem) - } else { - sort(unique(elem)) - } - }) - - weLen <- unlist(lapply(weights, length)) - adLen <- unlist(lapply(adjust, length)) - badLen <- names(adjust)[weLen != adLen] - - if (length(badLen) > 0) { - stop("Mismatch in numbers of levels/unique values in adjusting variables ", - "and lengths of corresponding weights vectors. ", - "Names of mismatching variables: ", - paste0("'", badLen, "'", collapse = ", "), ". There were ", - weLen[weLen != adLen], " weights and ", adLen[weLen != adLen], - " adjusting variable levels.") - } - } - - - - invisible() -} - - - - - - - - +#' @title Make a \code{data.table} of Tabulated, Aggregated Values and Weights +#' @description An internal function that aggregates a table +#' and merges in weights. +#' @param data DF/DT; passed to \code{envir} in \code{eval} +#' @param values values to tabulate. Anything \code{evalPopArg} can evaluate. +#' @param print variables to tabulate by and include in \code{prVars} in attributes +#' @param adjust variables to tabulate by and include in \code{adVars} in attributes +#' @param formula a formula such as \code{fot ~ sex} or \code{Surv(fot, lex.Xst) ~ sex} +#' @param Surv.response logical, if \code{TRUE} throws error if response in +#' \code{formula} is not a \code{Surv} object and vice versa +#' @param by.other other variables to tabulate by and include +#' in \code{boVars} in attributes +#' @param custom.levels a named list of values. When "inflating" the data +#' in the cross-join / cartesian join sense (try e.g. \code{merge(1:5, 1:2)}), +#' one can supply the levels to inflate by using this to ensure inflation is full. +#' E.g. data might only have levels present to do inflation analogous to +#' \code{merge(2:5, 1:2)} although \code{merge(1:5, 1:2)} is intended and +#' needed. +#' @param custom.levels.cut.low a character string vector of variable names. +#' These variables mentioned in \code{custom.levels} and existing in data +#' or first modified (in data) using \code{cutLow()} (essentially +#' \code{cut()} with \code{right = FALSE} and returning the lower bounds +#' as values). Handy for aggregating data e.g. to survival intervals. +#' \strong{NOTE}: the appropriate elements in \code{custom.levels} for these +#' variables must exceptionally contain an extra value as the roof used in +#' cutting, which will not be used in "inflating" the table using a merge. +#' See Examples. +#' @param weights a named list or long-form data.frame of weights. See Examples. +#' @param internal.weights.values the variable to use to compute internal +#' weights; only used if \code{weights = "internal"}. +#' @param enclos the enclosing environment passed on to \code{eval}. Variables +#' not found in \code{data} or searched for here. +#' @param NA.text a character string to display in a \code{warning} +#' if there are any rows with missing \code{values} or \code{adjust} values. +#' \strong{special:} key phrase \code{\%\%NA_COUNT\%\%} in text is replaced +#' with the count of missing observations. +#' E.g. \code{"Missing \%\%NA_COUNTS\%\% observations due to derpness."} +#' @examples +#' library(survival) +#' library(data.table) +#' +#' makeWeightsDT <- popEpi:::makeWeightsDT ## this avoids errors during tests +#' +#' sire <- copy(popEpi::sire) +#' set.seed(1L) +#' sire$sex <- rbinom(nrow(sire), 1, 0.5) +#' ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", +#' status = status %in% 1:2, pophaz = popmort, pp = FALSE, +#' aggre = list(sex, agegr = cut(dg_age, c(0,50,75,Inf)), fot), +#' fot = seq(0, 5, 1/12)) +#' ps <- quote(list(sex, fot)) +#' as <- quote(list(agegr)) +#' vs <- list(quote(list(pyrs, at.risk))) +#' ws <- list(agegr = c(0.2,0.4,0.4)) +#' +#' #### custom.levels usage +#' fb <- seq(0, 5-1/12, 1/12) ## exclude 5 as no row has that value +#' ag2 <- ag[fot > 0.5,] +#' # repeats fot intervals < 0.5 as empty rows +#' # may be the safest way to do this +#' dt <- makeWeightsDT(ag2, print = ps, adjust = as, +#' values = vs, weights = ws, +#' custom.levels = list(fot = fb)) +#' ## aggregate from intervals seq(0, 5, 1/12) to 0:5 +#' fb2 <- 0:5 ## (this time we include 5 as the roof) +#' dt <- makeWeightsDT(ag2, print = ps, adjust = as, +#' values = vs, weights = ws, +#' custom.levels = list(fot = fb2), +#' custom.levels.cut.low = "fot") +#' +#' +#' #### use of enclos +#' TF <- environment() +#' gender <- factor(ag$sex) +#' dt <- makeWeightsDT(ag, print = quote(gender), adjust = as, +#' values = vs, weights = ws, enclos = TF) +#' ## or NULL: uses calling frame by default. +#' dt <- makeWeightsDT(ag, print = quote(gender), adjust = as, +#' values = vs, weights = ws, +#' enclos = NULL) +#' ## passing parent.fram(1) is the same thing (as below), +#' ## but won't pass in testing these examples somehow (but work in real life) +#' # dt <- makeWeightsDT(ag, print = quote(gender), adjust = as, +#' # values = vs, weights = ws, +#' # enclos = NULL) +#' +#' #### formula usage +#' form <- Surv(fot, factor(from0to1))~gender +#' dt <- makeWeightsDT(ag, formula = form, Surv.response = TRUE, +#' adjust = as, values = vs, weights = ws, +#' enclos = NULL) +#' +#' ## or +#' form <- Surv(fot, factor(from0to1))~gender + adjust(agegr) +#' dt <- makeWeightsDT(ag, formula = form, Surv.response = TRUE, +#' adjust = NULL, values = vs, weights = ws, +#' enclos = NULL) +#' +#' ## or +#' form <- from0to1 ~ fot + gender + adjust(agegr) +#' dt <- makeWeightsDT(ag, formula = form, Surv.response = FALSE, +#' adjust = NULL, values = vs, weights = ws, +#' enclos = NULL) +#' +#' form <- from0to1 ~ fot + adjust(agegr) + adjust(sex) +#' ws2 <- list(agegr = c(0.33, 0.33, 0.33), sex = c(0.5, 0.5)) +#' dt <- makeWeightsDT(ag, formula = form, Surv.response = FALSE, +#' adjust = NULL, values = vs, weights = ws2, +#' enclos = NULL) +#' +#' ## international standard pops +#' ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", +#' status = status %in% 1:2, pophaz = popmort, pp = FALSE, +#' aggre = list(sex, agegr = cut(dg_age, c(seq(0, 85, 5), Inf)), fot), +#' fot = seq(0, 5, 1/12)) +#' +#' form <- from0to1 ~ fot + adjust(agegr) +#' dt <- makeWeightsDT(ag, formula = form, Surv.response = FALSE, +#' adjust = NULL, values = vs, weights = "world_1966_18of5", +#' enclos = NULL) +#' +#' form <- from0to1 ~ fot + adjust(agegr, sex) +#' dt <- makeWeightsDT(ag, formula = form, Surv.response = FALSE, +#' adjust = NULL, values = vs, +#' weights = list(agegr = "nordic_2000_18of5", sex=c(1,1)), +#' enclos = NULL) +makeWeightsDT <- function(data, values = NULL, + print = NULL, adjust = NULL, + formula = NULL, Surv.response = TRUE, + by.other = NULL, custom.levels = NULL, + custom.levels.cut.low = NULL, weights = NULL, + internal.weights.values = NULL, + enclos = NULL, NA.text = NULL) { + + # environmentalism ----------------------------------------------------------- + TF <- environment() + PF <- parent.frame(1L) + if (missing(enclos) || is.null(enclos)) { + enclos <- PF + } + + enclos <- eval(enclos, envir = TF) + + if (!is.environment(enclos)) { + stop("Argument 'enclos' is not an environment. (Probably internal error, ", + "meaning you should complain to the package maintainer if you are not", + "doing something silly.)") + } + + THIS_CALL <- match.call() + + ## dataism ------------------------------------------------------------------- + if (!is.data.frame(data)) stop("data must be a data.frame") + ## tmpDum for convenience. will be deleted in the end. (if no tabulating vars) + origData <- data + tmpDum <- makeTempVarName(origData, pre = "dummy_") + data <- data.table(rep(1L, nrow(origData))) + setnames(data, 1, tmpDum) + + + # formula: vars to print and adjust by --------------------------------------- + + adSub <- adjust + adjust <- evalPopArg(data = origData, arg = adSub, DT = TRUE, enclos = enclos) + + if (!is.null(formula)) { + + foList <- usePopFormula(formula, adjust = adjust, + data = origData, enclos = enclos, + Surv.response = Surv.response) + print <- foList$print + adjust <- foList$adjust + } else { + + prSub <- substitute(print) + print <- evalPopArg(data = origData, arg = prSub, DT = TRUE, enclos = enclos) + + + } + + if (length(weights) && length(adjust)) { + checkWeights(weights, adjust = adjust) + } + + # variables to print by ---------------------------------------------------- + prVars <- tmpDum + if (length(print) > 0) { + prVars <- names(print) + data[, c(prVars) := TF$print] + data[, c(tmpDum) := NULL] + } + rm(print) + + # standardization ---------------------------------------------------------- + ## note: adjust evaluated above with formula + adVars <- NULL + if (length(adjust) > 0) { + adVars <- names(adjust) + data[, c(adVars) := TF$adjust] + } + rm(adjust) + + if (is.null(weights) && length(adVars)) { + stop("Variables to adjust by were defined but no weights were supplied.") + } + + if (!length(adVars)) { + + if (!is.null(weights)) { + message("NOTE: Weights ignored since no adjusting variables given") + } + + weights <- NULL + } + + # variables to sum ----------------------------------------------------------- + if (!is.list(values)) stop("Argument 'values' must be a list ", + "(internal error: complain to the package", + "maintainer if you see this)") + values <- lapply(values, function(x) { + evalPopArg(data = origData, arg = x, DT = TRUE, enclos = enclos) + }) + for (dt in setdiff(seq_along(values), 1L)) { + values[[1L]] <- cbind(values[[1L]], values[[dt]]) + } + values <- values[[1L]] + vaVars <- NULL + if (nrow(values) != nrow(data)) { + stop("mismatch in numers of rows in data (", nrow(data), + ") and 'values' (", nrow(values), "). If you see this message, ", + "complain to the package maintainer.") + } + + if (length(values) > 0) { + vaVars <- names(values) + data[, c(vaVars) := TF$values] + } else { + stop("no values given to sum!") + } + rm(values) + + # additionally, values to compute internal weights by: ----------------------- + iwVar <- NULL + if (is.character(weights) && + pmatch(weights, c("internal", "cohort"), nomatch = 0L)) { + iw <- substitute(internal.weights.values) + iw <- evalPopArg(data = origData, iw, DT = TRUE, + enclos = PF, recursive = TRUE, + types = c("character", "expression", "list", "NULL")) + + if (length(iw) > 1L) stop("Argument 'internal.weights.values' ", + "must produce only one column.") + if (length(iw) == 1L && is.character(weights) && + pmatch(weights, c("internal", "cohort"), nomatch = 0L)) { + iwVar <- makeTempVarName(names=c(names(data), names(origData)), pre = "iw_") + data[, c(iwVar) := TF$iw] + } + + if (length(iwVar) == 0L) { + stop("Requested computing internal weights, but no values to compute ", + "internals weights with were supplied (internal error: If you see ", + "this, complain to the package maintainer).") + } + rm(iw) + } + + + # other category vars to keep ------------------------------------------------ + boSub <- by.other + by.other <- evalPopArg(data = origData, arg = boSub, DT = TRUE, enclos = enclos) + boVars <- NULL + if (length(by.other) > 0) { + boVars <- names(by.other) + data[, c(boVars) := TF$by.other] + } + rm(by.other) + + # check for aliased columns -------------------------------------------------- + aliased_cols(data, cols = c(prVars, adVars, boVars)) + + # check for conflicting column names ----------------------------------------- + dupCols <- c(prVars, adVars, boVars, vaVars, iwVar) + dupCols <- unique(dupCols[duplicated(dupCols)]) + if (length(dupCols) > 0L) { + dupCols <- paste0("'", dupCols, "'", collapse = ", ") + stop("Following column names duplicated (columns created by arguments ", + "print, adjust, etc.): ", dupCols, ". If you see this, please ensure ", + "you are not passing e.g. the same column to both for adjusting ", + "and stratification (printing).") + } + + # check for NA values -------------------------------------------------------- + ## NOTE: NA values of print/by.other are OK. values/adjust are NOT. + + NAs <- data[, lapply(.SD, function(x) is.na(x)), .SDcols = c(vaVars, iwVar, adVars)] + NAs <- rowSums(NAs) > 0L + if (sum(NAs)) { + if (!is.null(NA.text)) { + NA.text <- gsub(x = NA.text, pattern = "%%NA_COUNT%%", + replacement = sum(NAs)) + warning(NA.text) + } + data <- data[!NAs] + } + + # inflate data --------------------------------------------------------------- + ## on the other hand we aggregate data to levels of print, adjust and + ## by.other; on the other hand the data will always have tabulating variables + ## represented as cross-joined, e.g. merge(1:5, 1:2). + ## this means some rows might have zeros as values in the 'values' + ## columns. + ## (necessary for correct standardization with weights) + + ## NOTE: have to do CJ by hand: some levels of adjust or something may not + ## have each level of e.g. fot repeated! + + sortedLevs <- function(x) { + if (!is.factor(x)) return(sort(unique(x))) + + factor(levels(x), levels(x), levels(x)) + } + cj <- list() + cj <- lapply(data[, .SD, .SDcols = c(prVars, adVars, boVars)], sortedLevs) + + ## e.g. if data only has fot = seq(0, 4, 1/12), but want to display table + ## with fot = seq(0, 5, 1/12). Very important sometimes for handy usage + ## of weights. + if (length(custom.levels) > 0) cj[names(custom.levels)] <- custom.levels + + + ## SPECIAL: if e.g. a survival time scale with breaks seq(0, 5, 1/12) + ## is to be "compressed" to breaks 0:5, and the latter breaks were passed + ## via custom.levels, the following ensures e.g. intervals between 0 and 1 + ## are aggregated to the same row in what follows after. + if (!is.null(custom.levels.cut.low)) { + cl_msg <- paste0("Internal error: tried to cut() variables in ", + "internally used work data that did not exist. ", + "If you see this, complain to the ", + "package maintainer. Bad variables: %%VARS%%.") + all_names_present(data, custom.levels.cut.low, msg = cl_msg) + all_names_present(cj, custom.levels.cut.low, msg = cl_msg) + + for (var in custom.levels.cut.low) { + set(data, j = var, value = cutLow(data[[var]], breaks = cj[[var]])) + } + + ## NOTE: if used cutlow(), then assume passed values via custom.levels + ## also contained the roof of the values which should not be repeated. + cj[custom.levels.cut.low] <- lapply(cj[custom.levels.cut.low], + function(elem) elem[-length(elem)]) + } + + ## form data.table to merge by - the merge will inflate the data. + cj <- do.call(function(...) CJ(..., unique = FALSE, sorted = FALSE), cj) + + ## inflate & aggregate. + setkeyv(data, c(prVars, adVars, boVars)) + data <- data[cj, lapply(.SD, sum), .SDcols = c(vaVars, iwVar), by = .EACHI] + + for (k in c(vaVars, iwVar)) { + data[is.na(get(k)), (k) := 0] + } + + setcolsnull(data, tmpDum) + prVars <- setdiff(prVars, tmpDum); if (length(prVars) == 0) prVars <- NULL + + ## merge in weights ---------------------------------------------------------- + + if (!is.null(weights)) { + + if (is.list(weights) && !is.data.frame(weights)) { + ## in case one of the elements is a standardization scheme string, + ## such as "world_x_y". + whChar <- which(unlist(lapply(weights, is.character))) + if (sum(whChar)) { + + weights[whChar] <- lapply(weights[whChar], function(string) { + ## expected to return data.frame with 1) age groups 2) weights + ## as columns. + stdr.weights(string)[[2]] + }) + + } + + ## now list only contains numeric weights i hope. + + } + + ## NOTE: adjust used here to contain levels of adjust arguments only + adjust <- list() + if (length(adVars) > 0L) { + adjust <- lapply(data[, eval(adVars), with = FALSE], sortedLevs) + } + + if (is.character(weights)) { + + if (pmatch(weights, c("internal", "cohort"), nomatch = 0L)) { + + + all_names_present(data, iwVar, + msg = paste0( + "Internal error: expected to have variable ", + "%%VARS%% in working data but didn't. Complain ", + "to the pkg maintainer if you see this." + )) + + weights <- lapply(seq_along(adjust), function(i) { + cn <- names(adjust)[i] + le <- unique(adjust[[i]]) + le <- structure(list(le), names = cn) + data[le, lapply(.SD, sum), .SDcols = iwVar, by = .EACHI, on = cn][[iwVar]] + }) + names(weights) <- names(adjust) + + + setcolsnull(data, iwVar) + setkeyv(data, c(prVars, adVars, boVars)) + } else { + ## expected to return data.frame with 1) age groups 2) weights + ## as columns. + weights <- stdr.weights(weights) + weights <- weights[[2]] + + } + } + + if (!is.data.frame(weights) && is.vector(weights)) { + ## note: lists are vectors + if (!is.list(weights)) { + weights <- list(weights) ## was a vector of values + setattr(weights, "names", adVars[1]) + } + + weVars <- names(weights) + weights <- weights[names(adjust)] + + adjust <- do.call(function(...) CJ(..., unique = FALSE, sorted = FALSE), adjust) + weights <- do.call(function(...) CJ(..., unique = FALSE, sorted = FALSE), weights) + + weVars <- paste0(weVars, ".w") + setnames(weights, adVars, weVars) + weights[, (adVars) := adjust] + + set(weights, j = "weights", value = 1L) + for (k in weVars) { + set(weights, j = "weights", value = weights$weights * weights[[k]]) + } + setcolsnull(weights, delete = weVars, soft = FALSE) + + ## NOTE: weights will be repeated for each level of print, + ## and for each level of print the weights must sum to one for things + ## to work. + weights[, "weights" := weights/sum(weights)] + + } + + if (!is.data.frame(weights)) { + stop("Something went wrong: 'weights' was not collated into a ", + "data.frame to merge with data. ", + "Blame the package maintainer please!") + } + ## at this points weights is a data.frame. + weights <- data.table(weights) + weights[, "weights" := as.double(weights)] + + ## ensure repetition by print levels if some adjust levels + ## that exist in weights do not exist in data. + ## NOTE: weights data.frame has at least as many levels as adjust column + ## in data (or it has more sometimes). + wm <- lapply(adVars, function(chStr) { + col <- weights[[chStr]] + if (is.factor(col)) return(levels(col)) + sort(unique(col)) + }) + names(wm) <- adVars + if (length(prVars)) { + wm[prVars] <- lapply(prVars, function(chStr) { + col <- data[[chStr]] + if (is.factor(col)) return(levels(col)) + sort(unique(col)) + }) + } + wm <- setDT(do.call(CJ, wm)) + + weights <- merge(wm, weights, by = adVars, all.x = TRUE, all.y = TRUE) + + dt_robust_by( + 'weights[, "weights" := weights/sum(weights), by = %%BY_VAR_NMS%%]', + by.var.nms = prVars + ) + + data[i = weights, on = c(prVars, adVars), j = "weights" := weights] + + if (any(is.na(data$weights))) { + ## should not be any NAs since we checked for level congruence + ## in checkWeights + stop("Internal error: some weights were NA after merging to working ", + "data. Complain to the package maintainer if you see this.") + } + + + } + + setattr(data, "makeWeightsDT", list(prVars = prVars, adVars = adVars, + boVars = boVars, vaVars = vaVars, + NAs = NAs)) + return(data[]) + +} + +checkCharWeights <- function(w) { + if (is.character(w)) { + if (length(w) != 1L) { + stop("weights supplied as a character string must be of length one.") + } + if (!pmatch(w, c("internal", "cohort"), nomatch = 0L)) { + stdr.weights(w) + } + } +} + +checkWeights <- function(weights, adjust) { + ## INTENTION: given a list/DF/vector/string specifying weights + ## and a data.frame/list of the adjusting variables, + ## checks they are congruent and complains if not. + allowed_classes <- c("list","data.frame","integer","numeric","character", + "NULL") + if (!any(class(weights) %in% allowed_classes)) { + stop("weights must be either a list, a data.frame, a numeric variable, ", + "or a character string specifing the weighting scheme to use. ", + "See ?direct_standardization for more information.") + } + + if (is.list(weights) && !is.data.frame(weights) && + length(adjust) != length(weights)) { + stop("Mismatch in numbers of variables (NOT necessarily in the numbers of ", + "levels/values within the variables) in adjust (", length(adjust), + " variables) and weights (", length(weights)," variables); ", + "make sure each given weights vector has a corresponding ", + "variable in adjust and vice versa. ", + "See ?direct_standardization for more information.") + } + + if (is.list(weights)) { + isChar <- unlist(lapply(weights, is.character)) + if (any(isChar)) { + lapply(weights[isChar], checkCharWeights) + weights[isChar] <- lapply(weights[isChar], function(string) { + if (pmatch(string, c("cohort", "internal"), nomatch = 0L)) { + stop("List of weights had 'cohort' or 'internal' as at least one ", + "element, which is currently not supported. ", + "See ?direct_standardization for more information.") + } + stdr.weights(string)[[2]] + }) + } + } + + if (is.character(weights)) { + checkCharWeights(weights) + if (pmatch(weights, c("internal", "cohort"), nomatch = 0L)) { + ## done checking since internal weights are pretty fool-proof. + return(invisible()) + } + ## if not, pass along as vector of weights. + weights <- stdr.weights(weights)[[2]] + } + + if (is.numeric(weights)) { + if (length(adjust) != 1L) { + stop("Weights is a numeric vector of weights, ", + "but there are more or less than one adjusting variable. ", + "See ?direct_standardization for more information.") + } + weights <- list(weights) + names(weights) <- names(adjust) + } + + ## by now either a list or a data.frame of weights... + adVars <- names(adjust) + weVars <- names(weights) + if (is.data.frame(weights)) { + if (!"weights" %in% weVars) { + stop("data.frame of weights did not have column named 'weights'. ", + "see ?direct_standardization for more information.") + } + weVars <- setdiff(weVars, "weights") + } + + badAdVars <- setdiff(adVars, weVars) + badWeVars <- setdiff(weVars, adVars) + if (length(badAdVars) > 0) { + stop("Mismatch in names of variables in adjust and weights; ", + "following adjust variables not mentioned in weights: ", + paste0("'", badAdVars, "'", collapse = ", ")) + } + + if (length(badWeVars) > 0) { + stop("Mismatch in names of variables in adjust and weights; ", + "following weights variables not mentioned in adjust: ", + paste0("'", badWeVars, "'", collapse = ", ")) + } + + if (is.data.frame(weights)) { + + levDiff <- lapply(names(adjust), function(var) { + !all(adjust[[var]] %in% weights[[var]]) + }) + levDiff <- unlist(levDiff) + if (any(levDiff)) { + ## take only first conflicting variable for brevity of error message- + badVar <- names(adjust)[1] + badLevs <- setdiff(adjust[[badVar]], weights[[badVar]]) + badLevs <- paste0("'", badLevs, "'", collapse = ", ") + stop("Missing levels in weights data.frame in variable '", badVar, "': ", + badLevs, ". These levels were found to exist in the corresponding ", + "adjusting variable. ", + "Usual suspects: adjusting variable is a factor and you ", + "only supplied weights for unique values in your data ", + "as opposed to the levels of the factor, which may contain levels ", + "that no row has. Try table(yourdata$yourvariable).") + } + + } else { + weights <- as.list(weights) + weights <- weights[adVars] + + ## check variable levels + adjust <- lapply(adjust, function(elem) { + if (is.factor(elem)) { + levels(elem) + } else { + sort(unique(elem)) + } + }) + + weLen <- unlist(lapply(weights, length)) + adLen <- unlist(lapply(adjust, length)) + badLen <- names(adjust)[weLen != adLen] + + if (length(badLen) > 0) { + stop("Mismatch in numbers of levels/unique values in adjusting variables ", + "and lengths of corresponding weights vectors. ", + "Names of mismatching variables: ", + paste0("'", badLen, "'", collapse = ", "), ". There were ", + weLen[weLen != adLen], " weights and ", adLen[weLen != adLen], + " adjusting variable levels.") + } + } + + + + invisible() +} + + + + + + + + diff --git a/README.md b/README.md index 0062d1c..4a1e17a 100644 --- a/README.md +++ b/README.md @@ -1,189 +1,189 @@ -[![Build Status](https://travis-ci.org/WetRobot/popEpi.png?branch=master)](https://travis-ci.org/WetRobot/popEpi) [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/WetRobot/popEpi?branch=master&svg=true)](https://ci.appveyor.com/project/WetRobot/popepi) [![CRAN\_Status\_Badge](http://www.r-pkg.org/badges/version/popEpi)](https://cran.r-project.org/package=popEpi) [![codecov.io](http://codecov.io/github/WetRobot/popEpi/coverage.svg?branch=master)](http://codecov.io/github/WetRobot/popEpi?branch=master) [![CRAN\_DLs\_via\_RStudio](http://cranlogs.r-pkg.org/badges/popEpi)](https://cran.r-project.org/package=popEpi) - -popEpi: Epidemiology with population data -========================================= - -The purpose of popEpi is to facilitate computing certain epidemiological statistics where population data is used. Current main attractions: - -Splitting, merging population hazards, and aggregating ------------------------------------------------------- - -the `lexpand` function allows users to split their subject-level follow-up data into sub-intervals along age, follow-up time and calendar time, merge corresponding population hazard information to those intervals, and to aggregate the resulting data if needed. - -``` r -data(sire) -sr <- sire[1,] -print(sr) -#> sex bi_date dg_date ex_date status dg_age -#> 1: 1 1952-05-27 1994-02-03 2012-12-31 0 41.68877 -``` - -``` r -x <- lexpand(sr, birth = bi_date, entry = dg_date, exit = ex_date, - status = status %in% 1:2, - fot = 0:5, per = 1994:2000) -print(x) -#> lex.id fot per age lex.dur lex.Cst lex.Xst sex -#> 1: 1 0.000000 1994.09 41.68877 0.90958904 0 0 1 -#> 2: 1 0.909589 1995.00 42.59836 0.09041096 0 0 1 -#> 3: 1 1.000000 1995.09 42.68877 0.90958904 0 0 1 -#> 4: 1 1.909589 1996.00 43.59836 0.09041096 0 0 1 -#> 5: 1 2.000000 1996.09 43.68877 0.90958904 0 0 1 -#> 6: 1 2.909589 1997.00 44.59836 0.09041096 0 0 1 -#> 7: 1 3.000000 1997.09 44.68877 0.90958904 0 0 1 -#> 8: 1 3.909589 1998.00 45.59836 0.09041096 0 0 1 -#> 9: 1 4.000000 1998.09 45.68877 0.90958904 0 0 1 -#> 10: 1 4.909589 1999.00 46.59836 0.09041096 0 0 1 -#> bi_date dg_date ex_date status dg_age -#> 1: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 -#> 2: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 -#> 3: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 -#> 4: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 -#> 5: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 -#> 6: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 -#> 7: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 -#> 8: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 -#> 9: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 -#> 10: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 -``` - -``` r -data(popmort) -x <- lexpand(sr, birth = bi_date, entry = dg_date, exit = ex_date, - status = status %in% 1:2, - fot = 0:5, per = 1994:2000, pophaz = popmort) -print(x) -#> lex.id fot per age lex.dur lex.Cst lex.Xst sex -#> 1: 1 0.000000 1994.09 41.68877 0.90958904 0 0 1 -#> 2: 1 0.909589 1995.00 42.59836 0.09041096 0 0 1 -#> 3: 1 1.000000 1995.09 42.68877 0.90958904 0 0 1 -#> 4: 1 1.909589 1996.00 43.59836 0.09041096 0 0 1 -#> 5: 1 2.000000 1996.09 43.68877 0.90958904 0 0 1 -#> 6: 1 2.909589 1997.00 44.59836 0.09041096 0 0 1 -#> 7: 1 3.000000 1997.09 44.68877 0.90958904 0 0 1 -#> 8: 1 3.909589 1998.00 45.59836 0.09041096 0 0 1 -#> 9: 1 4.000000 1998.09 45.68877 0.90958904 0 0 1 -#> 10: 1 4.909589 1999.00 46.59836 0.09041096 0 0 1 -#> bi_date dg_date ex_date status dg_age pop.haz pp -#> 1: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 0.001170685 1.000651 -#> 2: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 0.001441038 1.000651 -#> 3: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 0.001200721 1.001856 -#> 4: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 0.001300846 1.001856 -#> 5: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 0.001400981 1.003207 -#> 6: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 0.002142293 1.003207 -#> 7: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 0.002202424 1.005067 -#> 8: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 0.001771568 1.005067 -#> 9: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 0.002222468 1.007277 -#> 10: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 0.002282603 1.007277 -``` - -``` r -a <- lexpand(sr, birth = bi_date, entry = dg_date, exit = ex_date, - status = status %in% 1:2, - fot = 0:5, per = 1994:2000, aggre = list(fot, per)) -print(a) -#> fot per pyrs at.risk from0to0 -#> 1: 0 1994 0.90958904 0 0 -#> 2: 0 1995 0.09041096 1 0 -#> 3: 1 1995 0.90958904 0 0 -#> 4: 1 1996 0.09041096 1 0 -#> 5: 2 1996 0.90958904 0 0 -#> 6: 2 1997 0.09041096 1 0 -#> 7: 3 1997 0.90958904 0 0 -#> 8: 3 1998 0.09041096 1 0 -#> 9: 4 1998 0.90958904 0 0 -#> 10: 4 1999 0.09041096 1 1 -``` - -SIRs / SMRs ------------ - -One can make use of the `sir` function to estimate indirectly standardised incidence or mortality ratios (SIRs/SMRs). The data can be aggregated by `lexpand` or by other means. While `sir` is simple and flexible in itself, one may also use `sirspline` to fit spline functions for the effect of e.g. age as a continuous variable on SIRs. - -``` r -data(popmort) -data(sire) -c <- lexpand( sire, status = status %in% 1:2, birth = bi_date, exit = ex_date, entry = dg_date, - breaks = list(per = 1950:2013, age = 1:100, fot = c(0,10,20,Inf)), - aggre = list(fot, agegroup = age, year = per, sex) ) -#> dropped 16 rows where entry == exit - -se <- sir( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs', - ref.data = popmort, ref.rate = 'haz', - adjust = c('agegroup', 'year', 'sex'), print = 'fot') -se -#> SIR (adjusted by agegroup, year, sex) with 95% confidence intervals (profile) -#> Test for homogeneity: p < 0.001 -#> -#> Total sir: 3.08 (2.99-3.17) -#> Total observed: 4559 -#> Total expected: 1482.13 -#> Total person-years: 39906 -#> -#> -#> fot observed expected pyrs sir sir.lo sir.hi p_value -#> 1: 0 4264 1214.54 34445.96 3.51 3.41 3.62 0.000 -#> 2: 10 295 267.59 5459.96 1.10 0.98 1.23 0.094 -``` - -(Relative) survival -------------------- - -The `survtab` function computes observed, net/relative and cause-specific survivals as well as cumulative incidence functions for `Lexis` data. Any of the supported survival time functions can be easily adjusted by any number of categorical variables if needed. - -One can also use `survtab_ag` for aggregated data. This means the data does not have to be on the subject-level to compute survival time function estimates. - -``` r -library(Epi) -#> -#> Attaching package: 'Epi' -#> The following object is masked from 'package:base': -#> -#> merge.data.frame - -data(sibr) -sire$cancer <- "rectal" -sibr$cancer <- "breast" -sr <- rbind(sire, sibr) - -sr$cancer <- factor(sr$cancer) -sr <- sr[sr$dg_date < sr$ex_date, ] - -sr$status <- factor(sr$status, levels = 0:2, - labels = c("alive", "canD", "othD")) - -x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), - exit = list(CAL = get.yrs(ex_date)), - data = sr, - exit.status = status) -#> NOTE: entry.status has been set to "alive" for all. - -st <- survtab(FUT ~ cancer, data = x, - breaks = list(FUT = seq(0, 5, 1/12)), - surv.type = "cif.obs") -st -#> -#> Call: -#> survtab(formula = FUT ~ cancer, data = x, breaks = list(FUT = seq(0, 5, 1/12)), surv.type = "cif.obs") -#> -#> Type arguments: -#> surv.type: cif.obs --- surv.method: hazard -#> -#> Confidence interval arguments: -#> level: 95 % --- transformation: log-log -#> -#> Totals: -#> person-time:62120 --- events: 5375 -#> -#> Stratified by: 'cancer' -#> cancer Tstop surv.obs.lo surv.obs surv.obs.hi SE.surv.obs CIF_canD -#> 1: breast 2.5 0.8804 0.8870 0.8933 0.003290 0.0687 -#> 2: breast 5.0 0.7899 0.7986 0.8070 0.004368 0.1162 -#> 3: rectal 2.5 0.6250 0.6359 0.6465 0.005480 0.2981 -#> 4: rectal 5.0 0.5032 0.5148 0.5263 0.005901 0.3727 -#> CIF_othD -#> 1: 0.0442 -#> 2: 0.0852 -#> 3: 0.0660 -#> 4: 0.1125 -``` +[![Build Status](https://travis-ci.org/WetRobot/popEpi.png?branch=master)](https://travis-ci.org/WetRobot/popEpi) [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/WetRobot/popEpi?branch=master&svg=true)](https://ci.appveyor.com/project/WetRobot/popepi) [![CRAN\_Status\_Badge](http://www.r-pkg.org/badges/version/popEpi)](https://cran.r-project.org/package=popEpi) [![codecov.io](http://codecov.io/github/WetRobot/popEpi/coverage.svg?branch=master)](http://codecov.io/github/WetRobot/popEpi?branch=master) [![CRAN\_DLs\_via\_RStudio](http://cranlogs.r-pkg.org/badges/popEpi)](https://cran.r-project.org/package=popEpi) + +popEpi: Epidemiology with population data +========================================= + +The purpose of popEpi is to facilitate computing certain epidemiological statistics where population data is used. Current main attractions: + +Splitting, merging population hazards, and aggregating +------------------------------------------------------ + +the `lexpand` function allows users to split their subject-level follow-up data into sub-intervals along age, follow-up time and calendar time, merge corresponding population hazard information to those intervals, and to aggregate the resulting data if needed. + +``` r +data(sire) +sr <- sire[1,] +print(sr) +#> sex bi_date dg_date ex_date status dg_age +#> 1: 1 1952-05-27 1994-02-03 2012-12-31 0 41.68877 +``` + +``` r +x <- lexpand(sr, birth = bi_date, entry = dg_date, exit = ex_date, + status = status %in% 1:2, + fot = 0:5, per = 1994:2000) +print(x) +#> lex.id fot per age lex.dur lex.Cst lex.Xst sex +#> 1: 1 0.000000 1994.09 41.68877 0.90958904 0 0 1 +#> 2: 1 0.909589 1995.00 42.59836 0.09041096 0 0 1 +#> 3: 1 1.000000 1995.09 42.68877 0.90958904 0 0 1 +#> 4: 1 1.909589 1996.00 43.59836 0.09041096 0 0 1 +#> 5: 1 2.000000 1996.09 43.68877 0.90958904 0 0 1 +#> 6: 1 2.909589 1997.00 44.59836 0.09041096 0 0 1 +#> 7: 1 3.000000 1997.09 44.68877 0.90958904 0 0 1 +#> 8: 1 3.909589 1998.00 45.59836 0.09041096 0 0 1 +#> 9: 1 4.000000 1998.09 45.68877 0.90958904 0 0 1 +#> 10: 1 4.909589 1999.00 46.59836 0.09041096 0 0 1 +#> bi_date dg_date ex_date status dg_age +#> 1: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 +#> 2: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 +#> 3: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 +#> 4: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 +#> 5: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 +#> 6: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 +#> 7: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 +#> 8: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 +#> 9: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 +#> 10: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 +``` + +``` r +data(popmort) +x <- lexpand(sr, birth = bi_date, entry = dg_date, exit = ex_date, + status = status %in% 1:2, + fot = 0:5, per = 1994:2000, pophaz = popmort) +print(x) +#> lex.id fot per age lex.dur lex.Cst lex.Xst sex +#> 1: 1 0.000000 1994.09 41.68877 0.90958904 0 0 1 +#> 2: 1 0.909589 1995.00 42.59836 0.09041096 0 0 1 +#> 3: 1 1.000000 1995.09 42.68877 0.90958904 0 0 1 +#> 4: 1 1.909589 1996.00 43.59836 0.09041096 0 0 1 +#> 5: 1 2.000000 1996.09 43.68877 0.90958904 0 0 1 +#> 6: 1 2.909589 1997.00 44.59836 0.09041096 0 0 1 +#> 7: 1 3.000000 1997.09 44.68877 0.90958904 0 0 1 +#> 8: 1 3.909589 1998.00 45.59836 0.09041096 0 0 1 +#> 9: 1 4.000000 1998.09 45.68877 0.90958904 0 0 1 +#> 10: 1 4.909589 1999.00 46.59836 0.09041096 0 0 1 +#> bi_date dg_date ex_date status dg_age pop.haz pp +#> 1: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 0.001170685 1.000651 +#> 2: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 0.001441038 1.000651 +#> 3: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 0.001200721 1.001856 +#> 4: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 0.001300846 1.001856 +#> 5: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 0.001400981 1.003207 +#> 6: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 0.002142293 1.003207 +#> 7: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 0.002202424 1.005067 +#> 8: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 0.001771568 1.005067 +#> 9: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 0.002222468 1.007277 +#> 10: 1952-05-27 1994-02-03 2012-12-31 0 41.68877 0.002282603 1.007277 +``` + +``` r +a <- lexpand(sr, birth = bi_date, entry = dg_date, exit = ex_date, + status = status %in% 1:2, + fot = 0:5, per = 1994:2000, aggre = list(fot, per)) +print(a) +#> fot per pyrs at.risk from0to0 +#> 1: 0 1994 0.90958904 0 0 +#> 2: 0 1995 0.09041096 1 0 +#> 3: 1 1995 0.90958904 0 0 +#> 4: 1 1996 0.09041096 1 0 +#> 5: 2 1996 0.90958904 0 0 +#> 6: 2 1997 0.09041096 1 0 +#> 7: 3 1997 0.90958904 0 0 +#> 8: 3 1998 0.09041096 1 0 +#> 9: 4 1998 0.90958904 0 0 +#> 10: 4 1999 0.09041096 1 1 +``` + +SIRs / SMRs +----------- + +One can make use of the `sir` function to estimate indirectly standardised incidence or mortality ratios (SIRs/SMRs). The data can be aggregated by `lexpand` or by other means. While `sir` is simple and flexible in itself, one may also use `sirspline` to fit spline functions for the effect of e.g. age as a continuous variable on SIRs. + +``` r +data(popmort) +data(sire) +c <- lexpand( sire, status = status %in% 1:2, birth = bi_date, exit = ex_date, entry = dg_date, + breaks = list(per = 1950:2013, age = 1:100, fot = c(0,10,20,Inf)), + aggre = list(fot, agegroup = age, year = per, sex) ) +#> dropped 16 rows where entry == exit + +se <- sir( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs', + ref.data = popmort, ref.rate = 'haz', + adjust = c('agegroup', 'year', 'sex'), print = 'fot') +se +#> SIR (adjusted by agegroup, year, sex) with 95% confidence intervals (profile) +#> Test for homogeneity: p < 0.001 +#> +#> Total sir: 3.08 (2.99-3.17) +#> Total observed: 4559 +#> Total expected: 1482.13 +#> Total person-years: 39906 +#> +#> +#> fot observed expected pyrs sir sir.lo sir.hi p_value +#> 1: 0 4264 1214.54 34445.96 3.51 3.41 3.62 0.000 +#> 2: 10 295 267.59 5459.96 1.10 0.98 1.23 0.094 +``` + +(Relative) survival +------------------- + +The `survtab` function computes observed, net/relative and cause-specific survivals as well as cumulative incidence functions for `Lexis` data. Any of the supported survival time functions can be easily adjusted by any number of categorical variables if needed. + +One can also use `survtab_ag` for aggregated data. This means the data does not have to be on the subject-level to compute survival time function estimates. + +``` r +library(Epi) +#> +#> Attaching package: 'Epi' +#> The following object is masked from 'package:base': +#> +#> merge.data.frame + +data(sibr) +sire$cancer <- "rectal" +sibr$cancer <- "breast" +sr <- rbind(sire, sibr) + +sr$cancer <- factor(sr$cancer) +sr <- sr[sr$dg_date < sr$ex_date, ] + +sr$status <- factor(sr$status, levels = 0:2, + labels = c("alive", "canD", "othD")) + +x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), + exit = list(CAL = get.yrs(ex_date)), + data = sr, + exit.status = status) +#> NOTE: entry.status has been set to "alive" for all. + +st <- survtab(FUT ~ cancer, data = x, + breaks = list(FUT = seq(0, 5, 1/12)), + surv.type = "cif.obs") +st +#> +#> Call: +#> survtab(formula = FUT ~ cancer, data = x, breaks = list(FUT = seq(0, 5, 1/12)), surv.type = "cif.obs") +#> +#> Type arguments: +#> surv.type: cif.obs --- surv.method: hazard +#> +#> Confidence interval arguments: +#> level: 95 % --- transformation: log-log +#> +#> Totals: +#> person-time:62120 --- events: 5375 +#> +#> Stratified by: 'cancer' +#> cancer Tstop surv.obs.lo surv.obs surv.obs.hi SE.surv.obs CIF_canD +#> 1: breast 2.5 0.8804 0.8870 0.8933 0.003290 0.0687 +#> 2: breast 5.0 0.7899 0.7986 0.8070 0.004368 0.1162 +#> 3: rectal 2.5 0.6250 0.6359 0.6465 0.005480 0.2981 +#> 4: rectal 5.0 0.5032 0.5148 0.5263 0.005901 0.3727 +#> CIF_othD +#> 1: 0.0442 +#> 2: 0.0852 +#> 3: 0.0660 +#> 4: 0.1125 +``` diff --git a/build/vignette.rds b/build/vignette.rds index 36b542f693873ab1326a625f1141728eca91a302..97c4a733384d94a9a387bbc072f6b9a02d7b7c58 100644 GIT binary patch literal 253 zcmVp_a`REVs7~ta_ zzrgG@`gqrkEMEgy2~H}HQYc#~iMB#PrZBY&pj9{v2SL?7Mdqok$Zc<0j$4XdFgod{ zJB)bQ4GkiC5)b#_*e+q%oTK)Oukkc^MkDlS5dsI*6G;GOrsU< z(aQV@<7YgdYDc;|XyWQqnTO36t?PwMgm`|&Cg^#K3? DYjb%= literal 254 zcmVwya#XQ7hY$Kv5e(K#S9E=CryEl+K4T25MugV!qR z#~wzw?1qK`J&K1uIJwL4{4d7-?RWg5^#-ZE0euA>MYGsSh3Uaw`m4yFH`{m|B&P8O zk7z{tgvk@0Pqii89W=G|sZ7J>E!hv1rfDv>M3n-Uh>|%I;Gz7mlqYrh1#!3^ - %\VignetteEngine{knitr::rmarkdown} - %\VignetteIndexEntry{Standardised incidence and mortality ratios} - %\usepackage[utf8]{inputenc} ---- - -```{r, echo=TRUE, warning=FALSE, message=FALSE} -library(popEpi) -library(Epi) -library(splines) -``` - -# Introduction - -Standardized incidence ratio (SIR) or mortality ratio (SMR) is a ratio of observed and expected cases. Observed cases is the absolute number of cases in the cohort. The expected cases are derived by multiplying the cohort person-years with reference populations rate. The rate should be stratified or adjusted by confounding factors. Usually these are age group, gender, calendar period and possibly a cancer type or other confounding variable. Also a social economic status or area variable can be used. - -In reference population the expected rate in strata $j$ is $\lambda_j = d_j$ / $n_j$, where $d_j$ is observed cases and $n_j$ is observed person years. Now the SIR can be written as a ratio -$$ -SIR = \frac{ \sum d_j }{\sum n_j \lambda_j} = \frac{D}{E} -$$ -where $D$ is the observed cases in cohort population and $E$ is the expected number. Univariate confidence intervals are based on exact values of Poisson distribution and the formula for p-value is -$$ -\chi^2 = \frac{ (|O - E| -0.5)^2 }{E}. -$$ -Modelled SIR is a Poisson regression model with log-link and cohorts person-years as a offset. - -The homogenity of SIR's can be tested using a likelihood ratio test in Poisson modelled SIRs. - -The same workflow applies for standardised mortality ratios. - -# Splines - -A continuous spline function can be fitted for time variables, e.g. age-group. Idea of the splines is to smooth the SMR estimates and do inference from the curve figure. This requires pre-defined knots/nodes that are used to fit the spline curve. Selecting the number of knots and knot places is a very subjective matter and there are three options to pass spline knots to function. - -It's good practice to try between different knot settings for realistic spline estimates. Overfitting might cause unintentioal artefacts in the estimate and underfitting might smooth away interesting patterns. - -The spline variable should be as continuous as possible, say from 18 to 100 time points. But when splitting time in too narrow intervals, random variation might occur in the expected or population rate values. Therefore it's also possible to do two variables for age or period: first with wider intervals for standardation and second with narrow intervals for the spline. - -## Knots - -There are three options to for assigning knots to the spline: - -1. A vector of numbers of knots for each spline variable. Number of knots includes the boundary knots, so that the minumum number of knots is 2, which is a log linear assosiation. The knots are placed automatically using the quantiles of observed cases. - -2. A list of vectors of predefined knot places. Number of vectors needs to match the length of spline variables. And each vector has to have at least the minimum and maximum for boundary knots. - -3. NULL will automatically finds the optimal number of knots based on AIC. Knots are placed according the quantiles of observed cases. This is usually a good place to start the fitting process. - -Number of knots and knot places are always found in output. - -# SMR - -## Mortality: External cohort and popmort data - -Estimate SMR of a simulated cohort of Finnish female rectal cancer patients, `sire`. -Death rates for each age, period and sex is available in `popmort` dataset. - -For more information about the dataset see `help(popmort)` and `help(sire)`. - -```{r} -data(sire) -data(popmort) -c <- lexpand( sire, status = status, birth = bi_date, exit = ex_date, entry = dg_date, - breaks = list(per = 1950:2013, age = 1:100, fot = c(0,10,20,Inf)), - aggre = list(fot, agegroup = age, year = per, sex) ) - -se <- sir( coh.data = c, coh.obs = 'from0to2', coh.pyrs = 'pyrs', - ref.data = popmort, ref.rate = 'haz', - adjust = c('agegroup','year','sex'), print ='fot') -se -``` - -SMR's for other causes is 1 for both follow-up intervals. Also the p-value suggest that there is no heterogenity between SMR estimates (p=0.735). - - -The total mortality can be estimated by modifying the `status` argument. Now we want to account all deaths, i.e. status is 1 or 2. - -```{r} -c <- lexpand( sire, status = status %in% 1:2, birth = bi_date, exit = ex_date, entry = dg_date, - breaks = list(per = 1950:2013, age = 1:100, fot = c(0,10,20,Inf)), - aggre = list(fot, agegroup = age, year = per, sex) ) - -se <- sir( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs', - ref.data = popmort, ref.rate = 'haz', - adjust = c('agegroup','year','sex'), print ='fot') -se -``` - -Now the estimates for follow-up intervals seems to differ significantly, p = 0. Plotting SMR (S3-method for `sir`-object) is easily done using default plot-function. - -```{r, fig.height=3, fig.width=6} -plot(se, col = 2:3) -title('SMR for follow-up categories') -``` - - -## splines - - -Lets fit splines for the follow-up time and agegroup using two different options: the splines are fitted in different model and in same model, `dependent.splines`. - -```{r, fig.height=5, fig.width=6} -c <- lexpand( sire, status = status %in% 1:2, birth = bi_date, exit = ex_date, entry = dg_date, - breaks = list(per = 1950:2013, age = 1:100, fot = 0:50), - aggre = list(fot, agegroup = age, year = per, sex) ) - -sf <- sirspline( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs', - ref.data = popmort, ref.rate = 'haz', - adjust = c('agegroup','year','sex'), - spline = c('agegroup','fot'), dependent.splines=FALSE) - -st <- sirspline( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs', - ref.data = popmort, ref.rate = 'haz', - adjust = c('agegroup','year','sex'), - spline = c('agegroup','fot'), dependent.splines = TRUE) - -plot(sf, col=2, log=TRUE) -title('Splines fitted in different models') - -plot(st, col=4, log=TRUE) -title('Splines are dependent') -``` - -In dependent spline the `fot` is the ratio with zero time as reference point. Reference points can be alterned. Here agegroup profile is assumed to be same for every follow-up time. SMR is 0.2 times from 0 to 10 years of follow-up. - - -Splines can also be stratified using the `print` argument. For example we split the death time in two time periods and test if the agegroup splines are equal. - -```{r, results='hide', fig.height=5, fig.width=6} -c$year.cat <- ifelse(c$year < 2002, 1, 2) -sy <- sirspline( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs', - ref.data = popmort, ref.rate = 'haz', - adjust = c('agegroup','year','sex'), - spline = c('agegroup'), print = 'year.cat') -plot(sy, log=TRUE) -legend('topright', c('before 2002','after 2002'), lty=1, col=c(1,2)) -``` - -For category before 2002 the SMR seems to be higher after the age of 50. Also the p-value (<0.0001) indicates that there is a difference in age group trends before and after year 2002. P-value is a likelihood ratio test that compares models where splines are fitted together and separately. - -```{r} -print(sy) -``` - - - - - - - +--- +title: "SMR Vignette" +author: "Matti Rantanen" +date: "`r Sys.Date()`" +output: + html_document: + fig_caption: yes + toc: true + toc_depth: 2 +vignette: > + %\VignetteEngine{knitr::rmarkdown} + %\VignetteIndexEntry{Standardised incidence and mortality ratios} + %\usepackage[utf8]{inputenc} +--- + +```{r, echo=TRUE, warning=FALSE, message=FALSE} +library(popEpi) +library(Epi) +library(splines) +``` + +# Introduction + +Standardized incidence ratio (SIR) or mortality ratio (SMR) is a ratio of observed and expected cases. Observed cases is the absolute number of cases in the cohort. The expected cases are derived by multiplying the cohort person-years with reference populations rate. The rate should be stratified or adjusted by confounding factors. Usually these are age group, gender, calendar period and possibly a cancer type or other confounding variable. Also a social economic status or area variable can be used. + +In reference population the expected rate in strata $j$ is $\lambda_j = d_j$ / $n_j$, where $d_j$ is observed cases and $n_j$ is observed person years. Now the SIR can be written as a ratio +$$ +SIR = \frac{ \sum d_j }{\sum n_j \lambda_j} = \frac{D}{E} +$$ +where $D$ is the observed cases in cohort population and $E$ is the expected number. Univariate confidence intervals are based on exact values of Poisson distribution and the formula for p-value is +$$ +\chi^2 = \frac{ (|O - E| -0.5)^2 }{E}. +$$ +Modelled SIR is a Poisson regression model with log-link and cohorts person-years as a offset. + +The homogenity of SIR's can be tested using a likelihood ratio test in Poisson modelled SIRs. + +The same workflow applies for standardised mortality ratios. + +# Splines + +A continuous spline function can be fitted for time variables, e.g. age-group. Idea of the splines is to smooth the SMR estimates and do inference from the curve figure. This requires pre-defined knots/nodes that are used to fit the spline curve. Selecting the number of knots and knot places is a very subjective matter and there are three options to pass spline knots to function. + +It's good practice to try between different knot settings for realistic spline estimates. Overfitting might cause unintentioal artefacts in the estimate and underfitting might smooth away interesting patterns. + +The spline variable should be as continuous as possible, say from 18 to 100 time points. But when splitting time in too narrow intervals, random variation might occur in the expected or population rate values. Therefore it's also possible to do two variables for age or period: first with wider intervals for standardation and second with narrow intervals for the spline. + +## Knots + +There are three options to for assigning knots to the spline: + +1. A vector of numbers of knots for each spline variable. Number of knots includes the boundary knots, so that the minumum number of knots is 2, which is a log linear assosiation. The knots are placed automatically using the quantiles of observed cases. + +2. A list of vectors of predefined knot places. Number of vectors needs to match the length of spline variables. And each vector has to have at least the minimum and maximum for boundary knots. + +3. NULL will automatically finds the optimal number of knots based on AIC. Knots are placed according the quantiles of observed cases. This is usually a good place to start the fitting process. + +Number of knots and knot places are always found in output. + +# SMR + +## Mortality: External cohort and popmort data + +Estimate SMR of a simulated cohort of Finnish female rectal cancer patients, `sire`. +Death rates for each age, period and sex is available in `popmort` dataset. + +For more information about the dataset see `help(popmort)` and `help(sire)`. + +```{r} +data(sire) +data(popmort) +c <- lexpand( sire, status = status, birth = bi_date, exit = ex_date, entry = dg_date, + breaks = list(per = 1950:2013, age = 1:100, fot = c(0,10,20,Inf)), + aggre = list(fot, agegroup = age, year = per, sex) ) + +se <- sir( coh.data = c, coh.obs = 'from0to2', coh.pyrs = 'pyrs', + ref.data = popmort, ref.rate = 'haz', + adjust = c('agegroup','year','sex'), print ='fot') +se +``` + +SMR's for other causes is 1 for both follow-up intervals. Also the p-value suggest that there is no heterogenity between SMR estimates (p=0.735). + + +The total mortality can be estimated by modifying the `status` argument. Now we want to account all deaths, i.e. status is 1 or 2. + +```{r} +c <- lexpand( sire, status = status %in% 1:2, birth = bi_date, exit = ex_date, entry = dg_date, + breaks = list(per = 1950:2013, age = 1:100, fot = c(0,10,20,Inf)), + aggre = list(fot, agegroup = age, year = per, sex) ) + +se <- sir( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs', + ref.data = popmort, ref.rate = 'haz', + adjust = c('agegroup','year','sex'), print ='fot') +se +``` + +Now the estimates for follow-up intervals seems to differ significantly, p = 0. Plotting SMR (S3-method for `sir`-object) is easily done using default plot-function. + +```{r, fig.height=3, fig.width=6} +plot(se, col = 2:3) +title('SMR for follow-up categories') +``` + + +## splines + + +Lets fit splines for the follow-up time and agegroup using two different options: the splines are fitted in different model and in same model, `dependent.splines`. + +```{r, fig.height=5, fig.width=6} +c <- lexpand( sire, status = status %in% 1:2, birth = bi_date, exit = ex_date, entry = dg_date, + breaks = list(per = 1950:2013, age = 1:100, fot = 0:50), + aggre = list(fot, agegroup = age, year = per, sex) ) + +sf <- sirspline( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs', + ref.data = popmort, ref.rate = 'haz', + adjust = c('agegroup','year','sex'), + spline = c('agegroup','fot'), dependent.splines=FALSE) + +st <- sirspline( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs', + ref.data = popmort, ref.rate = 'haz', + adjust = c('agegroup','year','sex'), + spline = c('agegroup','fot'), dependent.splines = TRUE) + +plot(sf, col=2, log=TRUE) +title('Splines fitted in different models') + +plot(st, col=4, log=TRUE) +title('Splines are dependent') +``` + +In dependent spline the `fot` is the ratio with zero time as reference point. Reference points can be alterned. Here agegroup profile is assumed to be same for every follow-up time. SMR is 0.2 times from 0 to 10 years of follow-up. + + +Splines can also be stratified using the `print` argument. For example we split the death time in two time periods and test if the agegroup splines are equal. + +```{r, results='hide', fig.height=5, fig.width=6} +c$year.cat <- ifelse(c$year < 2002, 1, 2) +sy <- sirspline( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs', + ref.data = popmort, ref.rate = 'haz', + adjust = c('agegroup','year','sex'), + spline = c('agegroup'), print = 'year.cat') +plot(sy, log=TRUE) +legend('topright', c('before 2002','after 2002'), lty=1, col=c(1,2)) +``` + +For category before 2002 the SMR seems to be higher after the age of 50. Also the p-value (<0.0001) indicates that there is a difference in age group trends before and after year 2002. P-value is a likelihood ratio test that compares models where splines are fitted together and separately. + +```{r} +print(sy) +``` + + + + + + + diff --git a/inst/doc/sir.html b/inst/doc/sir.html index b414afa..99fe469 100644 --- a/inst/doc/sir.html +++ b/inst/doc/sir.html @@ -1,300 +1,300 @@ - - - - - - - - - - - - - - - -SMR Vignette - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - - -
library(popEpi)
-library(Epi)
-library(splines)
-
-

Introduction

-

Standardized incidence ratio (SIR) or mortality ratio (SMR) is a ratio of observed and expected cases. Observed cases is the absolute number of cases in the cohort. The expected cases are derived by multiplying the cohort person-years with reference populations rate. The rate should be stratified or adjusted by confounding factors. Usually these are age group, gender, calendar period and possibly a cancer type or other confounding variable. Also a social economic status or area variable can be used.

-

In reference population the expected rate in strata \(j\) is \(\lambda_j = d_j\) / \(n_j\), where \(d_j\) is observed cases and \(n_j\) is observed person years. Now the SIR can be written as a ratio \[ -SIR = \frac{ \sum d_j }{\sum n_j \lambda_j} = \frac{D}{E} -\] where \(D\) is the observed cases in cohort population and \(E\) is the expected number. Univariate confidence intervals are based on exact values of Poisson distribution and the formula for p-value is \[ -\chi^2 = \frac{ (|O - E| -0.5)^2 }{E}. -\] Modelled SIR is a Poisson regression model with log-link and cohorts person-years as a offset.

-

The homogenity of SIR’s can be tested using a likelihood ratio test in Poisson modelled SIRs.

-

The same workflow applies for standardised mortality ratios.

-
-
-

Splines

-

A continuous spline function can be fitted for time variables, e.g. age-group. Idea of the splines is to smooth the SMR estimates and do inference from the curve figure. This requires pre-defined knots/nodes that are used to fit the spline curve. Selecting the number of knots and knot places is a very subjective matter and there are three options to pass spline knots to function.

-

It’s good practice to try between different knot settings for realistic spline estimates. Overfitting might cause unintentioal artefacts in the estimate and underfitting might smooth away interesting patterns.

-

The spline variable should be as continuous as possible, say from 18 to 100 time points. But when splitting time in too narrow intervals, random variation might occur in the expected or population rate values. Therefore it’s also possible to do two variables for age or period: first with wider intervals for standardation and second with narrow intervals for the spline.

-
-

Knots

-

There are three options to for assigning knots to the spline:

-
    -
  1. A vector of numbers of knots for each spline variable. Number of knots includes the boundary knots, so that the minumum number of knots is 2, which is a log linear assosiation. The knots are placed automatically using the quantiles of observed cases.

  2. -
  3. A list of vectors of predefined knot places. Number of vectors needs to match the length of spline variables. And each vector has to have at least the minimum and maximum for boundary knots.

  4. -
  5. NULL will automatically finds the optimal number of knots based on AIC. Knots are placed according the quantiles of observed cases. This is usually a good place to start the fitting process.

  6. -
-

Number of knots and knot places are always found in output.

-
-
-
-

SMR

-
-

Mortality: External cohort and popmort data

-

Estimate SMR of a simulated cohort of Finnish female rectal cancer patients, sire. Death rates for each age, period and sex is available in popmort dataset.

-

For more information about the dataset see help(popmort) and help(sire).

-
data(sire)
-data(popmort)
-c <- lexpand( sire, status = status, birth = bi_date, exit = ex_date, entry = dg_date,
-              breaks = list(per = 1950:2013, age = 1:100, fot = c(0,10,20,Inf)), 
-              aggre = list(fot, agegroup = age, year = per, sex) )
-
## dropped 16 rows where entry == exit
-
se <- sir( coh.data = c, coh.obs = 'from0to2', coh.pyrs = 'pyrs',
-           ref.data = popmort, ref.rate = 'haz', 
-           adjust = c('agegroup','year','sex'), print ='fot')
-se
-
## SIR (adjusted by agegroup, year, sex) with 95% confidence intervals (profile) 
-## Test for homogeneity: p = 0.735 
-## 
-##  Total sir: 1.01 (0.95-1.06)
-##  Total observed: 1490
-##  Total expected: 1482.13
-##  Total person-years: 39906 
-## 
-## 
-##    fot observed expected     pyrs  sir sir.lo sir.hi p_value
-## 1:   0     1226  1214.54 34445.96 1.01   0.95   1.07  0.7423
-## 2:  10      264   267.59  5459.96 0.99   0.87   1.11  0.8262
-

SMR’s for other causes is 1 for both follow-up intervals. Also the p-value suggest that there is no heterogenity between SMR estimates (p=0.735).

-

The total mortality can be estimated by modifying the status argument. Now we want to account all deaths, i.e. status is 1 or 2.

-
c <- lexpand( sire, status = status %in% 1:2, birth = bi_date, exit = ex_date, entry = dg_date,
-              breaks = list(per = 1950:2013, age = 1:100, fot = c(0,10,20,Inf)), 
-              aggre = list(fot, agegroup = age, year = per, sex) )
-
## dropped 16 rows where entry == exit
-
se <- sir( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs',
-           ref.data = popmort, ref.rate = 'haz', 
-           adjust = c('agegroup','year','sex'), print ='fot')
-se
-
## SIR (adjusted by agegroup, year, sex) with 95% confidence intervals (profile) 
-## Test for homogeneity: p < 0.001 
-## 
-##  Total sir: 3.08 (2.99-3.17)
-##  Total observed: 4559
-##  Total expected: 1482.13
-##  Total person-years: 39906 
-## 
-## 
-##    fot observed expected     pyrs  sir sir.lo sir.hi p_value
-## 1:   0     4264  1214.54 34445.96 3.51   3.41   3.62   0.000
-## 2:  10      295   267.59  5459.96 1.10   0.98   1.23   0.094
-

Now the estimates for follow-up intervals seems to differ significantly, p = 0. Plotting SMR (S3-method for sir-object) is easily done using default plot-function.

-
plot(se, col = 2:3)
-title('SMR for follow-up categories')
-

-
-
-

splines

-

Lets fit splines for the follow-up time and agegroup using two different options: the splines are fitted in different model and in same model, dependent.splines.

-
c <- lexpand( sire, status = status %in% 1:2, birth = bi_date, exit = ex_date, entry = dg_date,
-              breaks = list(per = 1950:2013, age = 1:100, fot = 0:50), 
-              aggre = list(fot, agegroup = age, year = per, sex) )
-
## dropped 16 rows where entry == exit
-
sf <- sirspline( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs', 
-                 ref.data = popmort, ref.rate = 'haz', 
-                 adjust = c('agegroup','year','sex'),
-                 spline = c('agegroup','fot'), dependent.splines=FALSE)
-
-st <- sirspline( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs', 
-                 ref.data = popmort, ref.rate = 'haz', 
-                 adjust = c('agegroup','year','sex'),
-                 spline = c('agegroup','fot'), dependent.splines = TRUE)
-
-plot(sf, col=2, log=TRUE)
-title('Splines fitted in different models')
-

-
plot(st, col=4, log=TRUE)
-title('Splines are dependent')
-

-

In dependent spline the fot is the ratio with zero time as reference point. Reference points can be alterned. Here agegroup profile is assumed to be same for every follow-up time. SMR is 0.2 times from 0 to 10 years of follow-up.

-

Splines can also be stratified using the print argument. For example we split the death time in two time periods and test if the agegroup splines are equal.

-
c$year.cat <- ifelse(c$year < 2002, 1, 2)
-sy <- sirspline( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs', 
-                 ref.data = popmort, ref.rate = 'haz', 
-                 adjust = c('agegroup','year','sex'),
-                 spline = c('agegroup'), print = 'year.cat')
-plot(sy, log=TRUE)
-legend('topright', c('before 2002','after 2002'), lty=1, col=c(1,2))
-

-

For category before 2002 the SMR seems to be higher after the age of 50. Also the p-value (<0.0001) indicates that there is a difference in age group trends before and after year 2002. P-value is a likelihood ratio test that compares models where splines are fitted together and separately.

-
print(sy)
-
## agegroup: p < 0.001
-## NA: p = NA
-## NA: p = NA
-## 
-##   levels colour
-## 1      1  black
-## 2      2    red
-
-
- - - - -
- - - - - - - - + + + + + + + + + + + + + + + +SMR Vignette + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + +
library(popEpi)
+library(Epi)
+library(splines)
+
+

Introduction

+

Standardized incidence ratio (SIR) or mortality ratio (SMR) is a ratio of observed and expected cases. Observed cases is the absolute number of cases in the cohort. The expected cases are derived by multiplying the cohort person-years with reference populations rate. The rate should be stratified or adjusted by confounding factors. Usually these are age group, gender, calendar period and possibly a cancer type or other confounding variable. Also a social economic status or area variable can be used.

+

In reference population the expected rate in strata \(j\) is \(\lambda_j = d_j\) / \(n_j\), where \(d_j\) is observed cases and \(n_j\) is observed person years. Now the SIR can be written as a ratio \[ +SIR = \frac{ \sum d_j }{\sum n_j \lambda_j} = \frac{D}{E} +\] where \(D\) is the observed cases in cohort population and \(E\) is the expected number. Univariate confidence intervals are based on exact values of Poisson distribution and the formula for p-value is \[ +\chi^2 = \frac{ (|O - E| -0.5)^2 }{E}. +\] Modelled SIR is a Poisson regression model with log-link and cohorts person-years as a offset.

+

The homogenity of SIR’s can be tested using a likelihood ratio test in Poisson modelled SIRs.

+

The same workflow applies for standardised mortality ratios.

+
+
+

Splines

+

A continuous spline function can be fitted for time variables, e.g. age-group. Idea of the splines is to smooth the SMR estimates and do inference from the curve figure. This requires pre-defined knots/nodes that are used to fit the spline curve. Selecting the number of knots and knot places is a very subjective matter and there are three options to pass spline knots to function.

+

It’s good practice to try between different knot settings for realistic spline estimates. Overfitting might cause unintentioal artefacts in the estimate and underfitting might smooth away interesting patterns.

+

The spline variable should be as continuous as possible, say from 18 to 100 time points. But when splitting time in too narrow intervals, random variation might occur in the expected or population rate values. Therefore it’s also possible to do two variables for age or period: first with wider intervals for standardation and second with narrow intervals for the spline.

+
+

Knots

+

There are three options to for assigning knots to the spline:

+
    +
  1. A vector of numbers of knots for each spline variable. Number of knots includes the boundary knots, so that the minumum number of knots is 2, which is a log linear assosiation. The knots are placed automatically using the quantiles of observed cases.

  2. +
  3. A list of vectors of predefined knot places. Number of vectors needs to match the length of spline variables. And each vector has to have at least the minimum and maximum for boundary knots.

  4. +
  5. NULL will automatically finds the optimal number of knots based on AIC. Knots are placed according the quantiles of observed cases. This is usually a good place to start the fitting process.

  6. +
+

Number of knots and knot places are always found in output.

+
+
+
+

SMR

+
+

Mortality: External cohort and popmort data

+

Estimate SMR of a simulated cohort of Finnish female rectal cancer patients, sire. Death rates for each age, period and sex is available in popmort dataset.

+

For more information about the dataset see help(popmort) and help(sire).

+
data(sire)
+data(popmort)
+c <- lexpand( sire, status = status, birth = bi_date, exit = ex_date, entry = dg_date,
+              breaks = list(per = 1950:2013, age = 1:100, fot = c(0,10,20,Inf)), 
+              aggre = list(fot, agegroup = age, year = per, sex) )
+
## dropped 16 rows where entry == exit
+
se <- sir( coh.data = c, coh.obs = 'from0to2', coh.pyrs = 'pyrs',
+           ref.data = popmort, ref.rate = 'haz', 
+           adjust = c('agegroup','year','sex'), print ='fot')
+se
+
## SIR (adjusted by agegroup, year, sex) with 95% confidence intervals (profile) 
+## Test for homogeneity: p = 0.735 
+## 
+##  Total sir: 1.01 (0.95-1.06)
+##  Total observed: 1490
+##  Total expected: 1482.13
+##  Total person-years: 39906 
+## 
+## 
+##    fot observed expected     pyrs  sir sir.lo sir.hi p_value
+## 1:   0     1226  1214.54 34445.96 1.01   0.95   1.07  0.7423
+## 2:  10      264   267.59  5459.96 0.99   0.87   1.11  0.8262
+

SMR’s for other causes is 1 for both follow-up intervals. Also the p-value suggest that there is no heterogenity between SMR estimates (p=0.735).

+

The total mortality can be estimated by modifying the status argument. Now we want to account all deaths, i.e. status is 1 or 2.

+
c <- lexpand( sire, status = status %in% 1:2, birth = bi_date, exit = ex_date, entry = dg_date,
+              breaks = list(per = 1950:2013, age = 1:100, fot = c(0,10,20,Inf)), 
+              aggre = list(fot, agegroup = age, year = per, sex) )
+
## dropped 16 rows where entry == exit
+
se <- sir( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs',
+           ref.data = popmort, ref.rate = 'haz', 
+           adjust = c('agegroup','year','sex'), print ='fot')
+se
+
## SIR (adjusted by agegroup, year, sex) with 95% confidence intervals (profile) 
+## Test for homogeneity: p < 0.001 
+## 
+##  Total sir: 3.08 (2.99-3.17)
+##  Total observed: 4559
+##  Total expected: 1482.13
+##  Total person-years: 39906 
+## 
+## 
+##    fot observed expected     pyrs  sir sir.lo sir.hi p_value
+## 1:   0     4264  1214.54 34445.96 3.51   3.41   3.62   0.000
+## 2:  10      295   267.59  5459.96 1.10   0.98   1.23   0.094
+

Now the estimates for follow-up intervals seems to differ significantly, p = 0. Plotting SMR (S3-method for sir-object) is easily done using default plot-function.

+
plot(se, col = 2:3)
+title('SMR for follow-up categories')
+

+
+
+

splines

+

Lets fit splines for the follow-up time and agegroup using two different options: the splines are fitted in different model and in same model, dependent.splines.

+
c <- lexpand( sire, status = status %in% 1:2, birth = bi_date, exit = ex_date, entry = dg_date,
+              breaks = list(per = 1950:2013, age = 1:100, fot = 0:50), 
+              aggre = list(fot, agegroup = age, year = per, sex) )
+
## dropped 16 rows where entry == exit
+
sf <- sirspline( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs', 
+                 ref.data = popmort, ref.rate = 'haz', 
+                 adjust = c('agegroup','year','sex'),
+                 spline = c('agegroup','fot'), dependent.splines=FALSE)
+
+st <- sirspline( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs', 
+                 ref.data = popmort, ref.rate = 'haz', 
+                 adjust = c('agegroup','year','sex'),
+                 spline = c('agegroup','fot'), dependent.splines = TRUE)
+
+plot(sf, col=2, log=TRUE)
+title('Splines fitted in different models')
+

+
plot(st, col=4, log=TRUE)
+title('Splines are dependent')
+

+

In dependent spline the fot is the ratio with zero time as reference point. Reference points can be alterned. Here agegroup profile is assumed to be same for every follow-up time. SMR is 0.2 times from 0 to 10 years of follow-up.

+

Splines can also be stratified using the print argument. For example we split the death time in two time periods and test if the agegroup splines are equal.

+
c$year.cat <- ifelse(c$year < 2002, 1, 2)
+sy <- sirspline( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs', 
+                 ref.data = popmort, ref.rate = 'haz', 
+                 adjust = c('agegroup','year','sex'),
+                 spline = c('agegroup'), print = 'year.cat')
+plot(sy, log=TRUE)
+legend('topright', c('before 2002','after 2002'), lty=1, col=c(1,2))
+

+

For category before 2002 the SMR seems to be higher after the age of 50. Also the p-value (<0.0001) indicates that there is a difference in age group trends before and after year 2002. P-value is a likelihood ratio test that compares models where splines are fitted together and separately.

+
print(sy)
+
## agegroup: p < 0.001
+## NA: p = NA
+## NA: p = NA
+## 
+##   levels colour
+## 1      1  black
+## 2      2    red
+
+
+ + + + +
+ + + + + + + + diff --git a/inst/doc/survtab_examples.R b/inst/doc/survtab_examples.R index 31f3161..6657835 100644 --- a/inst/doc/survtab_examples.R +++ b/inst/doc/survtab_examples.R @@ -1,170 +1,170 @@ -## ----pkgs, eval = TRUE, echo = TRUE, message = FALSE--------------------- -library(popEpi) -library(Epi) -library(survival) - -## ------------------------------------------------------------------------ -data(sire) - -## NOTE: recommended to use factor status variable -x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), - exit = list(CAL = get.yrs(ex_date)), - data = sire[sire$dg_date < sire$ex_date, ], - exit.status = factor(status, levels = 0:2, - labels = c("alive", "canD", "othD")), - merge = TRUE) - -## pretend some are male -set.seed(1L) -x$sex <- rbinom(nrow(x), 1, 0.5) - -## observed survival - explicit method -st <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, - surv.type = "surv.obs", - breaks = list(FUT = seq(0, 5, 1/12))) - -## observed survival - easy method (assumes lex.Xst in x is the status variable) -st <- survtab(FUT ~ sex, data = x, - surv.type = "surv.obs", - breaks = list(FUT = seq(0, 5, 1/12))) - -## printing gives the used settings and -## estimates at the middle and end of the estimated -## curves; more information available using summary() -st - - -## ------------------------------------------------------------------------ -plot(st, col = c("blue", "red")) - -## ----popmort------------------------------------------------------------- -data(popmort) -pm <- data.frame(popmort) -names(pm) <- c("sex", "CAL", "AGE", "haz") -head(pm) - -## ----survtab_e2---------------------------------------------------------- -st.e2 <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, - surv.type = "surv.rel", relsurv.method = "e2", - breaks = list(FUT = seq(0, 5, 1/12)), - pophaz = pm) - -## ------------------------------------------------------------------------ -plot(st.e2, y = "r.e2", col = c("blue", "red")) - -## ----survtab_pp---------------------------------------------------------- -st.pp <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, - surv.type = "surv.rel", relsurv.method = "pp", - breaks = list(FUT = seq(0, 5, 1/12)), - pophaz = pm) - -## ------------------------------------------------------------------------ -plot(st.e2, y = "r.e2", col = c("blue", "red"), lty = 1) -lines(st.pp, y = "r.pp", col = c("blue", "red"), lty = 2) - -## ----survtab_adjust------------------------------------------------------ -## an age group variable -x$agegr <- cut(x$dg_age, c(0, 60, 70, 80, Inf), right = FALSE) - -## using "internal weights" - see ?ICSS for international weights standards -w <- table(x$agegr) -w - -w <- list(agegr = as.numeric(w)) - -## ----survtab_adjust_2---------------------------------------------------- -st.as <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex + adjust(agegr), - data = x, weights = w, - surv.type = "surv.rel", relsurv.method = "e2", - breaks = list(FUT = seq(0, 5, 1/12)), - pophaz = pm) - -## ------------------------------------------------------------------------ -plot(st.as, y = "r.e2.as", col = c("blue", "red")) - -## ----weights_examples, eval = TRUE--------------------------------------- -list(sex = c(0.4, 0.6), agegr = c(0.2, 0.2, 0.4, 0.2)) - -wdf <- merge(0:1, 1:4) -names(wdf) <- c("sex", "agegr") -wdf$weights <- c(0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.1, 0.1) -wdf - -## ----survtab_adjust_3---------------------------------------------------- -st.as <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, - adjust = "agegr", - data = x, weights = w, - surv.type = "surv.rel", relsurv.method = "e2", - breaks = list(FUT = seq(0, 5, 1/12)), - pophaz = pm) - -## ----survtab_cause------------------------------------------------------- -st.ca <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, - data = x, - surv.type = "surv.cause", - breaks = list(FUT = seq(0, 5, 1/12))) - -st.pp <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, data = x, - surv.type = "surv.rel", relsurv.method = "pp", - breaks = list(FUT = seq(0, 5, 1/12)), - pophaz = pm) - -plot(st.ca, y = "surv.obs.canD", col = "blue") -lines(st.pp, y = "r.pp", col = "red") - -## ----survtab_cif--------------------------------------------------------- -st.cif <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, - data = x, - surv.type = "cif.obs", - breaks = list(FUT = seq(0, 5, 1/12))) - -plot(st.cif, y = "CIF_canD", conf.int = FALSE) -lines(st.cif, y = "CIF_othD", conf.int = FALSE, col = "red") - -## ----survtab_relcif------------------------------------------------------ -st.cir <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, - data = x, - surv.type = "cif.rel", - breaks = list(FUT = seq(0, 5, 1/12)), - pophaz = pm) -plot(st.cif, y = "CIF_canD", conf.int = FALSE, col = "blue") -lines(st.cir, y = "CIF.rel", conf.int = FALSE, col = "red") - -## ------------------------------------------------------------------------ -sire$sex <- rbinom(nrow(sire), size = 1, prob = 0.5) -ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", - status = "status", breaks = list(fot = seq(0, 5, 1/12)), - aggre = list(sex, fot)) -head(ag) - -## ----survtab_ag_example1------------------------------------------------- -st <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.obs", - surv.method = "hazard", - d = c("from0to1", "from0to2"), pyrs = "pyrs") - -## ----survtab_ag_example2------------------------------------------------- -st <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.obs", - surv.method = "lifetable", - d = c("from0to1", "from0to2"), n = "at.risk", - n.cens = "from0to0") - -## ----survtab_ag_cause---------------------------------------------------- -st.ca <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.cause", - surv.method = "hazard", - d = list(canD = from0to1, othD = from0to2), pyrs = "pyrs") -plot(st.ca, y = "surv.obs.canD", col = c("blue", "red")) - -## ------------------------------------------------------------------------ -ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", - status = "status", breaks = list(fot = seq(0, 5, 1/12)), - pophaz = popmort, pp = TRUE, - aggre = list(sex, fot)) - -st.pp <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.rel", - surv.method = "hazard", relsurv.method = "pp", - d = list(from0to1 + from0to2), pyrs = "pyrs", - d.pp = list(from0to1.pp + from0to2.pp), - d.pp.2 = list(from0to1.pp.2 + from0to2.pp.2), - pyrs.pp = "ptime.pp", d.exp.pp = "d.exp.pp") -plot(st.pp, y = "r.pp", col = c("blue", "red")) - +## ----pkgs, eval = TRUE, echo = TRUE, message = FALSE--------------------- +library(popEpi) +library(Epi) +library(survival) + +## ------------------------------------------------------------------------ +data(sire) + +## NOTE: recommended to use factor status variable +x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), + exit = list(CAL = get.yrs(ex_date)), + data = sire[sire$dg_date < sire$ex_date, ], + exit.status = factor(status, levels = 0:2, + labels = c("alive", "canD", "othD")), + merge = TRUE) + +## pretend some are male +set.seed(1L) +x$sex <- rbinom(nrow(x), 1, 0.5) + +## observed survival - explicit method +st <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, + surv.type = "surv.obs", + breaks = list(FUT = seq(0, 5, 1/12))) + +## observed survival - easy method (assumes lex.Xst in x is the status variable) +st <- survtab(FUT ~ sex, data = x, + surv.type = "surv.obs", + breaks = list(FUT = seq(0, 5, 1/12))) + +## printing gives the used settings and +## estimates at the middle and end of the estimated +## curves; more information available using summary() +st + + +## ------------------------------------------------------------------------ +plot(st, col = c("blue", "red")) + +## ----popmort------------------------------------------------------------- +data(popmort) +pm <- data.frame(popmort) +names(pm) <- c("sex", "CAL", "AGE", "haz") +head(pm) + +## ----survtab_e2---------------------------------------------------------- +st.e2 <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, + surv.type = "surv.rel", relsurv.method = "e2", + breaks = list(FUT = seq(0, 5, 1/12)), + pophaz = pm) + +## ------------------------------------------------------------------------ +plot(st.e2, y = "r.e2", col = c("blue", "red")) + +## ----survtab_pp---------------------------------------------------------- +st.pp <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, + surv.type = "surv.rel", relsurv.method = "pp", + breaks = list(FUT = seq(0, 5, 1/12)), + pophaz = pm) + +## ------------------------------------------------------------------------ +plot(st.e2, y = "r.e2", col = c("blue", "red"), lty = 1) +lines(st.pp, y = "r.pp", col = c("blue", "red"), lty = 2) + +## ----survtab_adjust------------------------------------------------------ +## an age group variable +x$agegr <- cut(x$dg_age, c(0, 60, 70, 80, Inf), right = FALSE) + +## using "internal weights" - see ?ICSS for international weights standards +w <- table(x$agegr) +w + +w <- list(agegr = as.numeric(w)) + +## ----survtab_adjust_2---------------------------------------------------- +st.as <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex + adjust(agegr), + data = x, weights = w, + surv.type = "surv.rel", relsurv.method = "e2", + breaks = list(FUT = seq(0, 5, 1/12)), + pophaz = pm) + +## ------------------------------------------------------------------------ +plot(st.as, y = "r.e2.as", col = c("blue", "red")) + +## ----weights_examples, eval = TRUE--------------------------------------- +list(sex = c(0.4, 0.6), agegr = c(0.2, 0.2, 0.4, 0.2)) + +wdf <- merge(0:1, 1:4) +names(wdf) <- c("sex", "agegr") +wdf$weights <- c(0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.1, 0.1) +wdf + +## ----survtab_adjust_3---------------------------------------------------- +st.as <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, + adjust = "agegr", + data = x, weights = w, + surv.type = "surv.rel", relsurv.method = "e2", + breaks = list(FUT = seq(0, 5, 1/12)), + pophaz = pm) + +## ----survtab_cause------------------------------------------------------- +st.ca <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, + data = x, + surv.type = "surv.cause", + breaks = list(FUT = seq(0, 5, 1/12))) + +st.pp <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, data = x, + surv.type = "surv.rel", relsurv.method = "pp", + breaks = list(FUT = seq(0, 5, 1/12)), + pophaz = pm) + +plot(st.ca, y = "surv.obs.canD", col = "blue") +lines(st.pp, y = "r.pp", col = "red") + +## ----survtab_cif--------------------------------------------------------- +st.cif <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, + data = x, + surv.type = "cif.obs", + breaks = list(FUT = seq(0, 5, 1/12))) + +plot(st.cif, y = "CIF_canD", conf.int = FALSE) +lines(st.cif, y = "CIF_othD", conf.int = FALSE, col = "red") + +## ----survtab_relcif------------------------------------------------------ +st.cir <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, + data = x, + surv.type = "cif.rel", + breaks = list(FUT = seq(0, 5, 1/12)), + pophaz = pm) +plot(st.cif, y = "CIF_canD", conf.int = FALSE, col = "blue") +lines(st.cir, y = "CIF.rel", conf.int = FALSE, col = "red") + +## ------------------------------------------------------------------------ +sire$sex <- rbinom(nrow(sire), size = 1, prob = 0.5) +ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", + status = "status", breaks = list(fot = seq(0, 5, 1/12)), + aggre = list(sex, fot)) +head(ag) + +## ----survtab_ag_example1------------------------------------------------- +st <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.obs", + surv.method = "hazard", + d = c("from0to1", "from0to2"), pyrs = "pyrs") + +## ----survtab_ag_example2------------------------------------------------- +st <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.obs", + surv.method = "lifetable", + d = c("from0to1", "from0to2"), n = "at.risk", + n.cens = "from0to0") + +## ----survtab_ag_cause---------------------------------------------------- +st.ca <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.cause", + surv.method = "hazard", + d = list(canD = from0to1, othD = from0to2), pyrs = "pyrs") +plot(st.ca, y = "surv.obs.canD", col = c("blue", "red")) + +## ------------------------------------------------------------------------ +ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", + status = "status", breaks = list(fot = seq(0, 5, 1/12)), + pophaz = popmort, pp = TRUE, + aggre = list(sex, fot)) + +st.pp <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.rel", + surv.method = "hazard", relsurv.method = "pp", + d = list(from0to1 + from0to2), pyrs = "pyrs", + d.pp = list(from0to1.pp + from0to2.pp), + d.pp.2 = list(from0to1.pp.2 + from0to2.pp.2), + pyrs.pp = "ptime.pp", d.exp.pp = "d.exp.pp") +plot(st.pp, y = "r.pp", col = c("blue", "red")) + diff --git a/inst/doc/survtab_examples.Rmd b/inst/doc/survtab_examples.Rmd index 66c1851..0db0570 100644 --- a/inst/doc/survtab_examples.Rmd +++ b/inst/doc/survtab_examples.Rmd @@ -1,296 +1,296 @@ ---- -title: "Examples of using survtab" -author: "Joonas Miettinen" -date: "`r Sys.Date()`" -output: - html_document: - toc: true - toc_depth: 2 - fig_width: 6 - fig_height: 6 -vignette: > - %\VignetteEngine{knitr::rmarkdown} - %\VignetteIndexEntry{survtab examples} - %\usepackage[utf8]{inputenc} ---- - -# Overview - -This vignette aims to clarify the usage of the `survtab_ag` and `survtab` functions included in this package. `survtab_ag` estimates various survival functions and cumulative incidence functions (CIFs) non-parametrically using aggregated data, and `survtab` is a wrapper for `survtab_ag`, to which `Lexis` data is supplied. - -Two methods (`surv.method`) are currently supported: The `"lifetable"` (actuarial) method only makes use of counts when estimating any of the supported survival time functions. The default method (`"hazard"`}) estimates appropriate hazards and transforms them into survival function or CIF estimates. - -For relative survival estimation we need also to enumerate the expected hazard levels for the subjects in the data. This is done by merging expected hazards to individuals' subintervals (which divide their survival time lines to a number of small intervals). For Pohar-Perme-weighted analyses one must additionally compute various weighted figures at the level of split subject data. - -If one has subject-level data, the simplest way of computing survival function estimates with `popEpi` is by defining a `Lexis` object and using `survtab`, which will do the rest. For pre-aggregated data one may use the `survtab_ag` function instead. One can also use the `lexpand` function to split, merge population hazards, and aggregate in a single function call and then use `survtab_ag` if that is convenient. - -# Using `survtab` - -It is straightforward to estimate various survival time functions with `survtab` once a `Lexis` object has been defined (see `?Lexis` in package `Epi` for details): - -```{r pkgs, eval = TRUE, echo = TRUE, message = FALSE} -library(popEpi) -library(Epi) -library(survival) -``` - -```{r} -data(sire) - -## NOTE: recommended to use factor status variable -x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), - exit = list(CAL = get.yrs(ex_date)), - data = sire[sire$dg_date < sire$ex_date, ], - exit.status = factor(status, levels = 0:2, - labels = c("alive", "canD", "othD")), - merge = TRUE) - -## pretend some are male -set.seed(1L) -x$sex <- rbinom(nrow(x), 1, 0.5) - -## observed survival - explicit method -st <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, - surv.type = "surv.obs", - breaks = list(FUT = seq(0, 5, 1/12))) - -## observed survival - easy method (assumes lex.Xst in x is the status variable) -st <- survtab(FUT ~ sex, data = x, - surv.type = "surv.obs", - breaks = list(FUT = seq(0, 5, 1/12))) - -## printing gives the used settings and -## estimates at the middle and end of the estimated -## curves; more information available using summary() -st - -``` - -Plotting by strata (men = blue, women = red): - -```{r} -plot(st, col = c("blue", "red")) -``` - -Note that the correct usage of the `formula` argument in `survtab` specifies the time scale in the `Lexis` object over which survival is computed (here `"FUT"` for follow-up time). This is used to identify the appropriate time scale in the data. When only supplying the survival time scale as the right-hand-side of the formula, the column `lex.Xst` in the supplied `Lexis` object is assumed to be the (correctly formatted!) status variable. When using `Surv()` to be explicit, we effectively (and exceptionally) pass the starting times to the `time` argument in `Surv()`, and `time2` is ignored entirely. The function will fail if `time` does not match exactly with a time scale in data. - -When using `Surv()`, one must also pass the status variable, which can be something other than the `lex.Xst` variable created by `Lexis()`, though usually ``lex.Xst` is what you want to use (especially if the data has already been split using e.g. `splitLexis` or `splitMulti`, which is allowed). It is recommended to use a factor status variable to pass to `Surv()`, though a numeric variable will work in simple cases (0 = alive, 1 = dead; also `FALSE` = alive, `TRUE` = dead). Using `Surv()` also allows easy passing of transformations of `lex.Xst`, e.g. `Surv(FUT, lex.Xst %in% 1:2)`. - -The argument `breaks` must be a named list of breaks by which to split the `Lexis` data (see `?splitMulti`). It is mandatory to assign breaks at least to the survival time scale (`"FUT"` in our example) so that `survtab` knows what intervals to use to estimate the requested survival time function(s). The breaks also determine the window used: It is therefore easy to compute so called period estimates by defining the roof and floor along the calendar time scale, e.g. - -`breaks = list(FUT = seq(0, 5, 1/12), CAL = c(2000, 2005))` - -would cause `survtab` to compute period estimates for 2000-2004 (breaks given here as fractional years, so 2005 is effectively 2004.99999...). - -## Relative/net survival - -Relative/net survival estimation requires knowledge of the expected hazard levels for the individuals in the data. In `survtab` this is accomplished by passing a long-formt `data.frame` of population hazards via the `pophaz` argument. E.g. the `popmort` dataset included in `popEpi` (Finnish overall mortality rates for men and women). - -```{r popmort} -data(popmort) -pm <- data.frame(popmort) -names(pm) <- c("sex", "CAL", "AGE", "haz") -head(pm) -``` - -The `data.frame` should contain a variable named `"haz"` indicating the population hazard at the level of one subject-year. Any other variables are considered to be variables, by which to merge population hazards to the (split) subject-level data within `survtab`. These merging variables may correspond to the time scales in the used `Lexis` object. This allows for e.g. merging in different population hazards for the same subject as they get older. - -The following causes `survtab` to estimate EdererII relative survival: - -```{r survtab_e2} -st.e2 <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, - surv.type = "surv.rel", relsurv.method = "e2", - breaks = list(FUT = seq(0, 5, 1/12)), - pophaz = pm) -``` - -```{r} -plot(st.e2, y = "r.e2", col = c("blue", "red")) -``` - -Note that the curves diverge due to merging in the "wrong" population hazards for some individuals which we randomized earlier to be male though all the individuals in data are actually female. Pohar-Perme-weighted estimates can be computed by - -```{r survtab_pp} -st.pp <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, - surv.type = "surv.rel", relsurv.method = "pp", - breaks = list(FUT = seq(0, 5, 1/12)), - pophaz = pm) -``` - -Compare with EdererII estimates: - -```{r} -plot(st.e2, y = "r.e2", col = c("blue", "red"), lty = 1) -lines(st.pp, y = "r.pp", col = c("blue", "red"), lty = 2) -``` - -## Adjusting estimates - -`survtab` also allows for adjusting the survival curves by categorical variables --- typically by age groups. The following demonstrates how: - -```{r survtab_adjust} -## an age group variable -x$agegr <- cut(x$dg_age, c(0, 60, 70, 80, Inf), right = FALSE) - -## using "internal weights" - see ?ICSS for international weights standards -w <- table(x$agegr) -w - -w <- list(agegr = as.numeric(w)) -``` - -```{r survtab_adjust_2} -st.as <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex + adjust(agegr), - data = x, weights = w, - surv.type = "surv.rel", relsurv.method = "e2", - breaks = list(FUT = seq(0, 5, 1/12)), - pophaz = pm) -``` - -```{r} -plot(st.as, y = "r.e2.as", col = c("blue", "red")) -``` - -We now have age-adjusted EdererII relative/net survival estimates. The `weights` argument allows for either a list of weigths (with one or multiple variables to adjust by) or a `data.frame` of weights. Examples: - -```{r weights_examples, eval = TRUE} -list(sex = c(0.4, 0.6), agegr = c(0.2, 0.2, 0.4, 0.2)) - -wdf <- merge(0:1, 1:4) -names(wdf) <- c("sex", "agegr") -wdf$weights <- c(0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.1, 0.1) -wdf -``` - -The weights do not have to sum to one when supplied as they are internally forced to do so within each stratum. In the `data.frame` of weights, the column of actual weights to use must be named "weights". When there are more than one variable to adjust by, and a list of weights has been supplied, the variable-specific weights are first multiplied together (cumulatively) and then scaled to sum to one. - -This adjusting can be done to any survival time function that `survtab` (and `survtab_ag`) estimates. One can also supply adjusting variables via the `adjust` argument if convenient: - -```{r survtab_adjust_3} -st.as <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, - adjust = "agegr", - data = x, weights = w, - surv.type = "surv.rel", relsurv.method = "e2", - breaks = list(FUT = seq(0, 5, 1/12)), - pophaz = pm) -``` - -Where `adjust` could also be `adjust = agegr`, `adjust = list(agegr)` or - -`adjust = list(agegr = cut(dg_age, c(0, 60, 70, 80, Inf), right = FALSE))` - -for exactly the same results. When adjusting by multiple variables, one must supply a vector of variable names in data or a list of multiple elements (as in the base function `aggregate`). - -## Other survival time functions - -One can also estimate cause-specific survival functions, cumulative incidence functions (CIFs, a.k.a. crude risk a.k.a. absolute risk functions), and CIFs based on the excess numbers of events. Cause-specific survival is close to net survival as they are philosophically highly similar concepts: - -```{r survtab_cause} -st.ca <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, - data = x, - surv.type = "surv.cause", - breaks = list(FUT = seq(0, 5, 1/12))) - -st.pp <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, data = x, - surv.type = "surv.rel", relsurv.method = "pp", - breaks = list(FUT = seq(0, 5, 1/12)), - pophaz = pm) - -plot(st.ca, y = "surv.obs.canD", col = "blue") -lines(st.pp, y = "r.pp", col = "red") -``` - -Absolute risk: - -```{r survtab_cif} -st.cif <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, - data = x, - surv.type = "cif.obs", - breaks = list(FUT = seq(0, 5, 1/12))) - -plot(st.cif, y = "CIF_canD", conf.int = FALSE) -lines(st.cif, y = "CIF_othD", conf.int = FALSE, col = "red") -``` - -The "relative CIF" attempts to be close to the true CIF without using knowledge about the types of events, e.g. causes of death: - -```{r survtab_relcif} -st.cir <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, - data = x, - surv.type = "cif.rel", - breaks = list(FUT = seq(0, 5, 1/12)), - pophaz = pm) -plot(st.cif, y = "CIF_canD", conf.int = FALSE, col = "blue") -lines(st.cir, y = "CIF.rel", conf.int = FALSE, col = "red") -``` - - -# Using `survtab_ag` - -Arguments concerning the types and methods of estimating of survival time functions work the same in `survtab_ag` as in `survtab` (the latter uses the former). However, with aggregated data one must explicitly supply the various count and person-time variables. Also, usage of the `formula` argument is different. - -For demonstration purposes we form an aggregated data set using `lexpand`; see `?lexpand` for more information on that function. - -```{r} -sire$sex <- rbinom(nrow(sire), size = 1, prob = 0.5) -ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", - status = "status", breaks = list(fot = seq(0, 5, 1/12)), - aggre = list(sex, fot)) -head(ag) -``` - -Now simply do: - -```{r survtab_ag_example1} -st <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.obs", - surv.method = "hazard", - d = c("from0to1", "from0to2"), pyrs = "pyrs") -``` - -Or: - -```{r survtab_ag_example2} -st <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.obs", - surv.method = "lifetable", - d = c("from0to1", "from0to2"), n = "at.risk", - n.cens = "from0to0") -``` - -Note that e.g. argument `d` could also have been supplied as - -`list(from0to1, from0to2)` - -or - -`list(canD = from0to1, othD = from0to2)` - -for identical results. The last is convenient for e.g. `surv.cause` computations: - -```{r survtab_ag_cause} -st.ca <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.cause", - surv.method = "hazard", - d = list(canD = from0to1, othD = from0to2), pyrs = "pyrs") -plot(st.ca, y = "surv.obs.canD", col = c("blue", "red")) -``` - -One has to supply the most variables when computing Pohar-Perme estimates (though it is probably rare to have third-source aggregated data with Pohar-Perme weighted figures, it is implemented here to be used as a workhorse for `survtab`). For this we must aggregate again to get the Pohar-Perme weighted counts and subject-times: - -```{r} -ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", - status = "status", breaks = list(fot = seq(0, 5, 1/12)), - pophaz = popmort, pp = TRUE, - aggre = list(sex, fot)) - -st.pp <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.rel", - surv.method = "hazard", relsurv.method = "pp", - d = list(from0to1 + from0to2), pyrs = "pyrs", - d.pp = list(from0to1.pp + from0to2.pp), - d.pp.2 = list(from0to1.pp.2 + from0to2.pp.2), - pyrs.pp = "ptime.pp", d.exp.pp = "d.exp.pp") -plot(st.pp, y = "r.pp", col = c("blue", "red")) -``` - -Here it is best to supply only one column to each argument since Pohar-Perme estimates will not be computed for several types of events at the same time. - - - +--- +title: "Examples of using survtab" +author: "Joonas Miettinen" +date: "`r Sys.Date()`" +output: + html_document: + toc: true + toc_depth: 2 + fig_width: 6 + fig_height: 6 +vignette: > + %\VignetteEngine{knitr::rmarkdown} + %\VignetteIndexEntry{survtab examples} + %\usepackage[utf8]{inputenc} +--- + +# Overview + +This vignette aims to clarify the usage of the `survtab_ag` and `survtab` functions included in this package. `survtab_ag` estimates various survival functions and cumulative incidence functions (CIFs) non-parametrically using aggregated data, and `survtab` is a wrapper for `survtab_ag`, to which `Lexis` data is supplied. + +Two methods (`surv.method`) are currently supported: The `"lifetable"` (actuarial) method only makes use of counts when estimating any of the supported survival time functions. The default method (`"hazard"`}) estimates appropriate hazards and transforms them into survival function or CIF estimates. + +For relative survival estimation we need also to enumerate the expected hazard levels for the subjects in the data. This is done by merging expected hazards to individuals' subintervals (which divide their survival time lines to a number of small intervals). For Pohar-Perme-weighted analyses one must additionally compute various weighted figures at the level of split subject data. + +If one has subject-level data, the simplest way of computing survival function estimates with `popEpi` is by defining a `Lexis` object and using `survtab`, which will do the rest. For pre-aggregated data one may use the `survtab_ag` function instead. One can also use the `lexpand` function to split, merge population hazards, and aggregate in a single function call and then use `survtab_ag` if that is convenient. + +# Using `survtab` + +It is straightforward to estimate various survival time functions with `survtab` once a `Lexis` object has been defined (see `?Lexis` in package `Epi` for details): + +```{r pkgs, eval = TRUE, echo = TRUE, message = FALSE} +library(popEpi) +library(Epi) +library(survival) +``` + +```{r} +data(sire) + +## NOTE: recommended to use factor status variable +x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), + exit = list(CAL = get.yrs(ex_date)), + data = sire[sire$dg_date < sire$ex_date, ], + exit.status = factor(status, levels = 0:2, + labels = c("alive", "canD", "othD")), + merge = TRUE) + +## pretend some are male +set.seed(1L) +x$sex <- rbinom(nrow(x), 1, 0.5) + +## observed survival - explicit method +st <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, + surv.type = "surv.obs", + breaks = list(FUT = seq(0, 5, 1/12))) + +## observed survival - easy method (assumes lex.Xst in x is the status variable) +st <- survtab(FUT ~ sex, data = x, + surv.type = "surv.obs", + breaks = list(FUT = seq(0, 5, 1/12))) + +## printing gives the used settings and +## estimates at the middle and end of the estimated +## curves; more information available using summary() +st + +``` + +Plotting by strata (men = blue, women = red): + +```{r} +plot(st, col = c("blue", "red")) +``` + +Note that the correct usage of the `formula` argument in `survtab` specifies the time scale in the `Lexis` object over which survival is computed (here `"FUT"` for follow-up time). This is used to identify the appropriate time scale in the data. When only supplying the survival time scale as the right-hand-side of the formula, the column `lex.Xst` in the supplied `Lexis` object is assumed to be the (correctly formatted!) status variable. When using `Surv()` to be explicit, we effectively (and exceptionally) pass the starting times to the `time` argument in `Surv()`, and `time2` is ignored entirely. The function will fail if `time` does not match exactly with a time scale in data. + +When using `Surv()`, one must also pass the status variable, which can be something other than the `lex.Xst` variable created by `Lexis()`, though usually ``lex.Xst` is what you want to use (especially if the data has already been split using e.g. `splitLexis` or `splitMulti`, which is allowed). It is recommended to use a factor status variable to pass to `Surv()`, though a numeric variable will work in simple cases (0 = alive, 1 = dead; also `FALSE` = alive, `TRUE` = dead). Using `Surv()` also allows easy passing of transformations of `lex.Xst`, e.g. `Surv(FUT, lex.Xst %in% 1:2)`. + +The argument `breaks` must be a named list of breaks by which to split the `Lexis` data (see `?splitMulti`). It is mandatory to assign breaks at least to the survival time scale (`"FUT"` in our example) so that `survtab` knows what intervals to use to estimate the requested survival time function(s). The breaks also determine the window used: It is therefore easy to compute so called period estimates by defining the roof and floor along the calendar time scale, e.g. + +`breaks = list(FUT = seq(0, 5, 1/12), CAL = c(2000, 2005))` + +would cause `survtab` to compute period estimates for 2000-2004 (breaks given here as fractional years, so 2005 is effectively 2004.99999...). + +## Relative/net survival + +Relative/net survival estimation requires knowledge of the expected hazard levels for the individuals in the data. In `survtab` this is accomplished by passing a long-formt `data.frame` of population hazards via the `pophaz` argument. E.g. the `popmort` dataset included in `popEpi` (Finnish overall mortality rates for men and women). + +```{r popmort} +data(popmort) +pm <- data.frame(popmort) +names(pm) <- c("sex", "CAL", "AGE", "haz") +head(pm) +``` + +The `data.frame` should contain a variable named `"haz"` indicating the population hazard at the level of one subject-year. Any other variables are considered to be variables, by which to merge population hazards to the (split) subject-level data within `survtab`. These merging variables may correspond to the time scales in the used `Lexis` object. This allows for e.g. merging in different population hazards for the same subject as they get older. + +The following causes `survtab` to estimate EdererII relative survival: + +```{r survtab_e2} +st.e2 <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, + surv.type = "surv.rel", relsurv.method = "e2", + breaks = list(FUT = seq(0, 5, 1/12)), + pophaz = pm) +``` + +```{r} +plot(st.e2, y = "r.e2", col = c("blue", "red")) +``` + +Note that the curves diverge due to merging in the "wrong" population hazards for some individuals which we randomized earlier to be male though all the individuals in data are actually female. Pohar-Perme-weighted estimates can be computed by + +```{r survtab_pp} +st.pp <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, + surv.type = "surv.rel", relsurv.method = "pp", + breaks = list(FUT = seq(0, 5, 1/12)), + pophaz = pm) +``` + +Compare with EdererII estimates: + +```{r} +plot(st.e2, y = "r.e2", col = c("blue", "red"), lty = 1) +lines(st.pp, y = "r.pp", col = c("blue", "red"), lty = 2) +``` + +## Adjusting estimates + +`survtab` also allows for adjusting the survival curves by categorical variables --- typically by age groups. The following demonstrates how: + +```{r survtab_adjust} +## an age group variable +x$agegr <- cut(x$dg_age, c(0, 60, 70, 80, Inf), right = FALSE) + +## using "internal weights" - see ?ICSS for international weights standards +w <- table(x$agegr) +w + +w <- list(agegr = as.numeric(w)) +``` + +```{r survtab_adjust_2} +st.as <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex + adjust(agegr), + data = x, weights = w, + surv.type = "surv.rel", relsurv.method = "e2", + breaks = list(FUT = seq(0, 5, 1/12)), + pophaz = pm) +``` + +```{r} +plot(st.as, y = "r.e2.as", col = c("blue", "red")) +``` + +We now have age-adjusted EdererII relative/net survival estimates. The `weights` argument allows for either a list of weigths (with one or multiple variables to adjust by) or a `data.frame` of weights. Examples: + +```{r weights_examples, eval = TRUE} +list(sex = c(0.4, 0.6), agegr = c(0.2, 0.2, 0.4, 0.2)) + +wdf <- merge(0:1, 1:4) +names(wdf) <- c("sex", "agegr") +wdf$weights <- c(0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.1, 0.1) +wdf +``` + +The weights do not have to sum to one when supplied as they are internally forced to do so within each stratum. In the `data.frame` of weights, the column of actual weights to use must be named "weights". When there are more than one variable to adjust by, and a list of weights has been supplied, the variable-specific weights are first multiplied together (cumulatively) and then scaled to sum to one. + +This adjusting can be done to any survival time function that `survtab` (and `survtab_ag`) estimates. One can also supply adjusting variables via the `adjust` argument if convenient: + +```{r survtab_adjust_3} +st.as <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, + adjust = "agegr", + data = x, weights = w, + surv.type = "surv.rel", relsurv.method = "e2", + breaks = list(FUT = seq(0, 5, 1/12)), + pophaz = pm) +``` + +Where `adjust` could also be `adjust = agegr`, `adjust = list(agegr)` or + +`adjust = list(agegr = cut(dg_age, c(0, 60, 70, 80, Inf), right = FALSE))` + +for exactly the same results. When adjusting by multiple variables, one must supply a vector of variable names in data or a list of multiple elements (as in the base function `aggregate`). + +## Other survival time functions + +One can also estimate cause-specific survival functions, cumulative incidence functions (CIFs, a.k.a. crude risk a.k.a. absolute risk functions), and CIFs based on the excess numbers of events. Cause-specific survival is close to net survival as they are philosophically highly similar concepts: + +```{r survtab_cause} +st.ca <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, + data = x, + surv.type = "surv.cause", + breaks = list(FUT = seq(0, 5, 1/12))) + +st.pp <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, data = x, + surv.type = "surv.rel", relsurv.method = "pp", + breaks = list(FUT = seq(0, 5, 1/12)), + pophaz = pm) + +plot(st.ca, y = "surv.obs.canD", col = "blue") +lines(st.pp, y = "r.pp", col = "red") +``` + +Absolute risk: + +```{r survtab_cif} +st.cif <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, + data = x, + surv.type = "cif.obs", + breaks = list(FUT = seq(0, 5, 1/12))) + +plot(st.cif, y = "CIF_canD", conf.int = FALSE) +lines(st.cif, y = "CIF_othD", conf.int = FALSE, col = "red") +``` + +The "relative CIF" attempts to be close to the true CIF without using knowledge about the types of events, e.g. causes of death: + +```{r survtab_relcif} +st.cir <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, + data = x, + surv.type = "cif.rel", + breaks = list(FUT = seq(0, 5, 1/12)), + pophaz = pm) +plot(st.cif, y = "CIF_canD", conf.int = FALSE, col = "blue") +lines(st.cir, y = "CIF.rel", conf.int = FALSE, col = "red") +``` + + +# Using `survtab_ag` + +Arguments concerning the types and methods of estimating of survival time functions work the same in `survtab_ag` as in `survtab` (the latter uses the former). However, with aggregated data one must explicitly supply the various count and person-time variables. Also, usage of the `formula` argument is different. + +For demonstration purposes we form an aggregated data set using `lexpand`; see `?lexpand` for more information on that function. + +```{r} +sire$sex <- rbinom(nrow(sire), size = 1, prob = 0.5) +ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", + status = "status", breaks = list(fot = seq(0, 5, 1/12)), + aggre = list(sex, fot)) +head(ag) +``` + +Now simply do: + +```{r survtab_ag_example1} +st <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.obs", + surv.method = "hazard", + d = c("from0to1", "from0to2"), pyrs = "pyrs") +``` + +Or: + +```{r survtab_ag_example2} +st <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.obs", + surv.method = "lifetable", + d = c("from0to1", "from0to2"), n = "at.risk", + n.cens = "from0to0") +``` + +Note that e.g. argument `d` could also have been supplied as + +`list(from0to1, from0to2)` + +or + +`list(canD = from0to1, othD = from0to2)` + +for identical results. The last is convenient for e.g. `surv.cause` computations: + +```{r survtab_ag_cause} +st.ca <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.cause", + surv.method = "hazard", + d = list(canD = from0to1, othD = from0to2), pyrs = "pyrs") +plot(st.ca, y = "surv.obs.canD", col = c("blue", "red")) +``` + +One has to supply the most variables when computing Pohar-Perme estimates (though it is probably rare to have third-source aggregated data with Pohar-Perme weighted figures, it is implemented here to be used as a workhorse for `survtab`). For this we must aggregate again to get the Pohar-Perme weighted counts and subject-times: + +```{r} +ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", + status = "status", breaks = list(fot = seq(0, 5, 1/12)), + pophaz = popmort, pp = TRUE, + aggre = list(sex, fot)) + +st.pp <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.rel", + surv.method = "hazard", relsurv.method = "pp", + d = list(from0to1 + from0to2), pyrs = "pyrs", + d.pp = list(from0to1.pp + from0to2.pp), + d.pp.2 = list(from0to1.pp.2 + from0to2.pp.2), + pyrs.pp = "ptime.pp", d.exp.pp = "d.exp.pp") +plot(st.pp, y = "r.pp", col = c("blue", "red")) +``` + +Here it is best to supply only one column to each argument since Pohar-Perme estimates will not be computed for several types of events at the same time. + + + diff --git a/inst/doc/survtab_examples.html b/inst/doc/survtab_examples.html index 0ed884d..9530c2d 100644 --- a/inst/doc/survtab_examples.html +++ b/inst/doc/survtab_examples.html @@ -1,411 +1,411 @@ - - - - - - - - - - - - - - - -Examples of using survtab - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - - -
-

Overview

-

This vignette aims to clarify the usage of the survtab_ag and survtab functions included in this package. survtab_ag estimates various survival functions and cumulative incidence functions (CIFs) non-parametrically using aggregated data, and survtab is a wrapper for survtab_ag, to which Lexis data is supplied.

-

Two methods (surv.method) are currently supported: The "lifetable" (actuarial) method only makes use of counts when estimating any of the supported survival time functions. The default method ("hazard"}) estimates appropriate hazards and transforms them into survival function or CIF estimates.

-

For relative survival estimation we need also to enumerate the expected hazard levels for the subjects in the data. This is done by merging expected hazards to individuals’ subintervals (which divide their survival time lines to a number of small intervals). For Pohar-Perme-weighted analyses one must additionally compute various weighted figures at the level of split subject data.

-

If one has subject-level data, the simplest way of computing survival function estimates with popEpi is by defining a Lexis object and using survtab, which will do the rest. For pre-aggregated data one may use the survtab_ag function instead. One can also use the lexpand function to split, merge population hazards, and aggregate in a single function call and then use survtab_ag if that is convenient.

-
-
-

Using survtab

-

It is straightforward to estimate various survival time functions with survtab once a Lexis object has been defined (see ?Lexis in package Epi for details):

-
library(popEpi)
-library(Epi)
-library(survival)
-
data(sire)
-
-## NOTE: recommended to use factor status variable
-x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), 
-           exit = list(CAL = get.yrs(ex_date)), 
-           data = sire[sire$dg_date < sire$ex_date, ],
-           exit.status = factor(status, levels = 0:2, 
-                                labels = c("alive", "canD", "othD")), 
-           merge = TRUE)
-
## NOTE: entry.status has been set to "alive" for all.
-
## pretend some are male
-set.seed(1L)
-x$sex <- rbinom(nrow(x), 1, 0.5)
-
-## observed survival - explicit method
-st <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, 
-              surv.type = "surv.obs",
-              breaks = list(FUT = seq(0, 5, 1/12)))
-
-## observed survival - easy method (assumes lex.Xst in x is the status variable)
-st <- survtab(FUT ~ sex, data = x, 
-              surv.type = "surv.obs",
-              breaks = list(FUT = seq(0, 5, 1/12)))
-
-## printing gives the used settings and 
-## estimates at the middle and end of the estimated
-## curves; more information available using summary()
-st
-
## 
-## Call: 
-##  survtab(formula = FUT ~ sex, data = x, breaks = list(FUT = seq(0, 5, 1/12)), surv.type = "surv.obs") 
-## 
-## Type arguments: 
-##  surv.type: surv.obs --- surv.method: hazard
-##  
-## Confidence interval arguments: 
-##  level: 95 % --- transformation: log-log
-##  
-## Totals:
-##  person-time:23993 --- events: 3636
-##  
-## Stratified by: 'sex'
-##    sex Tstop surv.obs.lo surv.obs surv.obs.hi SE.surv.obs
-## 1:   0   2.5      0.6174   0.6328      0.6478    0.007751
-## 2:   0   5.0      0.4962   0.5126      0.5288    0.008321
-## 3:   1   2.5      0.6235   0.6389      0.6539    0.007748
-## 4:   1   5.0      0.5006   0.5171      0.5334    0.008370
-

Plotting by strata (men = blue, women = red):

-
plot(st, col = c("blue", "red"))
-

-

Note that the correct usage of the formula argument in survtab specifies the time scale in the Lexis object over which survival is computed (here "FUT" for follow-up time). This is used to identify the appropriate time scale in the data. When only supplying the survival time scale as the right-hand-side of the formula, the column lex.Xst in the supplied Lexis object is assumed to be the (correctly formatted!) status variable. When using Surv() to be explicit, we effectively (and exceptionally) pass the starting times to the time argument in Surv(), and time2 is ignored entirely. The function will fail if time does not match exactly with a time scale in data.

-

When using Surv(), one must also pass the status variable, which can be something other than the lex.Xst variable created by Lexis(), though usually `lex.Xst is what you want to use (especially if the data has already been split using e.g. splitLexis or splitMulti, which is allowed). It is recommended to use a factor status variable to pass to Surv(), though a numeric variable will work in simple cases (0 = alive, 1 = dead; also FALSE = alive, TRUE = dead). Using Surv() also allows easy passing of transformations of lex.Xst, e.g. Surv(FUT, lex.Xst %in% 1:2).

-

The argument breaks must be a named list of breaks by which to split the Lexis data (see ?splitMulti). It is mandatory to assign breaks at least to the survival time scale ("FUT" in our example) so that survtab knows what intervals to use to estimate the requested survival time function(s). The breaks also determine the window used: It is therefore easy to compute so called period estimates by defining the roof and floor along the calendar time scale, e.g.

-

breaks = list(FUT = seq(0, 5, 1/12), CAL = c(2000, 2005))

-

would cause survtab to compute period estimates for 2000-2004 (breaks given here as fractional years, so 2005 is effectively 2004.99999…).

-
-

Relative/net survival

-

Relative/net survival estimation requires knowledge of the expected hazard levels for the individuals in the data. In survtab this is accomplished by passing a long-formt data.frame of population hazards via the pophaz argument. E.g. the popmort dataset included in popEpi (Finnish overall mortality rates for men and women).

-
data(popmort)
-pm <- data.frame(popmort)
-names(pm) <- c("sex", "CAL", "AGE", "haz")
-head(pm)
-
##   sex  CAL AGE         haz
-## 1   0 1951   0 0.036363176
-## 2   0 1951   1 0.003616547
-## 3   0 1951   2 0.002172384
-## 4   0 1951   3 0.001581249
-## 5   0 1951   4 0.001180690
-## 6   0 1951   5 0.001070595
-

The data.frame should contain a variable named "haz" indicating the population hazard at the level of one subject-year. Any other variables are considered to be variables, by which to merge population hazards to the (split) subject-level data within survtab. These merging variables may correspond to the time scales in the used Lexis object. This allows for e.g. merging in different population hazards for the same subject as they get older.

-

The following causes survtab to estimate EdererII relative survival:

-
st.e2 <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, 
-                 surv.type = "surv.rel", relsurv.method = "e2",
-                 breaks = list(FUT = seq(0, 5, 1/12)),
-                 pophaz = pm)
-
plot(st.e2, y = "r.e2", col = c("blue", "red"))
-

-

Note that the curves diverge due to merging in the “wrong” population hazards for some individuals which we randomized earlier to be male though all the individuals in data are actually female. Pohar-Perme-weighted estimates can be computed by

-
st.pp <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, 
-                 surv.type = "surv.rel", relsurv.method = "pp",
-                 breaks = list(FUT = seq(0, 5, 1/12)),
-                 pophaz = pm)
-

Compare with EdererII estimates:

-
plot(st.e2, y = "r.e2", col = c("blue", "red"), lty = 1)
-lines(st.pp, y = "r.pp", col = c("blue", "red"), lty = 2)
-

-
-
-

Adjusting estimates

-

survtab also allows for adjusting the survival curves by categorical variables — typically by age groups. The following demonstrates how:

-
## an age group variable
-x$agegr <- cut(x$dg_age, c(0, 60, 70, 80, Inf), right = FALSE)
-
-## using "internal weights" - see ?ICSS for international weights standards
-w <- table(x$agegr)
-w
-
## 
-##   [0,60)  [60,70)  [70,80) [80,Inf) 
-##     1781     1889     2428     2129
-
w <- list(agegr = as.numeric(w))
-
st.as <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex + adjust(agegr), 
-                 data = x, weights = w,
-                 surv.type = "surv.rel", relsurv.method = "e2",
-                 breaks = list(FUT = seq(0, 5, 1/12)),
-                 pophaz = pm)
-
plot(st.as, y = "r.e2.as", col = c("blue", "red"))
-

-

We now have age-adjusted EdererII relative/net survival estimates. The weights argument allows for either a list of weigths (with one or multiple variables to adjust by) or a data.frame of weights. Examples:

-
list(sex = c(0.4, 0.6), agegr = c(0.2, 0.2, 0.4, 0.2))
-
## $sex
-## [1] 0.4 0.6
-## 
-## $agegr
-## [1] 0.2 0.2 0.4 0.2
-
wdf <- merge(0:1, 1:4)
-names(wdf) <- c("sex", "agegr")
-wdf$weights <- c(0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.1, 0.1)
-wdf
-
##   sex agegr weights
-## 1   0     1     0.1
-## 2   1     1     0.1
-## 3   0     2     0.1
-## 4   1     2     0.1
-## 5   0     3     0.2
-## 6   1     3     0.2
-## 7   0     4     0.1
-## 8   1     4     0.1
-

The weights do not have to sum to one when supplied as they are internally forced to do so within each stratum. In the data.frame of weights, the column of actual weights to use must be named “weights”. When there are more than one variable to adjust by, and a list of weights has been supplied, the variable-specific weights are first multiplied together (cumulatively) and then scaled to sum to one.

-

This adjusting can be done to any survival time function that survtab (and survtab_ag) estimates. One can also supply adjusting variables via the adjust argument if convenient:

-
st.as <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, 
-                 adjust = "agegr",
-                 data = x, weights = w,
-                 surv.type = "surv.rel", relsurv.method = "e2",
-                 breaks = list(FUT = seq(0, 5, 1/12)),
-                 pophaz = pm)
-

Where adjust could also be adjust = agegr, adjust = list(agegr) or

-

adjust = list(agegr = cut(dg_age, c(0, 60, 70, 80, Inf), right = FALSE))

-

for exactly the same results. When adjusting by multiple variables, one must supply a vector of variable names in data or a list of multiple elements (as in the base function aggregate).

-
-
-

Other survival time functions

-

One can also estimate cause-specific survival functions, cumulative incidence functions (CIFs, a.k.a. crude risk a.k.a. absolute risk functions), and CIFs based on the excess numbers of events. Cause-specific survival is close to net survival as they are philosophically highly similar concepts:

-
st.ca <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, 
-                 data = x, 
-                 surv.type = "surv.cause",
-                 breaks = list(FUT = seq(0, 5, 1/12)))
-
-st.pp <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, data = x, 
-                 surv.type = "surv.rel", relsurv.method = "pp",
-                 breaks = list(FUT = seq(0, 5, 1/12)),
-                 pophaz = pm)
-
-plot(st.ca, y = "surv.obs.canD", col = "blue")
-lines(st.pp, y = "r.pp", col = "red")
-

-

Absolute risk:

-
st.cif <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, 
-                  data = x, 
-                  surv.type = "cif.obs",
-                  breaks = list(FUT = seq(0, 5, 1/12)))
-
-plot(st.cif, y = "CIF_canD", conf.int = FALSE)
-lines(st.cif, y = "CIF_othD", conf.int = FALSE, col = "red")
-

-

The “relative CIF” attempts to be close to the true CIF without using knowledge about the types of events, e.g. causes of death:

-
st.cir <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, 
-                  data = x, 
-                  surv.type = "cif.rel",
-                  breaks = list(FUT = seq(0, 5, 1/12)),
-                  pophaz = pm)
-plot(st.cif, y = "CIF_canD", conf.int = FALSE, col = "blue")
-lines(st.cir, y = "CIF.rel", conf.int = FALSE, col = "red")
-

-
-
-
-

Using survtab_ag

-

Arguments concerning the types and methods of estimating of survival time functions work the same in survtab_ag as in survtab (the latter uses the former). However, with aggregated data one must explicitly supply the various count and person-time variables. Also, usage of the formula argument is different.

-

For demonstration purposes we form an aggregated data set using lexpand; see ?lexpand for more information on that function.

-
sire$sex <- rbinom(nrow(sire), size = 1, prob = 0.5)
-ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date",
-              status = "status", breaks = list(fot = seq(0, 5, 1/12)), 
-              aggre = list(sex, fot))
-
## dropped 16 rows where entry == exit
-
head(ag)
-
##    sex        fot     pyrs at.risk from0to0 from0to1 from0to2
-## 1:   0 0.00000000 336.9061    4127       12      150       12
-## 2:   0 0.08333333 323.6267    3953       24       94       10
-## 3:   0 0.16666667 313.9685    3825       20       88       17
-## 4:   0 0.25000000 304.8121    3700       19       55       12
-## 5:   0 0.33333333 297.3247    3614       16       69       10
-## 6:   0 0.41666667 290.5048    3519       14       52        9
-

Now simply do:

-
st <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.obs",
-                 surv.method = "hazard",
-                 d = c("from0to1", "from0to2"), pyrs = "pyrs")
-

Or:

-
st <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.obs",
-                 surv.method = "lifetable",
-                 d = c("from0to1", "from0to2"), n = "at.risk",
-                 n.cens = "from0to0")
-

Note that e.g. argument d could also have been supplied as

-

list(from0to1, from0to2)

-

or

-

list(canD = from0to1, othD = from0to2)

-

for identical results. The last is convenient for e.g. surv.cause computations:

-
st.ca <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.cause",
-                    surv.method = "hazard",
-                    d = list(canD = from0to1, othD = from0to2), pyrs = "pyrs")
-plot(st.ca, y = "surv.obs.canD", col = c("blue", "red"))
-

-

One has to supply the most variables when computing Pohar-Perme estimates (though it is probably rare to have third-source aggregated data with Pohar-Perme weighted figures, it is implemented here to be used as a workhorse for survtab). For this we must aggregate again to get the Pohar-Perme weighted counts and subject-times:

-
ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date",
-              status = "status", breaks = list(fot = seq(0, 5, 1/12)), 
-              pophaz = popmort, pp = TRUE,
-              aggre = list(sex, fot))
-
## dropped 16 rows where entry == exit
-
## NOTE: 83 rows in split data had values of 'age' higher than max of pophaz's 'agegroup'; the hazard values at 'agegroup' == 100 were used for these
-
st.pp <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.rel",
-                    surv.method = "hazard", relsurv.method = "pp",
-                    d = list(from0to1 + from0to2), pyrs = "pyrs",
-                    d.pp = list(from0to1.pp + from0to2.pp),
-                    d.pp.2 = list(from0to1.pp.2 + from0to2.pp.2),
-                    pyrs.pp = "ptime.pp", d.exp.pp = "d.exp.pp")
-plot(st.pp, y = "r.pp", col = c("blue", "red"))
-

-

Here it is best to supply only one column to each argument since Pohar-Perme estimates will not be computed for several types of events at the same time.

-
- - - - -
- - - - - - - - + + + + + + + + + + + + + + + +Examples of using survtab + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + +
+

Overview

+

This vignette aims to clarify the usage of the survtab_ag and survtab functions included in this package. survtab_ag estimates various survival functions and cumulative incidence functions (CIFs) non-parametrically using aggregated data, and survtab is a wrapper for survtab_ag, to which Lexis data is supplied.

+

Two methods (surv.method) are currently supported: The "lifetable" (actuarial) method only makes use of counts when estimating any of the supported survival time functions. The default method ("hazard"}) estimates appropriate hazards and transforms them into survival function or CIF estimates.

+

For relative survival estimation we need also to enumerate the expected hazard levels for the subjects in the data. This is done by merging expected hazards to individuals’ subintervals (which divide their survival time lines to a number of small intervals). For Pohar-Perme-weighted analyses one must additionally compute various weighted figures at the level of split subject data.

+

If one has subject-level data, the simplest way of computing survival function estimates with popEpi is by defining a Lexis object and using survtab, which will do the rest. For pre-aggregated data one may use the survtab_ag function instead. One can also use the lexpand function to split, merge population hazards, and aggregate in a single function call and then use survtab_ag if that is convenient.

+
+
+

Using survtab

+

It is straightforward to estimate various survival time functions with survtab once a Lexis object has been defined (see ?Lexis in package Epi for details):

+
library(popEpi)
+library(Epi)
+library(survival)
+
data(sire)
+
+## NOTE: recommended to use factor status variable
+x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), 
+           exit = list(CAL = get.yrs(ex_date)), 
+           data = sire[sire$dg_date < sire$ex_date, ],
+           exit.status = factor(status, levels = 0:2, 
+                                labels = c("alive", "canD", "othD")), 
+           merge = TRUE)
+
## NOTE: entry.status has been set to "alive" for all.
+
## pretend some are male
+set.seed(1L)
+x$sex <- rbinom(nrow(x), 1, 0.5)
+
+## observed survival - explicit method
+st <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, 
+              surv.type = "surv.obs",
+              breaks = list(FUT = seq(0, 5, 1/12)))
+
+## observed survival - easy method (assumes lex.Xst in x is the status variable)
+st <- survtab(FUT ~ sex, data = x, 
+              surv.type = "surv.obs",
+              breaks = list(FUT = seq(0, 5, 1/12)))
+
+## printing gives the used settings and 
+## estimates at the middle and end of the estimated
+## curves; more information available using summary()
+st
+
## 
+## Call: 
+##  survtab(formula = FUT ~ sex, data = x, breaks = list(FUT = seq(0, 5, 1/12)), surv.type = "surv.obs") 
+## 
+## Type arguments: 
+##  surv.type: surv.obs --- surv.method: hazard
+##  
+## Confidence interval arguments: 
+##  level: 95 % --- transformation: log-log
+##  
+## Totals:
+##  person-time:23993 --- events: 3636
+##  
+## Stratified by: 'sex'
+##    sex Tstop surv.obs.lo surv.obs surv.obs.hi SE.surv.obs
+## 1:   0   2.5      0.6174   0.6328      0.6478    0.007751
+## 2:   0   5.0      0.4962   0.5126      0.5288    0.008321
+## 3:   1   2.5      0.6235   0.6389      0.6539    0.007748
+## 4:   1   5.0      0.5006   0.5171      0.5334    0.008370
+

Plotting by strata (men = blue, women = red):

+
plot(st, col = c("blue", "red"))
+

+

Note that the correct usage of the formula argument in survtab specifies the time scale in the Lexis object over which survival is computed (here "FUT" for follow-up time). This is used to identify the appropriate time scale in the data. When only supplying the survival time scale as the right-hand-side of the formula, the column lex.Xst in the supplied Lexis object is assumed to be the (correctly formatted!) status variable. When using Surv() to be explicit, we effectively (and exceptionally) pass the starting times to the time argument in Surv(), and time2 is ignored entirely. The function will fail if time does not match exactly with a time scale in data.

+

When using Surv(), one must also pass the status variable, which can be something other than the lex.Xst variable created by Lexis(), though usually `lex.Xst is what you want to use (especially if the data has already been split using e.g. splitLexis or splitMulti, which is allowed). It is recommended to use a factor status variable to pass to Surv(), though a numeric variable will work in simple cases (0 = alive, 1 = dead; also FALSE = alive, TRUE = dead). Using Surv() also allows easy passing of transformations of lex.Xst, e.g. Surv(FUT, lex.Xst %in% 1:2).

+

The argument breaks must be a named list of breaks by which to split the Lexis data (see ?splitMulti). It is mandatory to assign breaks at least to the survival time scale ("FUT" in our example) so that survtab knows what intervals to use to estimate the requested survival time function(s). The breaks also determine the window used: It is therefore easy to compute so called period estimates by defining the roof and floor along the calendar time scale, e.g.

+

breaks = list(FUT = seq(0, 5, 1/12), CAL = c(2000, 2005))

+

would cause survtab to compute period estimates for 2000-2004 (breaks given here as fractional years, so 2005 is effectively 2004.99999…).

+
+

Relative/net survival

+

Relative/net survival estimation requires knowledge of the expected hazard levels for the individuals in the data. In survtab this is accomplished by passing a long-formt data.frame of population hazards via the pophaz argument. E.g. the popmort dataset included in popEpi (Finnish overall mortality rates for men and women).

+
data(popmort)
+pm <- data.frame(popmort)
+names(pm) <- c("sex", "CAL", "AGE", "haz")
+head(pm)
+
##   sex  CAL AGE         haz
+## 1   0 1951   0 0.036363176
+## 2   0 1951   1 0.003616547
+## 3   0 1951   2 0.002172384
+## 4   0 1951   3 0.001581249
+## 5   0 1951   4 0.001180690
+## 6   0 1951   5 0.001070595
+

The data.frame should contain a variable named "haz" indicating the population hazard at the level of one subject-year. Any other variables are considered to be variables, by which to merge population hazards to the (split) subject-level data within survtab. These merging variables may correspond to the time scales in the used Lexis object. This allows for e.g. merging in different population hazards for the same subject as they get older.

+

The following causes survtab to estimate EdererII relative survival:

+
st.e2 <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, 
+                 surv.type = "surv.rel", relsurv.method = "e2",
+                 breaks = list(FUT = seq(0, 5, 1/12)),
+                 pophaz = pm)
+
plot(st.e2, y = "r.e2", col = c("blue", "red"))
+

+

Note that the curves diverge due to merging in the “wrong” population hazards for some individuals which we randomized earlier to be male though all the individuals in data are actually female. Pohar-Perme-weighted estimates can be computed by

+
st.pp <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, 
+                 surv.type = "surv.rel", relsurv.method = "pp",
+                 breaks = list(FUT = seq(0, 5, 1/12)),
+                 pophaz = pm)
+

Compare with EdererII estimates:

+
plot(st.e2, y = "r.e2", col = c("blue", "red"), lty = 1)
+lines(st.pp, y = "r.pp", col = c("blue", "red"), lty = 2)
+

+
+
+

Adjusting estimates

+

survtab also allows for adjusting the survival curves by categorical variables — typically by age groups. The following demonstrates how:

+
## an age group variable
+x$agegr <- cut(x$dg_age, c(0, 60, 70, 80, Inf), right = FALSE)
+
+## using "internal weights" - see ?ICSS for international weights standards
+w <- table(x$agegr)
+w
+
## 
+##   [0,60)  [60,70)  [70,80) [80,Inf) 
+##     1781     1889     2428     2129
+
w <- list(agegr = as.numeric(w))
+
st.as <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex + adjust(agegr), 
+                 data = x, weights = w,
+                 surv.type = "surv.rel", relsurv.method = "e2",
+                 breaks = list(FUT = seq(0, 5, 1/12)),
+                 pophaz = pm)
+
plot(st.as, y = "r.e2.as", col = c("blue", "red"))
+

+

We now have age-adjusted EdererII relative/net survival estimates. The weights argument allows for either a list of weigths (with one or multiple variables to adjust by) or a data.frame of weights. Examples:

+
list(sex = c(0.4, 0.6), agegr = c(0.2, 0.2, 0.4, 0.2))
+
## $sex
+## [1] 0.4 0.6
+## 
+## $agegr
+## [1] 0.2 0.2 0.4 0.2
+
wdf <- merge(0:1, 1:4)
+names(wdf) <- c("sex", "agegr")
+wdf$weights <- c(0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.1, 0.1)
+wdf
+
##   sex agegr weights
+## 1   0     1     0.1
+## 2   1     1     0.1
+## 3   0     2     0.1
+## 4   1     2     0.1
+## 5   0     3     0.2
+## 6   1     3     0.2
+## 7   0     4     0.1
+## 8   1     4     0.1
+

The weights do not have to sum to one when supplied as they are internally forced to do so within each stratum. In the data.frame of weights, the column of actual weights to use must be named “weights”. When there are more than one variable to adjust by, and a list of weights has been supplied, the variable-specific weights are first multiplied together (cumulatively) and then scaled to sum to one.

+

This adjusting can be done to any survival time function that survtab (and survtab_ag) estimates. One can also supply adjusting variables via the adjust argument if convenient:

+
st.as <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, 
+                 adjust = "agegr",
+                 data = x, weights = w,
+                 surv.type = "surv.rel", relsurv.method = "e2",
+                 breaks = list(FUT = seq(0, 5, 1/12)),
+                 pophaz = pm)
+

Where adjust could also be adjust = agegr, adjust = list(agegr) or

+

adjust = list(agegr = cut(dg_age, c(0, 60, 70, 80, Inf), right = FALSE))

+

for exactly the same results. When adjusting by multiple variables, one must supply a vector of variable names in data or a list of multiple elements (as in the base function aggregate).

+
+
+

Other survival time functions

+

One can also estimate cause-specific survival functions, cumulative incidence functions (CIFs, a.k.a. crude risk a.k.a. absolute risk functions), and CIFs based on the excess numbers of events. Cause-specific survival is close to net survival as they are philosophically highly similar concepts:

+
st.ca <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, 
+                 data = x, 
+                 surv.type = "surv.cause",
+                 breaks = list(FUT = seq(0, 5, 1/12)))
+
+st.pp <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, data = x, 
+                 surv.type = "surv.rel", relsurv.method = "pp",
+                 breaks = list(FUT = seq(0, 5, 1/12)),
+                 pophaz = pm)
+
+plot(st.ca, y = "surv.obs.canD", col = "blue")
+lines(st.pp, y = "r.pp", col = "red")
+

+

Absolute risk:

+
st.cif <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, 
+                  data = x, 
+                  surv.type = "cif.obs",
+                  breaks = list(FUT = seq(0, 5, 1/12)))
+
+plot(st.cif, y = "CIF_canD", conf.int = FALSE)
+lines(st.cif, y = "CIF_othD", conf.int = FALSE, col = "red")
+

+

The “relative CIF” attempts to be close to the true CIF without using knowledge about the types of events, e.g. causes of death:

+
st.cir <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, 
+                  data = x, 
+                  surv.type = "cif.rel",
+                  breaks = list(FUT = seq(0, 5, 1/12)),
+                  pophaz = pm)
+plot(st.cif, y = "CIF_canD", conf.int = FALSE, col = "blue")
+lines(st.cir, y = "CIF.rel", conf.int = FALSE, col = "red")
+

+
+
+
+

Using survtab_ag

+

Arguments concerning the types and methods of estimating of survival time functions work the same in survtab_ag as in survtab (the latter uses the former). However, with aggregated data one must explicitly supply the various count and person-time variables. Also, usage of the formula argument is different.

+

For demonstration purposes we form an aggregated data set using lexpand; see ?lexpand for more information on that function.

+
sire$sex <- rbinom(nrow(sire), size = 1, prob = 0.5)
+ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date",
+              status = "status", breaks = list(fot = seq(0, 5, 1/12)), 
+              aggre = list(sex, fot))
+
## dropped 16 rows where entry == exit
+
head(ag)
+
##    sex        fot     pyrs at.risk from0to0 from0to1 from0to2
+## 1:   0 0.00000000 336.9067    4126       11      142       15
+## 2:   0 0.08333333 324.3659    3958       23       90       15
+## 3:   0 0.16666667 314.8792    3830       14       91       12
+## 4:   0 0.25000000 305.2256    3713       25       69        8
+## 5:   0 0.33333333 296.8944    3611       25       68        9
+## 6:   0 0.41666667 289.5745    3509       14       44       15
+

Now simply do:

+
st <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.obs",
+                 surv.method = "hazard",
+                 d = c("from0to1", "from0to2"), pyrs = "pyrs")
+

Or:

+
st <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.obs",
+                 surv.method = "lifetable",
+                 d = c("from0to1", "from0to2"), n = "at.risk",
+                 n.cens = "from0to0")
+

Note that e.g. argument d could also have been supplied as

+

list(from0to1, from0to2)

+

or

+

list(canD = from0to1, othD = from0to2)

+

for identical results. The last is convenient for e.g. surv.cause computations:

+
st.ca <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.cause",
+                    surv.method = "hazard",
+                    d = list(canD = from0to1, othD = from0to2), pyrs = "pyrs")
+plot(st.ca, y = "surv.obs.canD", col = c("blue", "red"))
+

+

One has to supply the most variables when computing Pohar-Perme estimates (though it is probably rare to have third-source aggregated data with Pohar-Perme weighted figures, it is implemented here to be used as a workhorse for survtab). For this we must aggregate again to get the Pohar-Perme weighted counts and subject-times:

+
ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date",
+              status = "status", breaks = list(fot = seq(0, 5, 1/12)), 
+              pophaz = popmort, pp = TRUE,
+              aggre = list(sex, fot))
+
## dropped 16 rows where entry == exit
+
## NOTE: 83 rows in split data had values of 'age' higher than max of pophaz's 'agegroup'; the hazard values at 'agegroup' == 100 were used for these
+
st.pp <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.rel",
+                    surv.method = "hazard", relsurv.method = "pp",
+                    d = list(from0to1 + from0to2), pyrs = "pyrs",
+                    d.pp = list(from0to1.pp + from0to2.pp),
+                    d.pp.2 = list(from0to1.pp.2 + from0to2.pp.2),
+                    pyrs.pp = "ptime.pp", d.exp.pp = "d.exp.pp")
+plot(st.pp, y = "r.pp", col = c("blue", "red"))
+

+

Here it is best to supply only one column to each argument since Pohar-Perme estimates will not be computed for several types of events at the same time.

+
+ + + + +
+ + + + + + + + diff --git a/man/ICSS.Rd b/man/ICSS.Rd index d2d3568..9709ceb 100644 --- a/man/ICSS.Rd +++ b/man/ICSS.Rd @@ -1,36 +1,36 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data_document.R -\name{ICSS} -\alias{ICSS} -\title{Age standardisation weights from the ICSS scheme.} -\format{data.table with columns -\itemize{ - \item age - lower bound of the age group - \item ICSS1 - first set of weights, sums to 100 000 - \item ICSS2 - second set of weights, sums to 100 000 - \item ICSS3 - third set of weights, sums to 100 000 -}} -\source{ -\href{http://seer.cancer.gov/stdpopulations/survival.html}{ICSS weights (US National Cancer Institute website)} - -Corazziari, Isabella, Mike Quinn, and Riccardo Capocaccia. "Standard cancer patient population for age standardising survival ratios." European Journal of Cancer 40.15 (2004): 2307-2316. -} -\description{ -Contains three sets age-standardisation weights for age-standardized survival (net, relative or observed). -} -\examples{ -## aggregate weights to a subset of age groups -data(ICSS) -cut <- c(0, 30, 50, 70, Inf) -agegr <- cut(ICSS$age, cut, right = FALSE) -aggregate(ICSS1~agegr, data = ICSS, FUN = sum) -} -\seealso{ -Other popEpi data: \code{\link{meanpop_fi}}, - \code{\link{popmort}}, \code{\link{sibr}}, - \code{\link{sire}}, \code{\link{stdpop101}}, - \code{\link{stdpop18}} - -Other weights: \code{\link{direct_standardization}}, - \code{\link{stdpop101}}, \code{\link{stdpop18}} -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_document.R +\name{ICSS} +\alias{ICSS} +\title{Age standardisation weights from the ICSS scheme.} +\format{data.table with columns +\itemize{ + \item age - lower bound of the age group + \item ICSS1 - first set of weights, sums to 100 000 + \item ICSS2 - second set of weights, sums to 100 000 + \item ICSS3 - third set of weights, sums to 100 000 +}} +\source{ +\href{http://seer.cancer.gov/stdpopulations/survival.html}{ICSS weights (US National Cancer Institute website)} + +Corazziari, Isabella, Mike Quinn, and Riccardo Capocaccia. "Standard cancer patient population for age standardising survival ratios." European Journal of Cancer 40.15 (2004): 2307-2316. +} +\description{ +Contains three sets age-standardisation weights for age-standardized survival (net, relative or observed). +} +\examples{ +## aggregate weights to a subset of age groups +data(ICSS) +cut <- c(0, 30, 50, 70, Inf) +agegr <- cut(ICSS$age, cut, right = FALSE) +aggregate(ICSS1~agegr, data = ICSS, FUN = sum) +} +\seealso{ +Other popEpi data: \code{\link{meanpop_fi}}, + \code{\link{popmort}}, \code{\link{sibr}}, + \code{\link{sire}}, \code{\link{stdpop101}}, + \code{\link{stdpop18}} + +Other weights: \code{\link{direct_standardization}}, + \code{\link{stdpop101}}, \code{\link{stdpop18}} +} diff --git a/man/Lexis_fpa.Rd b/man/Lexis_fpa.Rd index abda064..61a447c 100644 --- a/man/Lexis_fpa.Rd +++ b/man/Lexis_fpa.Rd @@ -1,82 +1,82 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utility_functions.R -\name{Lexis_fpa} -\alias{Lexis_fpa} -\title{Create a Lexis Object with Follow-up Time, Period, and Age -Time Scales} -\usage{ -Lexis_fpa(data, birth = NULL, entry = NULL, exit = NULL, - entry.status = NULL, exit.status = NULL, subset = NULL, ...) -} -\arguments{ -\item{data}{a \code{data.frame}; mandatory} - -\item{birth}{the time of birth; A character string naming the variable in -data or an expression to evaluate - see -\link[=flexible_argument]{Flexible input}} - -\item{entry}{the time at entry to follow-up; supplied the -same way as \code{birth}} - -\item{exit}{the time at exit from follow-up; supplied the -same way as \code{birth}} - -\item{entry.status}{passed on to \code{\link[Epi]{Lexis}} if not \code{NULL}; -supplied the same way as \code{birth}} - -\item{exit.status}{passed on to \code{\link[Epi]{Lexis}} if not \code{NULL}; -supplied the same way as \code{birth}} - -\item{subset}{a logical condition to subset by before passing data -and arguments to \code{\link[Epi]{Lexis}}} - -\item{...}{additional optional arguments passed on to -\code{\link[Epi]{Lexis}}} -} -\value{ -A \code{Lexis} object with the usual columns that \code{Lexis} objects -have, with time scale columns \code{fot}, \code{per}, and \code{age}. -They are calculated as - -\code{fot = entry - entry} (to ensure correct format, e.g. difftime) - -\code{per = entry} - -and - -\code{age = entry - birth} -} -\description{ -This is a simple wrapper around \code{\link[Epi]{Lexis}} for creating -a \code{Lexis} object with the time scales \code{fot}, \code{per}, -and \code{age}. -} -\examples{ - -data("sire", package = "popEpi") - -lex <- Lexis_fpa(sire, - birth = "bi_date", - entry = dg_date, - exit = ex_date + 1L, - exit.status = "status") - -## some special cases -myVar <- "bi_date" -l <- list(myVar = "bi_date") -sire$l <- sire$myVar <- 1 - -## conflict: myVar taken from data when "bi_date" was intended -lex <- Lexis_fpa(sire, - birth = myVar, - entry = dg_date, - exit = ex_date + 1L, - exit.status = "status") - -## no conflict with names in data -lex <- Lexis_fpa(sire, - birth = l$myVar, - entry = dg_date, - exit = ex_date + 1L, - exit.status = "status") -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utility_functions.R +\name{Lexis_fpa} +\alias{Lexis_fpa} +\title{Create a Lexis Object with Follow-up Time, Period, and Age +Time Scales} +\usage{ +Lexis_fpa(data, birth = NULL, entry = NULL, exit = NULL, + entry.status = NULL, exit.status = NULL, subset = NULL, ...) +} +\arguments{ +\item{data}{a \code{data.frame}; mandatory} + +\item{birth}{the time of birth; A character string naming the variable in +data or an expression to evaluate - see +\link[=flexible_argument]{Flexible input}} + +\item{entry}{the time at entry to follow-up; supplied the +same way as \code{birth}} + +\item{exit}{the time at exit from follow-up; supplied the +same way as \code{birth}} + +\item{entry.status}{passed on to \code{\link[Epi]{Lexis}} if not \code{NULL}; +supplied the same way as \code{birth}} + +\item{exit.status}{passed on to \code{\link[Epi]{Lexis}} if not \code{NULL}; +supplied the same way as \code{birth}} + +\item{subset}{a logical condition to subset by before passing data +and arguments to \code{\link[Epi]{Lexis}}} + +\item{...}{additional optional arguments passed on to +\code{\link[Epi]{Lexis}}} +} +\value{ +A \code{Lexis} object with the usual columns that \code{Lexis} objects +have, with time scale columns \code{fot}, \code{per}, and \code{age}. +They are calculated as + +\code{fot = entry - entry} (to ensure correct format, e.g. difftime) + +\code{per = entry} + +and + +\code{age = entry - birth} +} +\description{ +This is a simple wrapper around \code{\link[Epi]{Lexis}} for creating +a \code{Lexis} object with the time scales \code{fot}, \code{per}, +and \code{age}. +} +\examples{ + +data("sire", package = "popEpi") + +lex <- Lexis_fpa(sire, + birth = "bi_date", + entry = dg_date, + exit = ex_date + 1L, + exit.status = "status") + +## some special cases +myVar <- "bi_date" +l <- list(myVar = "bi_date") +sire$l <- sire$myVar <- 1 + +## conflict: myVar taken from data when "bi_date" was intended +lex <- Lexis_fpa(sire, + birth = myVar, + entry = dg_date, + exit = ex_date + 1L, + exit.status = "status") + +## no conflict with names in data +lex <- Lexis_fpa(sire, + birth = l$myVar, + entry = dg_date, + exit = ex_date + 1L, + exit.status = "status") +} diff --git a/man/RPL.Rd b/man/RPL.Rd index 9d4f60d..5f60a04 100644 --- a/man/RPL.Rd +++ b/man/RPL.Rd @@ -1,21 +1,21 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/relative_poisson_net_survival.R -\docType{data} -\name{RPL} -\alias{RPL} -\title{Relative Poisson family object} -\format{A list very similar to that created by \code{poisson()}.} -\usage{ -RPL -} -\description{ -A family object for GLM fitting of relative Poisson models -} -\seealso{ -Other relpois functions: \code{\link{relpois_ag}}, - \code{\link{relpois}}, \code{\link{rpcurve}} -} -\author{ -Karri Seppa -} -\keyword{datasets} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/relative_poisson_net_survival.R +\docType{data} +\name{RPL} +\alias{RPL} +\title{Relative Poisson family object} +\format{A list very similar to that created by \code{poisson()}.} +\usage{ +RPL +} +\description{ +A family object for GLM fitting of relative Poisson models +} +\seealso{ +Other relpois functions: \code{\link{relpois_ag}}, + \code{\link{relpois}}, \code{\link{rpcurve}} +} +\author{ +Karri Seppa +} +\keyword{datasets} diff --git a/man/adjust.Rd b/man/adjust.Rd index 8ff8267..36ff61f 100644 --- a/man/adjust.Rd +++ b/man/adjust.Rd @@ -1,24 +1,24 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/evaluation.R -\name{adjust} -\alias{adjust} -\title{Adjust Estimates by Categorical Variables} -\usage{ -adjust(...) -} -\arguments{ -\item{...}{variables to adjust by, e.g. \code{adjust(factor(v1), v2, v3)}} -} -\value{ -Returns a list of promises of the variables supplied which can be -evaluated. -} -\description{ -This function is only intended to be used within a formula -when supplied to e.g. \code{\link{survtab_ag}} and should not be -used elsewhere. -} -\examples{ - -y ~ x + adjust(z) -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/evaluation.R +\name{adjust} +\alias{adjust} +\title{Adjust Estimates by Categorical Variables} +\usage{ +adjust(...) +} +\arguments{ +\item{...}{variables to adjust by, e.g. \code{adjust(factor(v1), v2, v3)}} +} +\value{ +Returns a list of promises of the variables supplied which can be +evaluated. +} +\description{ +This function is only intended to be used within a formula +when supplied to e.g. \code{\link{survtab_ag}} and should not be +used elsewhere. +} +\examples{ + +y ~ x + adjust(z) +} diff --git a/man/aggre.Rd b/man/aggre.Rd index 1a816c4..7bef65e 100644 --- a/man/aggre.Rd +++ b/man/aggre.Rd @@ -1,185 +1,185 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/aggregating.R -\name{aggre} -\alias{aggre} -\title{Aggregation of split \code{Lexis} data} -\usage{ -aggre(lex, by = NULL, type = c("unique", "full"), sum.values = NULL, - subset = NULL, verbose = FALSE) -} -\arguments{ -\item{lex}{a \code{Lexis} object split with e.g. -\code{\link[Epi]{splitLexis}} or \code{\link{splitMulti}}} - -\item{by}{variables to tabulate (aggregate) by. -\link[=flexible_argument]{Flexible input}, typically e.g. -\code{by = c("V1", "V2")}. See Details and Examples.} - -\item{type}{determines output levels to which data is aggregated varying -from returning only rows with \code{pyrs > 0} (\code{"unique"}) to -returning all possible combinations of variables given in \code{aggre} even -if those combinations are not represented in data (\code{"full"}); -see Details} - -\item{sum.values}{optional: additional variables to sum by argument - \code{by}. \link[=flexible_argument]{Flexible input}, typically e.g. -\code{sum.values = c("V1", "V2")}} - -\item{subset}{a logical condition to subset by before computations; -e.g. \code{subset = area \%in\% c("A", "B")}} - -\item{verbose}{\code{logical}; if \code{TRUE}, the function returns timings -and some information useful for debugging along the aggregation process} -} -\value{ -A long \code{data.frame} or \code{data.table} of aggregated person-years -(\code{pyrs}), numbers of subjects at risk (\code{at.risk}), and events -formatted \code{fromXtoY}, where \code{X} and \code{X} are states -transitioning from and to or states at the end of each \code{lex.id}'s -follow-up (implying \code{X} = \code{Y}). Subjects at risk are computed -in the beginning of an interval defined by any Lexis time scales and -mentioned in \code{by}, but events occur at any point within an interval. - -When the data has been split along multiple time scales, the last -time scale mentioned in \code{by} is considered to be the survival time -scale with regard to computing events. Time lines cut short by the -extrema of non-survival-time-scales are considered to be censored -("transitions" from the current state to the current state). -} -\description{ -Aggregates a split \code{Lexis} object by given variables -and / or expressions into a long-format table of person-years and -transitions / end-points. Automatic aggregation over time scales -by which data has been split if the respective time scales are mentioned -in the aggregation argument to e.g. intervals of calendar time, follow-up time -and/or age. -} -\details{ -\strong{Basics} - -\code{aggre} is intended for aggregation of split \code{Lexis} data only. -See \code{\link[Epi]{Lexis}} for forming \code{Lexis} objects by hand -and e.g. \code{\link[Epi]{splitLexis}}, \code{\link{splitLexisDT}}, and -\code{\link{splitMulti}} for splitting the data. \code{\link{lexpand}} -may be used for simple data sets to do both steps as well as aggregation -in the same function call. - -Here aggregation refers to computing person-years and the appropriate events -(state transitions and end points in status) for the subjects in the data. -Hence, it computes e.g. deaths (end-point and state transition) and -censorings (end-point) as well as events in a multi-state setting -(state transitions). - -The result is a long-format \code{data.frame} or \code{data.table} -(depending on \code{options("popEpi.datatable")}; see \code{?popEpi}) -with the columns \code{pyrs} and the appropriate transitions named as -\code{fromXtoY}, e.g. \code{from0to0} and \code{from0to1} depending -on the values of \code{lex.Cst} and \code{lex.Xst}. - - -\strong{The by argument} - -The \code{by} argument determines the length of the table, i.e. -the combinations of variables to which data is aggregated. -\code{by} is relatively flexible, as it can be supplied as - -\itemize{ - \item{a character string vector, e.g. \code{c("sex", "area")}, - naming variables existing in \code{lex}} - \item{an expression, e.g. \code{factor(sex, 0:1, c("m", "f"))} - using any variable found in \code{lex}} - \item{a list (fully or partially named) of expressions, e.g. - \code{list(gender = factor(sex, 0:1, c("m", "f"), area)}} -} - -Note that expressions effectively allow a variable to be supplied simply as -e.g. \code{by = sex} (as a symbol/name in R lingo). - -The data is then aggregated to the levels of the given variables -or expression(s). Variables defined to be time scales in the supplied -\code{Lexis} are processed in a special way: If any are mentioned in the -\code{by} argument, intervals of them are formed based on the breaks -used to split the data: e.g. if \code{age} was split using the breaks -\code{c(0, 50, Inf)}, mentioning \code{age} in \code{by} leads to -creating the \code{age} intervals \code{[0, 50)} and \code{[50, Inf)} -and aggregating to them. The intervals are identified in the output -as the lower bounds of the appropriate intervals. - -The order of multiple time scales mentioned in \code{by} matters, -as the last mentioned time scale is assumed to be a survival time scale -for when computing event counts. E.g. when the data is split by the breaks -\code{list(FUT = 0:5, CAL = c(2008,2010))}, time lines cut short at -\code{CAL = 2010} are considered to be censored, but time lines cut short at -\code{FUT = 5} are not. See Return. - -\strong{Aggregation types (styles)} - -It is almost always enough to aggregate the data to variable levels -that are actually represented in the data -(default \code{aggre = "unique"}; alias \code{"non-empty"}). -For certain uses it may be useful -to have also "empty" levels represented (resulting in some rows in output -with zero person-years and events); in these cases supplying -\code{aggre = "full"} (alias \code{"cartesian"}) causes \code{aggre} -to determine the Cartesian product of all the levels of the supplied -\code{by} variables or expressions and aggregate to them. As an example -of a Cartesian product, try - -\code{merge(1:2, 1:5)}. -} -\examples{ - -## form a Lexis object -library(Epi) -data(sibr) -x <- sibr[1:10,] -x[1:5,]$sex <- 0 ## pretend some are male -x <- Lexis(data = x, - entry = list(AGE = dg_age, CAL = get.yrs(dg_date)), - exit = list(CAL = get.yrs(ex_date)), - entry.status=0, exit.status = status) -x <- splitMulti(x, breaks = list(CAL = seq(1993, 2013, 5), - AGE = seq(0, 100, 50))) - -## these produce the same results (with differing ways of determining aggre) -a1 <- aggre(x, by = list(gender = factor(sex, 0:1, c("m", "f")), - agegroup = AGE, period = CAL)) - -a2 <- aggre(x, by = c("sex", "AGE", "CAL")) - -a3 <- aggre(x, by = list(sex, agegroup = AGE, CAL)) - -## returning also empty levels -a4 <- aggre(x, by = c("sex", "AGE", "CAL"), type = "full") - -## computing also expected numbers of cases -x <- lexpand(sibr[1:10,], birth = bi_date, entry = dg_date, - exit = ex_date, status = status \%in\% 1:2, - pophaz = popmort, fot = 0:5, age = c(0, 50, 100)) -x$d.exp <- with(x, lex.dur*pop.haz) -## these produce the same result -a5 <- aggre(x, by = c("sex", "age", "fot"), sum.values = list(d.exp)) -a5 <- aggre(x, by = c("sex", "age", "fot"), sum.values = "d.exp") -a5 <- aggre(x, by = c("sex", "age", "fot"), sum.values = d.exp) -## same result here with custom name -a5 <- aggre(x, by = c("sex", "age", "fot"), - sum.values = list(expCases = d.exp)) - -## computing pohar-perme weighted figures -x$d.exp.pp <- with(x, lex.dur*pop.haz*pp) -a6 <- aggre(x, by = c("sex", "age", "fot"), - sum.values = c("d.exp", "d.exp.pp")) -## or equivalently e.g. sum.values = list(expCases = d.exp, expCases.p = d.exp.pp). -} -\seealso{ -\code{\link{aggregate}} for a similar base R solution, -and \code{\link{ltable}} for a \code{data.table} based aggregator. Neither -are directly applicable to split \code{Lexis} data. - -Other aggregation functions: \code{\link{as.aggre}}, - \code{\link{lexpand}}, \code{\link{setaggre}}, - \code{\link{summary.aggre}} -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/aggregating.R +\name{aggre} +\alias{aggre} +\title{Aggregation of split \code{Lexis} data} +\usage{ +aggre(lex, by = NULL, type = c("unique", "full"), sum.values = NULL, + subset = NULL, verbose = FALSE) +} +\arguments{ +\item{lex}{a \code{Lexis} object split with e.g. +\code{\link[Epi]{splitLexis}} or \code{\link{splitMulti}}} + +\item{by}{variables to tabulate (aggregate) by. +\link[=flexible_argument]{Flexible input}, typically e.g. +\code{by = c("V1", "V2")}. See Details and Examples.} + +\item{type}{determines output levels to which data is aggregated varying +from returning only rows with \code{pyrs > 0} (\code{"unique"}) to +returning all possible combinations of variables given in \code{aggre} even +if those combinations are not represented in data (\code{"full"}); +see Details} + +\item{sum.values}{optional: additional variables to sum by argument + \code{by}. \link[=flexible_argument]{Flexible input}, typically e.g. +\code{sum.values = c("V1", "V2")}} + +\item{subset}{a logical condition to subset by before computations; +e.g. \code{subset = area \%in\% c("A", "B")}} + +\item{verbose}{\code{logical}; if \code{TRUE}, the function returns timings +and some information useful for debugging along the aggregation process} +} +\value{ +A long \code{data.frame} or \code{data.table} of aggregated person-years +(\code{pyrs}), numbers of subjects at risk (\code{at.risk}), and events +formatted \code{fromXtoY}, where \code{X} and \code{X} are states +transitioning from and to or states at the end of each \code{lex.id}'s +follow-up (implying \code{X} = \code{Y}). Subjects at risk are computed +in the beginning of an interval defined by any Lexis time scales and +mentioned in \code{by}, but events occur at any point within an interval. + +When the data has been split along multiple time scales, the last +time scale mentioned in \code{by} is considered to be the survival time +scale with regard to computing events. Time lines cut short by the +extrema of non-survival-time-scales are considered to be censored +("transitions" from the current state to the current state). +} +\description{ +Aggregates a split \code{Lexis} object by given variables +and / or expressions into a long-format table of person-years and +transitions / end-points. Automatic aggregation over time scales +by which data has been split if the respective time scales are mentioned +in the aggregation argument to e.g. intervals of calendar time, follow-up time +and/or age. +} +\details{ +\strong{Basics} + +\code{aggre} is intended for aggregation of split \code{Lexis} data only. +See \code{\link[Epi]{Lexis}} for forming \code{Lexis} objects by hand +and e.g. \code{\link[Epi]{splitLexis}}, \code{\link{splitLexisDT}}, and +\code{\link{splitMulti}} for splitting the data. \code{\link{lexpand}} +may be used for simple data sets to do both steps as well as aggregation +in the same function call. + +Here aggregation refers to computing person-years and the appropriate events +(state transitions and end points in status) for the subjects in the data. +Hence, it computes e.g. deaths (end-point and state transition) and +censorings (end-point) as well as events in a multi-state setting +(state transitions). + +The result is a long-format \code{data.frame} or \code{data.table} +(depending on \code{options("popEpi.datatable")}; see \code{?popEpi}) +with the columns \code{pyrs} and the appropriate transitions named as +\code{fromXtoY}, e.g. \code{from0to0} and \code{from0to1} depending +on the values of \code{lex.Cst} and \code{lex.Xst}. + + +\strong{The by argument} + +The \code{by} argument determines the length of the table, i.e. +the combinations of variables to which data is aggregated. +\code{by} is relatively flexible, as it can be supplied as + +\itemize{ + \item{a character string vector, e.g. \code{c("sex", "area")}, + naming variables existing in \code{lex}} + \item{an expression, e.g. \code{factor(sex, 0:1, c("m", "f"))} + using any variable found in \code{lex}} + \item{a list (fully or partially named) of expressions, e.g. + \code{list(gender = factor(sex, 0:1, c("m", "f"), area)}} +} + +Note that expressions effectively allow a variable to be supplied simply as +e.g. \code{by = sex} (as a symbol/name in R lingo). + +The data is then aggregated to the levels of the given variables +or expression(s). Variables defined to be time scales in the supplied +\code{Lexis} are processed in a special way: If any are mentioned in the +\code{by} argument, intervals of them are formed based on the breaks +used to split the data: e.g. if \code{age} was split using the breaks +\code{c(0, 50, Inf)}, mentioning \code{age} in \code{by} leads to +creating the \code{age} intervals \code{[0, 50)} and \code{[50, Inf)} +and aggregating to them. The intervals are identified in the output +as the lower bounds of the appropriate intervals. + +The order of multiple time scales mentioned in \code{by} matters, +as the last mentioned time scale is assumed to be a survival time scale +for when computing event counts. E.g. when the data is split by the breaks +\code{list(FUT = 0:5, CAL = c(2008,2010))}, time lines cut short at +\code{CAL = 2010} are considered to be censored, but time lines cut short at +\code{FUT = 5} are not. See Return. + +\strong{Aggregation types (styles)} + +It is almost always enough to aggregate the data to variable levels +that are actually represented in the data +(default \code{aggre = "unique"}; alias \code{"non-empty"}). +For certain uses it may be useful +to have also "empty" levels represented (resulting in some rows in output +with zero person-years and events); in these cases supplying +\code{aggre = "full"} (alias \code{"cartesian"}) causes \code{aggre} +to determine the Cartesian product of all the levels of the supplied +\code{by} variables or expressions and aggregate to them. As an example +of a Cartesian product, try + +\code{merge(1:2, 1:5)}. +} +\examples{ + +## form a Lexis object +library(Epi) +data(sibr) +x <- sibr[1:10,] +x[1:5,]$sex <- 0 ## pretend some are male +x <- Lexis(data = x, + entry = list(AGE = dg_age, CAL = get.yrs(dg_date)), + exit = list(CAL = get.yrs(ex_date)), + entry.status=0, exit.status = status) +x <- splitMulti(x, breaks = list(CAL = seq(1993, 2013, 5), + AGE = seq(0, 100, 50))) + +## these produce the same results (with differing ways of determining aggre) +a1 <- aggre(x, by = list(gender = factor(sex, 0:1, c("m", "f")), + agegroup = AGE, period = CAL)) + +a2 <- aggre(x, by = c("sex", "AGE", "CAL")) + +a3 <- aggre(x, by = list(sex, agegroup = AGE, CAL)) + +## returning also empty levels +a4 <- aggre(x, by = c("sex", "AGE", "CAL"), type = "full") + +## computing also expected numbers of cases +x <- lexpand(sibr[1:10,], birth = bi_date, entry = dg_date, + exit = ex_date, status = status \%in\% 1:2, + pophaz = popmort, fot = 0:5, age = c(0, 50, 100)) +x$d.exp <- with(x, lex.dur*pop.haz) +## these produce the same result +a5 <- aggre(x, by = c("sex", "age", "fot"), sum.values = list(d.exp)) +a5 <- aggre(x, by = c("sex", "age", "fot"), sum.values = "d.exp") +a5 <- aggre(x, by = c("sex", "age", "fot"), sum.values = d.exp) +## same result here with custom name +a5 <- aggre(x, by = c("sex", "age", "fot"), + sum.values = list(expCases = d.exp)) + +## computing pohar-perme weighted figures +x$d.exp.pp <- with(x, lex.dur*pop.haz*pp) +a6 <- aggre(x, by = c("sex", "age", "fot"), + sum.values = c("d.exp", "d.exp.pp")) +## or equivalently e.g. sum.values = list(expCases = d.exp, expCases.p = d.exp.pp). +} +\seealso{ +\code{\link{aggregate}} for a similar base R solution, +and \code{\link{ltable}} for a \code{data.table} based aggregator. Neither +are directly applicable to split \code{Lexis} data. + +Other aggregation functions: \code{\link{as.aggre}}, + \code{\link{lexpand}}, \code{\link{setaggre}}, + \code{\link{summary.aggre}} +} +\author{ +Joonas Miettinen +} diff --git a/man/all_names_present.Rd b/man/all_names_present.Rd index 6e0c69b..cd5b593 100644 --- a/man/all_names_present.Rd +++ b/man/all_names_present.Rd @@ -1,31 +1,31 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utility_functions.R -\name{all_names_present} -\alias{all_names_present} -\title{Check if all names are present in given data} -\usage{ -all_names_present(data, var.names, stops = TRUE, msg = NULL) -} -\arguments{ -\item{data}{dataset where the variable names should be found} - -\item{var.names}{a character vector of variable names, e.g. -\code{c("var1", "var2")}} - -\item{stops}{logical, stop returns exception} - -\item{msg}{Custom message to return instead of default message. -Special: include \code{\%\%VARS\%\%} in message string and the missing -variable names will be inserted there (quoted, separated by comma, e.g. -\code{'var1'}, \code{'var2'} --- no leading or tracing white space).} -} -\description{ -Given a character vector, checks if all names are present in \code{names(data)}. -Throws error if \code{stops=TRUE}, else returns \code{FALSE} if some variable name is not present. -} -\seealso{ -\code{\link{robust_values}} -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utility_functions.R +\name{all_names_present} +\alias{all_names_present} +\title{Check if all names are present in given data} +\usage{ +all_names_present(data, var.names, stops = TRUE, msg = NULL) +} +\arguments{ +\item{data}{dataset where the variable names should be found} + +\item{var.names}{a character vector of variable names, e.g. +\code{c("var1", "var2")}} + +\item{stops}{logical, stop returns exception} + +\item{msg}{Custom message to return instead of default message. +Special: include \code{\%\%VARS\%\%} in message string and the missing +variable names will be inserted there (quoted, separated by comma, e.g. +\code{'var1'}, \code{'var2'} --- no leading or tracing white space).} +} +\description{ +Given a character vector, checks if all names are present in \code{names(data)}. +Throws error if \code{stops=TRUE}, else returns \code{FALSE} if some variable name is not present. +} +\seealso{ +\code{\link{robust_values}} +} +\author{ +Joonas Miettinen +} diff --git a/man/as.Date.yrs.Rd b/man/as.Date.yrs.Rd index 2111d37..0b470c1 100644 --- a/man/as.Date.yrs.Rd +++ b/man/as.Date.yrs.Rd @@ -1,42 +1,42 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/fractional_years.R -\name{as.Date.yrs} -\alias{as.Date.yrs} -\title{Coerce Fractional Year Values to Date Values} -\usage{ -\method{as.Date}{yrs}(x, ...) -} -\arguments{ -\item{x}{an \code{yrs} object created by \code{get.yrs}} - -\item{...}{unused, included for compatibility with other \code{as.Date} -methods} -} -\description{ -Coerces an \code{yrs} object to a \code{Date} object. -Some loss of information comes if \code{year.length = "approx"} -was set when using \code{\link{get.yrs}}, so the transformation back -to \code{Date} will not be perfect there. With \code{year.length = "actual"} -the original values are perfectly retrieved. -} -\examples{ -data("sire", package = "popEpi") - -## approximate year lengths: here 20 \% have an extra day added -sire$dg_yrs <- get.yrs(sire$dg_date) -summary(sire$dg_yrs) -dg_date2 <- as.Date(sire$dg_yrs) -summary(as.numeric(dg_date2 - sire$dg_date)) - -## using actual year lengths -sire$dg_yrs <- get.yrs(sire$dg_date, year.length = "actual") -summary(sire$dg_yrs) -dg_date2 <- as.Date(sire$dg_yrs) -summary(as.numeric(dg_date2 - sire$dg_date)) -} -\seealso{ -\code{\link{get.yrs}} -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fractional_years.R +\name{as.Date.yrs} +\alias{as.Date.yrs} +\title{Coerce Fractional Year Values to Date Values} +\usage{ +\method{as.Date}{yrs}(x, ...) +} +\arguments{ +\item{x}{an \code{yrs} object created by \code{get.yrs}} + +\item{...}{unused, included for compatibility with other \code{as.Date} +methods} +} +\description{ +Coerces an \code{yrs} object to a \code{Date} object. +Some loss of information comes if \code{year.length = "approx"} +was set when using \code{\link{get.yrs}}, so the transformation back +to \code{Date} will not be perfect there. With \code{year.length = "actual"} +the original values are perfectly retrieved. +} +\examples{ +data("sire", package = "popEpi") + +## approximate year lengths: here 20 \% have an extra day added +sire$dg_yrs <- get.yrs(sire$dg_date) +summary(sire$dg_yrs) +dg_date2 <- as.Date(sire$dg_yrs) +summary(as.numeric(dg_date2 - sire$dg_date)) + +## using actual year lengths +sire$dg_yrs <- get.yrs(sire$dg_date, year.length = "actual") +summary(sire$dg_yrs) +dg_date2 <- as.Date(sire$dg_yrs) +summary(as.numeric(dg_date2 - sire$dg_date)) +} +\seealso{ +\code{\link{get.yrs}} +} +\author{ +Joonas Miettinen +} diff --git a/man/as.aggre.Rd b/man/as.aggre.Rd index 50b6964..690a65e 100644 --- a/man/as.aggre.Rd +++ b/man/as.aggre.Rd @@ -1,76 +1,76 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/aggregating.R -\name{as.aggre} -\alias{as.aggre} -\alias{as.aggre.data.frame} -\alias{as.aggre.data.table} -\alias{as.aggre.default} -\title{Coercion to Class \code{aggre}} -\usage{ -as.aggre(x, values = NULL, by = NULL, breaks = NULL, ...) - -\method{as.aggre}{data.frame}(x, values = NULL, by = NULL, breaks = NULL, - ...) - -\method{as.aggre}{data.table}(x, values = NULL, by = NULL, breaks = NULL, - ...) - -\method{as.aggre}{default}(x, ...) -} -\arguments{ -\item{x}{a \code{data.frame} or \code{data.table}} - -\item{values}{a character string vector; the names of value variables} - -\item{by}{a character string vector; the names of variables by which -\code{values} have been tabulated} - -\item{breaks}{a list of breaks, where each element is a breaks vector -as usually passed to e.g. \code{\link{splitLexisDT}}. The list must be -fully named, with the names corresponding to time scales at the aggregate -level in your data. Every unique value in a time scale variable in data must -also exist in the corresponding vector in the breaks list.} - -\item{...}{arguments passed to or from methods} -} -\description{ -Coerces an R object to an \code{aggre} object, identifying -the object as one containing aggregated counts, person-years and other -information. -} -\section{Methods (by class)}{ -\itemize{ -\item \code{data.frame}: Coerces a \code{data.frame} to an \code{aggre} object - -\item \code{data.table}: Coerces a \code{data.table} to an \code{aggre} object - -\item \code{default}: Default method for \code{as.aggre} (stops computations -if no class-specific method found) -}} - -\examples{ -library("data.table") -df <- data.frame(sex = rep(c("male", "female"), each = 5), - obs = rpois(10, rep(7,5, each=5)), - pyrs = rpois(10, lambda = 10000)) -dt <- as.data.table(df) - -df <- as.aggre(df, values = c("pyrs", "obs"), by = "sex") -dt <- as.aggre(dt, values = c("pyrs", "obs"), by = "sex") - -class(df) -class(dt) - -BL <- list(fot = 0:5) -df <- data.frame(df) -df <- as.aggre(df, values = c("pyrs", "obs"), by = "sex", breaks = BL) - -} -\seealso{ -Other aggregation functions: \code{\link{aggre}}, - \code{\link{lexpand}}, \code{\link{setaggre}}, - \code{\link{summary.aggre}} -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/aggregating.R +\name{as.aggre} +\alias{as.aggre} +\alias{as.aggre.data.frame} +\alias{as.aggre.data.table} +\alias{as.aggre.default} +\title{Coercion to Class \code{aggre}} +\usage{ +as.aggre(x, values = NULL, by = NULL, breaks = NULL, ...) + +\method{as.aggre}{data.frame}(x, values = NULL, by = NULL, breaks = NULL, + ...) + +\method{as.aggre}{data.table}(x, values = NULL, by = NULL, breaks = NULL, + ...) + +\method{as.aggre}{default}(x, ...) +} +\arguments{ +\item{x}{a \code{data.frame} or \code{data.table}} + +\item{values}{a character string vector; the names of value variables} + +\item{by}{a character string vector; the names of variables by which +\code{values} have been tabulated} + +\item{breaks}{a list of breaks, where each element is a breaks vector +as usually passed to e.g. \code{\link{splitLexisDT}}. The list must be +fully named, with the names corresponding to time scales at the aggregate +level in your data. Every unique value in a time scale variable in data must +also exist in the corresponding vector in the breaks list.} + +\item{...}{arguments passed to or from methods} +} +\description{ +Coerces an R object to an \code{aggre} object, identifying +the object as one containing aggregated counts, person-years and other +information. +} +\section{Methods (by class)}{ +\itemize{ +\item \code{data.frame}: Coerces a \code{data.frame} to an \code{aggre} object + +\item \code{data.table}: Coerces a \code{data.table} to an \code{aggre} object + +\item \code{default}: Default method for \code{as.aggre} (stops computations +if no class-specific method found) +}} + +\examples{ +library("data.table") +df <- data.frame(sex = rep(c("male", "female"), each = 5), + obs = rpois(10, rep(7,5, each=5)), + pyrs = rpois(10, lambda = 10000)) +dt <- as.data.table(df) + +df <- as.aggre(df, values = c("pyrs", "obs"), by = "sex") +dt <- as.aggre(dt, values = c("pyrs", "obs"), by = "sex") + +class(df) +class(dt) + +BL <- list(fot = 0:5) +df <- data.frame(df) +df <- as.aggre(df, values = c("pyrs", "obs"), by = "sex", breaks = BL) + +} +\seealso{ +Other aggregation functions: \code{\link{aggre}}, + \code{\link{lexpand}}, \code{\link{setaggre}}, + \code{\link{summary.aggre}} +} +\author{ +Joonas Miettinen +} diff --git a/man/as.data.frame.ratetable.Rd b/man/as.data.frame.ratetable.Rd index 4db5026..493be59 100644 --- a/man/as.data.frame.ratetable.Rd +++ b/man/as.data.frame.ratetable.Rd @@ -1,33 +1,33 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utility_functions.R -\name{as.data.frame.ratetable} -\alias{as.data.frame.ratetable} -\title{Coerce a \code{ratetable} Object to Class \code{data.frame}} -\usage{ -\method{as.data.frame}{ratetable}(x, ...) -} -\arguments{ -\item{x}{a \code{ratetable}} - -\item{...}{unused but added for compatibility with \code{as.data.frame}} -} -\description{ -\code{ratatable} objects used in e.g. \pkg{survival} and \pkg{relsurv} -can be conveniently coerced to a long-format \code{data.frame}. -However, the names and levels of variables in the result -may not match names and levels of variables in your data. -} -\examples{ -if (requireNamespace("relsurv", quietly = TRUE)) { - data(slopop, package = "relsurv") - df <- as.data.frame(slopop) - head(df) -} -} -\seealso{ -\code{\link[survival]{ratetable}}, -\code{\link{as.data.table.ratetable}} -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utility_functions.R +\name{as.data.frame.ratetable} +\alias{as.data.frame.ratetable} +\title{Coerce a \code{ratetable} Object to Class \code{data.frame}} +\usage{ +\method{as.data.frame}{ratetable}(x, ...) +} +\arguments{ +\item{x}{a \code{ratetable}} + +\item{...}{unused but added for compatibility with \code{as.data.frame}} +} +\description{ +\code{ratatable} objects used in e.g. \pkg{survival} and \pkg{relsurv} +can be conveniently coerced to a long-format \code{data.frame}. +However, the names and levels of variables in the result +may not match names and levels of variables in your data. +} +\examples{ +if (requireNamespace("relsurv", quietly = TRUE)) { + data(slopop, package = "relsurv") + df <- as.data.frame(slopop) + head(df) +} +} +\seealso{ +\code{\link[survival]{ratetable}}, +\code{\link{as.data.table.ratetable}} +} +\author{ +Joonas Miettinen +} diff --git a/man/as.data.table.ratetable.Rd b/man/as.data.table.ratetable.Rd index fbf6346..2418f07 100644 --- a/man/as.data.table.ratetable.Rd +++ b/man/as.data.table.ratetable.Rd @@ -1,34 +1,34 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utility_functions.R -\name{as.data.table.ratetable} -\alias{as.data.table.ratetable} -\title{Coerce a \code{ratetable} Object to Class \code{data.table}} -\usage{ -\method{as.data.table}{ratetable}(x, ...) -} -\arguments{ -\item{x}{a \code{ratetable}} - -\item{...}{other arguments passed on to \code{as.data.table}} -} -\description{ -\code{ratatable} objects used in e.g. \pkg{survival} and \pkg{relsurv} -can be conveniently coerced to a long-format \code{data.frame}. -However, the names and levels of variables in the result -may not match names and levels of variables in your data. -} -\examples{ -if (requireNamespace("relsurv", quietly = TRUE)) { - library("data.table") - data(slopop, package = "relsurv") - dt <- as.data.table(slopop) - dt -} -} -\seealso{ -\code{\link[survival]{ratetable}}, -\code{\link{as.data.frame.ratetable}} -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utility_functions.R +\name{as.data.table.ratetable} +\alias{as.data.table.ratetable} +\title{Coerce a \code{ratetable} Object to Class \code{data.table}} +\usage{ +\method{as.data.table}{ratetable}(x, ...) +} +\arguments{ +\item{x}{a \code{ratetable}} + +\item{...}{other arguments passed on to \code{as.data.table}} +} +\description{ +\code{ratatable} objects used in e.g. \pkg{survival} and \pkg{relsurv} +can be conveniently coerced to a long-format \code{data.frame}. +However, the names and levels of variables in the result +may not match names and levels of variables in your data. +} +\examples{ +if (requireNamespace("relsurv", quietly = TRUE)) { + library("data.table") + data(slopop, package = "relsurv") + dt <- as.data.table(slopop) + dt +} +} +\seealso{ +\code{\link[survival]{ratetable}}, +\code{\link{as.data.frame.ratetable}} +} +\author{ +Joonas Miettinen +} diff --git a/man/cast_simple.Rd b/man/cast_simple.Rd index b632cb1..bcce0f1 100644 --- a/man/cast_simple.Rd +++ b/man/cast_simple.Rd @@ -1,50 +1,50 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utility_functions.R -\name{cast_simple} -\alias{cast_simple} -\title{Cast \code{data.table}/\code{data.frame} from long format to wide format} -\usage{ -cast_simple(data = NULL, columns = NULL, rows = NULL, values = NULL) -} -\arguments{ -\item{data}{a \code{data.table} or \code{data.frame}} - -\item{columns}{a character string vector; the (unique combinations of the) -levels of these variable will be different rows} - -\item{rows}{a character string vector; the (unique combinations of the) -levels of these variable will be different columns} - -\item{values}{a character string; the variable which will be represented -on rows and columns as specified by \code{columns} and \code{rows}} -} -\description{ -Convenience function for using \code{\link[data.table]{dcast.data.table}} -and \code{\link[reshape2]{dcast}}; -inputs are character strings (names of variables) instead of a formula. -} -\details{ -This function is just a small interface for \code{dcast} / -\code{dcast.data.table} and less flexible than the originals. - -Note that all \code{data.table} objects are also \code{data.frame} -objects, but that each have their own \code{dcast} method. -\code{\link[data.table]{dcast.data.table}} is faster. - -If any values in \code{value.vars} need to be -aggregated, they are aggregated using \code{sum}. -See \code{?dcast}. -} -\examples{ -\dontrun{ -## e.g. silly counts from a long-format table to a wide format -test <- copy(sire) -test$dg_y <- year(test$dg_date) -test$ex_y <- year(test$ex_date) -tab <- ltable(test, c("dg_y","ex_y")) -cast_simple(tab, columns='dg_y', rows="ex_y", values="obs") -} -} -\author{ -Matti Rantanen, Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utility_functions.R +\name{cast_simple} +\alias{cast_simple} +\title{Cast \code{data.table}/\code{data.frame} from long format to wide format} +\usage{ +cast_simple(data = NULL, columns = NULL, rows = NULL, values = NULL) +} +\arguments{ +\item{data}{a \code{data.table} or \code{data.frame}} + +\item{columns}{a character string vector; the (unique combinations of the) +levels of these variable will be different rows} + +\item{rows}{a character string vector; the (unique combinations of the) +levels of these variable will be different columns} + +\item{values}{a character string; the variable which will be represented +on rows and columns as specified by \code{columns} and \code{rows}} +} +\description{ +Convenience function for using \code{\link[data.table]{dcast.data.table}} +and \code{\link[reshape2]{dcast}}; +inputs are character strings (names of variables) instead of a formula. +} +\details{ +This function is just a small interface for \code{dcast} / +\code{dcast.data.table} and less flexible than the originals. + +Note that all \code{data.table} objects are also \code{data.frame} +objects, but that each have their own \code{dcast} method. +\code{\link[data.table]{dcast.data.table}} is faster. + +If any values in \code{value.vars} need to be +aggregated, they are aggregated using \code{sum}. +See \code{?dcast}. +} +\examples{ +\dontrun{ +## e.g. silly counts from a long-format table to a wide format +test <- copy(sire) +test$dg_y <- year(test$dg_date) +test$ex_y <- year(test$ex_date) +tab <- ltable(test, c("dg_y","ex_y")) +cast_simple(tab, columns='dg_y', rows="ex_y", values="obs") +} +} +\author{ +Matti Rantanen, Joonas Miettinen +} diff --git a/man/cut_bound.Rd b/man/cut_bound.Rd index ca4e7cd..4e207be 100644 --- a/man/cut_bound.Rd +++ b/man/cut_bound.Rd @@ -1,28 +1,28 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utility_functions.R -\name{cut_bound} -\alias{cut_bound} -\title{Change output values from cut(..., labels = NULL) output} -\usage{ -cut_bound(t, factor = TRUE) -} -\arguments{ -\item{t}{is a character vector of elements, e.g. "(20,60]"} - -\item{factor}{logical; TRUE returns informative character string, FALSE numeric (left value)} -} -\description{ -Selects lowest values of each factor after cut() based -on the assumption that the value starts from index 2 and end in comma ",". -} -\details{ -type = 'factor': "[50,52)" -> "50-51" OR "[50,51)" -> "50" - -type = 'numeric': lowest bound in numeric. -} -\examples{ -cut_bound("[1900, 1910)") ## "1900-1909" -} -\author{ -Matti Rantanen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utility_functions.R +\name{cut_bound} +\alias{cut_bound} +\title{Change output values from cut(..., labels = NULL) output} +\usage{ +cut_bound(t, factor = TRUE) +} +\arguments{ +\item{t}{is a character vector of elements, e.g. "(20,60]"} + +\item{factor}{logical; TRUE returns informative character string, FALSE numeric (left value)} +} +\description{ +Selects lowest values of each factor after cut() based +on the assumption that the value starts from index 2 and end in comma ",". +} +\details{ +type = 'factor': "[50,52)" -> "50-51" OR "[50,51)" -> "50" + +type = 'numeric': lowest bound in numeric. +} +\examples{ +cut_bound("[1900, 1910)") ## "1900-1909" +} +\author{ +Matti Rantanen +} diff --git a/man/direct_standardization.Rd b/man/direct_standardization.Rd index 4170d53..07fa16c 100644 --- a/man/direct_standardization.Rd +++ b/man/direct_standardization.Rd @@ -1,154 +1,154 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/direct_adjusting.R -\name{direct_standardization} -\alias{direct_standardization} -\alias{direct_adjusting} -\title{Direct Adjusting in \pkg{popEpi} Using Weights} -\description{ -Several functions in \pkg{popEpi} have support for direct standardization -of estimates. This document explains the usage of weighting with those -functions. -} -\details{ -Direct standardization is performed by computing estimates of -\code{E} -by the set of adjusting variables \code{A}, to which a set of weights -\code{W} is applicable. The weighted average over \code{A} is then the -direct-adjusted estimate of \code{E} (\code{E*}). - -To enable both quick and easy as well as more rigorous usage of direct -standardization with weights, the weights arguments in \pkg{popEpi} -can be supplied in several ways. Ability to use the different -ways depends on the number of adjusting variables. - -The weights are always handled internally to sum to 1, so they do not -need to be scaled in this manner when they are supplied. E.g. -counts of subjects in strata may be passed. -} -\section{Basic usage - one adjusting variable}{ - - -In the simple case where we are adjusting by only one variable -(e.g. by age group), one can simply supply a vector of weights: - -\code{FUN(weights = c(0.1, 0.25, 0.25, 0.2, 0.2))} - -which may be stored in advance: - -\code{w <- c(0.1, 0.25, 0.25, 0.2, 0.2)} - -\code{FUN(weights = w)} - -The order of the weights matters. \pkg{popEpi} functions with direct -adjusting enabled match the supplied weights to the adjusting variables -as follows: If the adjusting variable is a \code{factor}, the order -of the levels is used. Otherwise, the alphabetic order of the unique -values is used (try \code{sort} to see how it works). For clarity -and certainty we recommend using \code{factor} or \code{numeric} variables -when possible. \code{character} variables should be avoided: to see why, -try \code{sort(15:9)} and \code{sort(as.character(15:9))}. - -It is also possible to supply a \code{character} string corresponding -to one of the age group standardization schemes integrated into \pkg{popEpi}: - -\itemize{ -\item \code{'europe_1976_18of5'} - European std. population (1976), 18 age groups -\item \code{'nordic_2000_18of5'} - Nordic std. population (2000), 18 age groups -\item \code{'world_1966_18of5'} - world standard (1966), 18 age groups -\item \code{'world_2000_18of5'} - world standard (2000), 18 age groups -\item \code{'world_2000_20of5'} - world standard (2000), 20 age groups -\item \code{'world_2000_101of1'} - world standard (2000), 101 age groups -} - -Additionally, \code{\link{ICSS}} contains international weights used in -cancer survival analysis, but they are not currently usable by passing -a string to \code{weights} and must be supplied by hand. - -You may also supply \code{weights = "internal"} to use internally -computed weights, i.e. usually simply the counts of subjects / person-time -experienced in each stratum. E.g. - -\code{FUN(weights = "world_2000_18of5")} - -will use the world standard population from 2000 as -weights for 18 age groups, that your adjusting variable is -assumed to contain. The adjusting variable must be coded in this case as -a numeric variable containing \code{1:18} or as a \code{factor} with -18 levels (coded from the youngest to the oldest age group). -} - -\section{More than one adjusting variable}{ - - -In the case that you employ more than one adjusting variable, separate -weights should be passed to match to the levels of the different adjusting -variables. When supplied correctly, "grand" weights are formed based on -the variable-specific weights by multiplying over the variable-specific -weights (e.g. if men have \code{w = 0.5} and the age group 0-4 has -\code{w = 0.1}, the "grand" weight for men aged 0-4 is \code{0.5*0.1}). -The "grand" weights are then used for adjusting after ensuring they -sum to one. - -When using multiple adjusting variables, you -are allowed to pass either a named \code{list} of -weights or a \code{data.frame} of weights. E.g. - -\code{WL <- list(agegroup = age_w, sex = sex_w)} - -\code{FUN(weights = WL)} - -where \code{age_w} and \code{sex_w} are numeric vectors. Given the -conditions explained in the previous section are satisfied, you may also do -e.g. - -\code{WL <- list(agegroup = "world_2000_18of", sex = sex_w)} - -\code{FUN(weights = WL)} - -and the world standard pop is used as weights for the age groups as outlined -in the previous section. - -Sometimes using a \code{data.frame} can be clearer (and it is fool-proof -as well). To do this, form a \code{data.frame} that repeats the levels -of your adjusting variables by each level of every other adjusting variable, -and assign the weights as a column named \code{"weights"}. E.g. - -\code{wdf <- data.frame(sex = rep(0:1, each = 18), agegroup = rep(1:18, 2))} - -\code{wdf$weights <- rbinom(36, size = 100, prob = 0.25)} - -\code{FUN(weights = wdf)} - -If you want to use the counts of subjects in strata as the weights, -one way to do this is by e.g. - -\code{wdf <- as.data.frame(x$V1, x$V2, x$V3)} -\code{names(wdf) <- c("V1", "V2", "V3", "weights")} -} - -\references{ -Source of the Nordic standard population in 5-year age groups -(also contains European & 1966 world standards): -\url{http://www-dep.iarc.fr/NORDCAN/english/glossary.htm} - -Source of the 1976 European standard population: - -Waterhouse, J.,Muir, C.S.,Correa, P.,Powell, J., eds (1976). -Cancer Incidence in Five Continents, Vol. III. -IARC Scientific Publications, No. 15, Lyon, IARC - -A comparison of the 1966 vs. 2000 world standard populations in 5-year age groups: -\url{http://www3.ha.org.hk/cancereg/e_asr.asp} - -Source of 2000 world standard population in 1-year age groups: -\url{http://seer.cancer.gov/stdpopulations/stdpop.singleages.html} -} -\seealso{ -Other weights: \code{\link{ICSS}}, \code{\link{stdpop101}}, - \code{\link{stdpop18}} - -Other popEpi_argument: \code{\link{flexible_argument}} -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/direct_adjusting.R +\name{direct_standardization} +\alias{direct_standardization} +\alias{direct_adjusting} +\title{Direct Adjusting in \pkg{popEpi} Using Weights} +\description{ +Several functions in \pkg{popEpi} have support for direct standardization +of estimates. This document explains the usage of weighting with those +functions. +} +\details{ +Direct standardization is performed by computing estimates of +\code{E} +by the set of adjusting variables \code{A}, to which a set of weights +\code{W} is applicable. The weighted average over \code{A} is then the +direct-adjusted estimate of \code{E} (\code{E*}). + +To enable both quick and easy as well as more rigorous usage of direct +standardization with weights, the weights arguments in \pkg{popEpi} +can be supplied in several ways. Ability to use the different +ways depends on the number of adjusting variables. + +The weights are always handled internally to sum to 1, so they do not +need to be scaled in this manner when they are supplied. E.g. +counts of subjects in strata may be passed. +} +\section{Basic usage - one adjusting variable}{ + + +In the simple case where we are adjusting by only one variable +(e.g. by age group), one can simply supply a vector of weights: + +\code{FUN(weights = c(0.1, 0.25, 0.25, 0.2, 0.2))} + +which may be stored in advance: + +\code{w <- c(0.1, 0.25, 0.25, 0.2, 0.2)} + +\code{FUN(weights = w)} + +The order of the weights matters. \pkg{popEpi} functions with direct +adjusting enabled match the supplied weights to the adjusting variables +as follows: If the adjusting variable is a \code{factor}, the order +of the levels is used. Otherwise, the alphabetic order of the unique +values is used (try \code{sort} to see how it works). For clarity +and certainty we recommend using \code{factor} or \code{numeric} variables +when possible. \code{character} variables should be avoided: to see why, +try \code{sort(15:9)} and \code{sort(as.character(15:9))}. + +It is also possible to supply a \code{character} string corresponding +to one of the age group standardization schemes integrated into \pkg{popEpi}: + +\itemize{ +\item \code{'europe_1976_18of5'} - European std. population (1976), 18 age groups +\item \code{'nordic_2000_18of5'} - Nordic std. population (2000), 18 age groups +\item \code{'world_1966_18of5'} - world standard (1966), 18 age groups +\item \code{'world_2000_18of5'} - world standard (2000), 18 age groups +\item \code{'world_2000_20of5'} - world standard (2000), 20 age groups +\item \code{'world_2000_101of1'} - world standard (2000), 101 age groups +} + +Additionally, \code{\link{ICSS}} contains international weights used in +cancer survival analysis, but they are not currently usable by passing +a string to \code{weights} and must be supplied by hand. + +You may also supply \code{weights = "internal"} to use internally +computed weights, i.e. usually simply the counts of subjects / person-time +experienced in each stratum. E.g. + +\code{FUN(weights = "world_2000_18of5")} + +will use the world standard population from 2000 as +weights for 18 age groups, that your adjusting variable is +assumed to contain. The adjusting variable must be coded in this case as +a numeric variable containing \code{1:18} or as a \code{factor} with +18 levels (coded from the youngest to the oldest age group). +} + +\section{More than one adjusting variable}{ + + +In the case that you employ more than one adjusting variable, separate +weights should be passed to match to the levels of the different adjusting +variables. When supplied correctly, "grand" weights are formed based on +the variable-specific weights by multiplying over the variable-specific +weights (e.g. if men have \code{w = 0.5} and the age group 0-4 has +\code{w = 0.1}, the "grand" weight for men aged 0-4 is \code{0.5*0.1}). +The "grand" weights are then used for adjusting after ensuring they +sum to one. + +When using multiple adjusting variables, you +are allowed to pass either a named \code{list} of +weights or a \code{data.frame} of weights. E.g. + +\code{WL <- list(agegroup = age_w, sex = sex_w)} + +\code{FUN(weights = WL)} + +where \code{age_w} and \code{sex_w} are numeric vectors. Given the +conditions explained in the previous section are satisfied, you may also do +e.g. + +\code{WL <- list(agegroup = "world_2000_18of", sex = sex_w)} + +\code{FUN(weights = WL)} + +and the world standard pop is used as weights for the age groups as outlined +in the previous section. + +Sometimes using a \code{data.frame} can be clearer (and it is fool-proof +as well). To do this, form a \code{data.frame} that repeats the levels +of your adjusting variables by each level of every other adjusting variable, +and assign the weights as a column named \code{"weights"}. E.g. + +\code{wdf <- data.frame(sex = rep(0:1, each = 18), agegroup = rep(1:18, 2))} + +\code{wdf$weights <- rbinom(36, size = 100, prob = 0.25)} + +\code{FUN(weights = wdf)} + +If you want to use the counts of subjects in strata as the weights, +one way to do this is by e.g. + +\code{wdf <- as.data.frame(x$V1, x$V2, x$V3)} +\code{names(wdf) <- c("V1", "V2", "V3", "weights")} +} + +\references{ +Source of the Nordic standard population in 5-year age groups +(also contains European & 1966 world standards): +\url{http://www-dep.iarc.fr/NORDCAN/english/glossary.htm} + +Source of the 1976 European standard population: + +Waterhouse, J.,Muir, C.S.,Correa, P.,Powell, J., eds (1976). +Cancer Incidence in Five Continents, Vol. III. +IARC Scientific Publications, No. 15, Lyon, IARC + +A comparison of the 1966 vs. 2000 world standard populations in 5-year age groups: +\url{http://www3.ha.org.hk/cancereg/e_asr.asp} + +Source of 2000 world standard population in 1-year age groups: +\url{http://seer.cancer.gov/stdpopulations/stdpop.singleages.html} +} +\seealso{ +Other weights: \code{\link{ICSS}}, \code{\link{stdpop101}}, + \code{\link{stdpop18}} + +Other popEpi argument evaluation docs: \code{\link{flexible_argument}} +} +\author{ +Joonas Miettinen +} diff --git a/man/fac2num.Rd b/man/fac2num.Rd index 5789f7d..208e386 100644 --- a/man/fac2num.Rd +++ b/man/fac2num.Rd @@ -1,37 +1,37 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utility_functions.R -\name{fac2num} -\alias{fac2num} -\title{Convert factor variable to numeric} -\source{ -\href{http://stackoverflow.com/questions/3418128/how-to-convert-a-factor-to-an-integer-numeric-without-a-loss-of-information}{Stackoverflow thread} -} -\usage{ -fac2num(x) -} -\arguments{ -\item{x}{a factor variable with numbers as levels} -} -\description{ -Convert factor variable with numbers as levels into a numeric variable -} -\details{ -For example, a factor with levels \code{c("5","7")} is converted into -a numeric variable with values \code{c(5,7)}. -} -\examples{ -## this is often not intended -as.numeric(factor(c(5,7))) ## result: c(1,2) -## but this -fac2num(factor(c(5,7))) ## result: c(5,7) - -## however -as.numeric(factor(c("5","7","a"))) ## 1:3 - -fac2num(factor(c("5","7","a"))) ## result: c(5,7,NA) with warning - - -} -\seealso{ -\code{\link{robust_values}} -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utility_functions.R +\name{fac2num} +\alias{fac2num} +\title{Convert factor variable to numeric} +\source{ +\href{http://stackoverflow.com/questions/3418128/how-to-convert-a-factor-to-an-integer-numeric-without-a-loss-of-information}{Stackoverflow thread} +} +\usage{ +fac2num(x) +} +\arguments{ +\item{x}{a factor variable with numbers as levels} +} +\description{ +Convert factor variable with numbers as levels into a numeric variable +} +\details{ +For example, a factor with levels \code{c("5","7")} is converted into +a numeric variable with values \code{c(5,7)}. +} +\examples{ +## this is often not intended +as.numeric(factor(c(5,7))) ## result: c(1,2) +## but this +fac2num(factor(c(5,7))) ## result: c(5,7) + +## however +as.numeric(factor(c("5","7","a"))) ## 1:3 + +fac2num(factor(c("5","7","a"))) ## result: c(5,7,NA) with warning + + +} +\seealso{ +\code{\link{robust_values}} +} diff --git a/man/flexible_argument.Rd b/man/flexible_argument.Rd index 9e7e73c..b9dab63 100644 --- a/man/flexible_argument.Rd +++ b/man/flexible_argument.Rd @@ -1,167 +1,167 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/flexyargs.R -\name{flexible_argument} -\alias{flexible_argument} -\title{Flexible Variable Usage in \pkg{popEpi} Functions} -\description{ -Certain arguments in \pkg{popEpi} can be passed in multiple -ways. This document shows the usage and a pitfall in the -usage of such flexible arguments. -} -\details{ -Flexible arguments in \pkg{popEpi} are used to pass variables existing -in your data or in the environment where the function is used -(for everyday users this is the global environment - in simple terms, -where your data is / your work space). The flexible arguments -are modelled after the \code{by} argument in \code{data.tables} - -see \code{?data.table}. There are many ways to supply the same information -to certain functions in \pkg{popEpi}, but the possible ways listed below -may be limited in some of them to only allow for using only a part of them. -} -\section{Everyday usage}{ - - -Most commonly you may pass -variable names as character strings, e.g. - -\code{FUN(arg = c("V1", "V2"), data = x)} - -which may be stored in advance: - -\code{vars <- c("V1", "V2")} - -\code{FUN(arg = vars, data = x)} - -where \code{x} contains those variables. You may also supply variable -names as symbols: - -\code{FUN(arg = V1, data = x)} - -Or as a list of symbols (similarly to as in \code{\link{aggregate}}): - -\code{FUN(arg = list(V1, V2), data = x)} - -Or as a list of expressions: - -\code{FUN(arg = list(V1 + 1, factor(V2)), data = x)} - -A formula without a left-hand-side specified is sometimes allowed as well: - -\code{FUN(arg = ~ I(V1 + 1) + factor(V2), data = x)} - -Using a symbol or a list of symbols/expressions typically -causes the function to look for the variable(s) -first in the supplied data (if any) and then where the function was called. -For everyday users this means you might define e.g. - -\code{V3 <- factor(letters)} - -and do e.g. - -\code{FUN(arg = list(V1 + 1, factor(V2), V3), data = x)} - -provided \code{V1} and \code{V2} exist in \code{x} or in the function calling -environment. -} - -\section{A pitfall}{ - - -There is one way to use flexible arguments incorrectly: By supplying -the name of a variable which exists both in the supplied data -and the calling environment, and intending the latter to be used. E.g. - -\code{vars <- c("V2")} - -\code{FUN(arg = V3, data = x)} - -where \code{x} has a column named \code{vars}. This causes the function to -use \code{x$vars} and NOT \code{x$V2}. -} - -\section{Advanced}{ - - -Function programmers are advised to pass character strings -whenever possible. To fool-proof against conflicts as described in the -section above, refer to the calling environment explicitly when -passing the variable containing the character strings: - -\code{TF <- environment() ## current env to refer to} - -\code{vars <- c("V1", "V2")} - -\code{FUN(arg = TF$vars, data = x)} - -Even if \code{x} has columns named \code{vars} and \code{TF}, -using \code{TF$vars} does not use those columns but only evaluates -\code{TF$vars} -in the calling environment. This is made possible by the fact -that data is always passed as a \code{data.frame}, within which evaluation -of expressions using the dollar operator is not possible. Therefore -it is safe to assume the data should not be used. However, lists of -expressions will not be checked for dollar use and will fail in conflict -situations: - -\code{TF <- environment() ## current env to refer to} - -\code{vars <- letters[1:5]} - -\code{x <- data.frame(vars = 1:5, TF = 5:1, V1 = 10:6)} - -\code{FUN(arg = list(TF$vars, V1), data = x)} - -On the other hand you may typically also pass quoted (\code{\link{quote}}) -or substituted \code{\link{substitute}} expressions etc., where -the \code{env$object} trick will work as well: - -\code{q <- quote(list(vars, V1))} - -\code{FUN(arg = TF$q, data = x)} - -This works even with - -\code{a <- 1:5} - -\code{V1 <- quote(TF$a)} - -\code{FUN(arg = TF$V1, data = x)} - -So no conflicts should occur. -} - -\examples{ - -data(sire) -## prepare data for e.g. 5-year "period analysis" for 2008-2012 -## note: sire is a simulated cohort integrated into popEpi. -BL <- list(fot=seq(0, 5, by = 1/12)) -x <- lexpand(sire, birth = bi_date, entry = dg_date, exit = ex_date, - status = status \%in\% 1:2, - breaks = BL) - -x <- aggre(x, by = fot) - -## silly example of referring to pyrs data by fixed character string; -## its possible that the real name wont be fixed in a real-life application. -pyrs <- "actual_pyrs" -TF <- environment() -x$actual_pyrs <- as.numeric(x$pyrs) -x$pyrs <- 1 - -## this works (uses actual_pyrs eventually) -st <- survtab_ag(fot ~ 1, data = x, surv.type = "surv.obs", - pyrs = TF$pyrs, d = from0to1, - surv.method = "hazard") -## this would be wrong (sees expression 'pyrs' and uses that column, -## which is not what is intended here) -st <- survtab_ag(fot ~ 1, data = x, surv.type = "surv.obs", - pyrs = pyrs, d = from0to1, - surv.method = "hazard") -} -\seealso{ -Other popEpi_argument: \code{\link{direct_standardization}} -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/flexyargs.R +\name{flexible_argument} +\alias{flexible_argument} +\title{Flexible Variable Usage in \pkg{popEpi} Functions} +\description{ +Certain arguments in \pkg{popEpi} can be passed in multiple +ways. This document shows the usage and a pitfall in the +usage of such flexible arguments. +} +\details{ +Flexible arguments in \pkg{popEpi} are used to pass variables existing +in your data or in the environment where the function is used +(for everyday users this is the global environment - in simple terms, +where your data is / your work space). The flexible arguments +are modelled after the \code{by} argument in \code{data.tables} - +see \code{?data.table}. There are many ways to supply the same information +to certain functions in \pkg{popEpi}, but the possible ways listed below +may be limited in some of them to only allow for using only a part of them. +} +\section{Everyday usage}{ + + +Most commonly you may pass +variable names as character strings, e.g. + +\code{FUN(arg = c("V1", "V2"), data = x)} + +which may be stored in advance: + +\code{vars <- c("V1", "V2")} + +\code{FUN(arg = vars, data = x)} + +where \code{x} contains those variables. You may also supply variable +names as symbols: + +\code{FUN(arg = V1, data = x)} + +Or as a list of symbols (similarly to as in \code{\link{aggregate}}): + +\code{FUN(arg = list(V1, V2), data = x)} + +Or as a list of expressions: + +\code{FUN(arg = list(V1 + 1, factor(V2)), data = x)} + +A formula without a left-hand-side specified is sometimes allowed as well: + +\code{FUN(arg = ~ I(V1 + 1) + factor(V2), data = x)} + +Using a symbol or a list of symbols/expressions typically +causes the function to look for the variable(s) +first in the supplied data (if any) and then where the function was called. +For everyday users this means you might define e.g. + +\code{V3 <- factor(letters)} + +and do e.g. + +\code{FUN(arg = list(V1 + 1, factor(V2), V3), data = x)} + +provided \code{V1} and \code{V2} exist in \code{x} or in the function calling +environment. +} + +\section{A pitfall}{ + + +There is one way to use flexible arguments incorrectly: By supplying +the name of a variable which exists both in the supplied data +and the calling environment, and intending the latter to be used. E.g. + +\code{vars <- c("V2")} + +\code{FUN(arg = V3, data = x)} + +where \code{x} has a column named \code{vars}. This causes the function to +use \code{x$vars} and NOT \code{x$V2}. +} + +\section{Advanced}{ + + +Function programmers are advised to pass character strings +whenever possible. To fool-proof against conflicts as described in the +section above, refer to the calling environment explicitly when +passing the variable containing the character strings: + +\code{TF <- environment() ## current env to refer to} + +\code{vars <- c("V1", "V2")} + +\code{FUN(arg = TF$vars, data = x)} + +Even if \code{x} has columns named \code{vars} and \code{TF}, +using \code{TF$vars} does not use those columns but only evaluates +\code{TF$vars} +in the calling environment. This is made possible by the fact +that data is always passed as a \code{data.frame}, within which evaluation +of expressions using the dollar operator is not possible. Therefore +it is safe to assume the data should not be used. However, lists of +expressions will not be checked for dollar use and will fail in conflict +situations: + +\code{TF <- environment() ## current env to refer to} + +\code{vars <- letters[1:5]} + +\code{x <- data.frame(vars = 1:5, TF = 5:1, V1 = 10:6)} + +\code{FUN(arg = list(TF$vars, V1), data = x)} + +On the other hand you may typically also pass quoted (\code{\link{quote}}) +or substituted \code{\link{substitute}} expressions etc., where +the \code{env$object} trick will work as well: + +\code{q <- quote(list(vars, V1))} + +\code{FUN(arg = TF$q, data = x)} + +This works even with + +\code{a <- 1:5} + +\code{V1 <- quote(TF$a)} + +\code{FUN(arg = TF$V1, data = x)} + +So no conflicts should occur. +} + +\examples{ + +data(sire) +## prepare data for e.g. 5-year "period analysis" for 2008-2012 +## note: sire is a simulated cohort integrated into popEpi. +BL <- list(fot=seq(0, 5, by = 1/12)) +x <- lexpand(sire, birth = bi_date, entry = dg_date, exit = ex_date, + status = status \%in\% 1:2, + breaks = BL) + +x <- aggre(x, by = fot) + +## silly example of referring to pyrs data by fixed character string; +## its possible that the real name wont be fixed in a real-life application. +pyrs <- "actual_pyrs" +TF <- environment() +x$actual_pyrs <- as.numeric(x$pyrs) +x$pyrs <- 1 + +## this works (uses actual_pyrs eventually) +st <- survtab_ag(fot ~ 1, data = x, surv.type = "surv.obs", + pyrs = TF$pyrs, d = from0to1, + surv.method = "hazard") +## this would be wrong (sees expression 'pyrs' and uses that column, +## which is not what is intended here) +st <- survtab_ag(fot ~ 1, data = x, surv.type = "surv.obs", + pyrs = pyrs, d = from0to1, + surv.method = "hazard") +} +\seealso{ +Other popEpi argument evaluation docs: \code{\link{direct_standardization}} +} +\author{ +Joonas Miettinen +} diff --git a/man/get.yrs.Rd b/man/get.yrs.Rd index 6421e64..e1f2d84 100644 --- a/man/get.yrs.Rd +++ b/man/get.yrs.Rd @@ -1,78 +1,78 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/fractional_years.R -\name{get.yrs} -\alias{get.yrs} -\title{Convert date objects to fractional years} -\usage{ -get.yrs(x, year.length = "approx", ...) -} -\arguments{ -\item{x}{a \code{Date} object, or anything that \code{link{as.Date}} -accepts} - -\item{year.length}{character string, either \code{'actual'} or -\code{'approx'}; can be abbreviated; see Details} - -\item{...}{additional arguments passed on to \code{\link{as.Date}}; -typically \code{format} when \code{x} is a character string variable, -and \code{origin} when \code{x} is numeric} -} -\description{ -Using Date objects, calculates given -dates as fractional years. -} -\details{ -\code{x} should preferably be a \code{date}, \code{Date} or \code{IDate} -object, although it can also be a character string variable -which is coerced internally to \code{Date} format -using \code{\link{as.Date.character}}. - -When \code{ year.length = 'actual' }, fractional years are calculated as -\code{ year + (day_in_year-1)/365 } for non-leap-years -and as \code{ year + (day_in_year-1)/366 } for leap years. -If \code{ year.length = 'approx' }, fractional years are always -calculated as in \code{ year + (day_in_year-1)/365.242199 }. - -There is a slight difference, then, between the two methods -when calculating durations between fractional years. For -meticulous accuracy one might instead want to calculate durations using -dates (days) and convert the results to fractional years. - -Note that dates are effectively converted to fractional years at -\code{ 00:00:01 } o'clock: - - -\code{ get.yrs("2000-01-01") = 2000 }, and -\code{ get.yrs("2000-01-02") = 2000 + 1/365.242199 }. -} -\examples{ - -data("sire") -sire$dg_yrs <- get.yrs(sire$dg_date) -summary(sire$dg_yrs) - -## see: ?as.Date.yrs -dg_date2 <- as.Date(sire$dg_yrs) -summary(as.numeric(dg_date2 - sire$dg_date)) - -## Epi's cal.yr versus get.yrs -d <- as.Date("2000-01-01") -Epi::cal.yr(d) ## 1999.999 -get.yrs(d) ## 2000 - -## "..." passed on to as.Date, so character / numeric also accepted as input -## (and whatever else as.Date accepts) -get.yrs("2000-06-01") -get.yrs("20000601", format = "\%Y\%m\%d") -get.yrs("1/6/00", format = "\%d/\%m/\%y") - -get.yrs(100, origin = "1970-01-01") - - -} -\seealso{ -\code{\link[Epi]{cal.yr}}, \code{\link{as.Date.yrs}}, \code{\link{as.Date}} -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fractional_years.R +\name{get.yrs} +\alias{get.yrs} +\title{Convert date objects to fractional years} +\usage{ +get.yrs(x, year.length = "approx", ...) +} +\arguments{ +\item{x}{a \code{Date} object, or anything that \code{link{as.Date}} +accepts} + +\item{year.length}{character string, either \code{'actual'} or +\code{'approx'}; can be abbreviated; see Details} + +\item{...}{additional arguments passed on to \code{\link{as.Date}}; +typically \code{format} when \code{x} is a character string variable, +and \code{origin} when \code{x} is numeric} +} +\description{ +Using Date objects, calculates given +dates as fractional years. +} +\details{ +\code{x} should preferably be a \code{date}, \code{Date} or \code{IDate} +object, although it can also be a character string variable +which is coerced internally to \code{Date} format +using \code{\link{as.Date.character}}. + +When \code{ year.length = 'actual' }, fractional years are calculated as +\code{ year + (day_in_year-1)/365 } for non-leap-years +and as \code{ year + (day_in_year-1)/366 } for leap years. +If \code{ year.length = 'approx' }, fractional years are always +calculated as in \code{ year + (day_in_year-1)/365.242199 }. + +There is a slight difference, then, between the two methods +when calculating durations between fractional years. For +meticulous accuracy one might instead want to calculate durations using +dates (days) and convert the results to fractional years. + +Note that dates are effectively converted to fractional years at +\code{ 00:00:01 } o'clock: + + +\code{ get.yrs("2000-01-01") = 2000 }, and +\code{ get.yrs("2000-01-02") = 2000 + 1/365.242199 }. +} +\examples{ + +data("sire") +sire$dg_yrs <- get.yrs(sire$dg_date) +summary(sire$dg_yrs) + +## see: ?as.Date.yrs +dg_date2 <- as.Date(sire$dg_yrs) +summary(as.numeric(dg_date2 - sire$dg_date)) + +## Epi's cal.yr versus get.yrs +d <- as.Date("2000-01-01") +Epi::cal.yr(d) ## 1999.999 +get.yrs(d) ## 2000 + +## "..." passed on to as.Date, so character / numeric also accepted as input +## (and whatever else as.Date accepts) +get.yrs("2000-06-01") +get.yrs("20000601", format = "\%Y\%m\%d") +get.yrs("1/6/00", format = "\%d/\%m/\%y") + +get.yrs(100, origin = "1970-01-01") + + +} +\seealso{ +\code{\link[Epi]{cal.yr}}, \code{\link{as.Date.yrs}}, \code{\link{as.Date}} +} +\author{ +Joonas Miettinen +} diff --git a/man/is.Date.Rd b/man/is.Date.Rd index f9ce84c..442f186 100644 --- a/man/is.Date.Rd +++ b/man/is.Date.Rd @@ -1,41 +1,41 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utility_functions.R -\name{is.Date} -\alias{is.Date} -\title{Test if object is a \code{Date} object} -\usage{ -is.Date(obj) -} -\arguments{ -\item{obj}{object to test on} -} -\description{ -Tests if an object is a \code{Date} object and returns -a logical vector of length 1. \code{IDate} objects are also -\code{Date} objects, but \code{date} objects from package \pkg{date} -are not. -} -\examples{ -## the base "capital Date" format -da <- as.Date("2000-01-01") -is.Date(da) ## TRUE -date::is.date(da) ## FALSE - -## IDate format from data.table -library("data.table") -da <- as.IDate("2000-01-01") -is.Date(da) ## TRUE -date::is.date(da) ## FALSE - -## from package "date" -da <- date::as.date("1jan2000") -is.Date(da) ## FALSE -date::is.date(da) ## TRUE - -} -\seealso{ -\code{\link{get.yrs}}, \code{\link{is_leap_year}}, \code{\link{as.Date}} -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utility_functions.R +\name{is.Date} +\alias{is.Date} +\title{Test if object is a \code{Date} object} +\usage{ +is.Date(obj) +} +\arguments{ +\item{obj}{object to test on} +} +\description{ +Tests if an object is a \code{Date} object and returns +a logical vector of length 1. \code{IDate} objects are also +\code{Date} objects, but \code{date} objects from package \pkg{date} +are not. +} +\examples{ +## the base "capital Date" format +da <- as.Date("2000-01-01") +is.Date(da) ## TRUE +date::is.date(da) ## FALSE + +## IDate format from data.table +library("data.table") +da <- as.IDate("2000-01-01") +is.Date(da) ## TRUE +date::is.date(da) ## FALSE + +## from package "date" +da <- date::as.date("1jan2000") +is.Date(da) ## FALSE +date::is.date(da) ## TRUE + +} +\seealso{ +\code{\link{get.yrs}}, \code{\link{is_leap_year}}, \code{\link{as.Date}} +} +\author{ +Joonas Miettinen +} diff --git a/man/is_leap_year.Rd b/man/is_leap_year.Rd index 485f951..7b31e02 100644 --- a/man/is_leap_year.Rd +++ b/man/is_leap_year.Rd @@ -1,28 +1,28 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utility_functions.R -\name{is_leap_year} -\alias{is_leap_year} -\title{Detect leap years} -\usage{ -is_leap_year(years) -} -\arguments{ -\item{years}{a vector or column of year values (numeric or integer)} -} -\description{ -Given a vector or column of year values (numeric or integer), \code{\link{is_leap_year}} returns a vector of equal length -of logical indicators, i.e. a vector where corresponding leap years have value TRUE, and FALSE otherwise. -} -\examples{ -## can be used to assign new columns easily, e.g. a dummy indicator column -df <- data.frame(yrs=c(1900,1904,2005,1995)) -df$lyd <- as.integer(is_leap_year(df$yrs)) - -## mostly it is useful as a condition or to indicate which rows have leap years -which(is_leap_year(df$yrs)) # 2 -df[is_leap_year(df$yrs),] # 2nd row - -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utility_functions.R +\name{is_leap_year} +\alias{is_leap_year} +\title{Detect leap years} +\usage{ +is_leap_year(years) +} +\arguments{ +\item{years}{a vector or column of year values (numeric or integer)} +} +\description{ +Given a vector or column of year values (numeric or integer), \code{\link{is_leap_year}} returns a vector of equal length +of logical indicators, i.e. a vector where corresponding leap years have value TRUE, and FALSE otherwise. +} +\examples{ +## can be used to assign new columns easily, e.g. a dummy indicator column +df <- data.frame(yrs=c(1900,1904,2005,1995)) +df$lyd <- as.integer(is_leap_year(df$yrs)) + +## mostly it is useful as a condition or to indicate which rows have leap years +which(is_leap_year(df$yrs)) # 2 +df[is_leap_year(df$yrs),] # 2nd row + +} +\author{ +Joonas Miettinen +} diff --git a/man/lexpand.Rd b/man/lexpand.Rd index 5ee5fb7..e752a1a 100644 --- a/man/lexpand.Rd +++ b/man/lexpand.Rd @@ -1,368 +1,368 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/lexpand.R -\name{lexpand} -\alias{lexpand} -\title{Split case-level observations} -\usage{ -lexpand(data, birth = NULL, entry = NULL, exit = NULL, event = NULL, - status = status != 0, entry.status = NULL, breaks = list(fot = c(0, - Inf)), id = NULL, overlapping = TRUE, aggre = NULL, - aggre.type = c("unique", "cartesian"), drop = TRUE, pophaz = NULL, - pp = TRUE, subset = NULL, merge = TRUE, verbose = FALSE, ...) -} -\arguments{ -\item{data}{dataset of e.g. cancer cases as rows} - -\item{birth}{birth time in date format -or fractional years; string, symbol or expression} - -\item{entry}{entry time in date format -or fractional years; string, symbol or expression} - -\item{exit}{exit from follow-up time in date -format or fractional years; string, symbol or expression} - -\item{event}{advanced: time of possible event differing from \code{exit}; -typically only used in certain SIR/SMR calculations - see Details; -string, symbol or expression} - -\item{status}{variable indicating type of event at \code{exit} or \code{event}; -e.g. \code{status = status != 0}; expression or quoted variable name} - -\item{entry.status}{input in the same way as \code{status}; -status at \code{entry}; see Details} - -\item{breaks}{a named list of vectors of time breaks; -e.g. \code{breaks = list(fot=0:5, age=c(0,45,65,Inf))}; see Details} - -\item{id}{optional; an id variable; e.g. \code{id = my_id}; -string, symbol or expression} - -\item{overlapping}{advanced, logical; if \code{FALSE} AND if \code{data} contains -multiple rows per subject, -ensures that the timelines of \code{id}-specific rows do not overlap; -this ensures e.g. that person-years are only computed once per subject -in a multi-state paradigm} - -\item{aggre}{e.g. \code{aggre = list(sex, fot)}; -a list of unquoted variables and/or expressions thereof, -which are interpreted as factors; data events and person-years will -be aggregated by the unique combinations of these; see Details} - -\item{aggre.type}{one of \code{c("unique","cartesian")}; -can be abbreviated; see Details} - -\item{drop}{logical; if \code{TRUE}, drops all resulting rows -after splitting that reside outside -the time window as defined by the given breaks (all time scales)} - -\item{pophaz}{a dataset of population hazards to merge -with split data; see Details} - -\item{pp}{logical; if \code{TRUE}, computes Pohar-Perme weights using -\code{pophaz}; adds variable with reserved name \code{pp}; -see Details for computing method} - -\item{subset}{a logical vector or any logical condition; data is subsetted -before splitting accordingly} - -\item{merge}{logical; if \code{TRUE}, retains all -original variables from the data} - -\item{verbose}{logical; if \code{TRUE}, the function is chatty and -returns some messages along the way} - -\item{...}{e.g. \code{fot = 0:5}; instead of specifying a \code{breaks} list, -correctly named breaks vectors can be given -for \code{fot}, \code{age}, and \code{per}; these override any breaks in the -\code{breaks} list; see Examples} -} -\value{ -If \code{aggre = NULL}, returns -a \code{data.table} or \code{data.frame} -(depending on \code{options("popEpi.datatable")}; see \code{?popEpi}) -object expanded to accommodate split observations with time scales as -fractional years and \code{pophaz} merged in if given. Population -hazard levels in new variable \code{pop.haz}, and Pohar-Perme -weights as new variable \code{pp} if requested. - -If \code{aggre} is defined, returns a long-format -\code{data.table}/\code{data.frame} with the variable \code{pyrs} (person-years), -and variables for the counts of transitions in state or state at end of -follow-up formatted \code{fromXtoY}, where \code{X} and \code{Y} are -the states transitioned from and to, respectively. The data may also have -the columns \code{d.exp} for expected numbers of cases and various -Pohar-Perme weighted figures as identified by the suffix \code{.pp}; see -Details. -} -\description{ -Given subject-level data, data is split -by calendar time (\code{per}), \code{age}, and follow-up -time (\code{fot}, from 0 to the end of follow-up) -into subject-time-interval rows according to -given \code{breaks} and additionally processed if requested. -} -\details{ -\strong{Basics} - -\code{\link{lexpand}} splits a given data set (with e.g. cancer diagnoses -as rows) to subintervals of time over -calendar time, age, and follow-up time with given time breaks -using \code{\link{splitMulti}}. - -The dataset must contain appropriate -\code{Date} / \code{IDate} / \code{date} format or -other numeric variables that can be used -as the time variables. - -You may take a look at a simulated cohort -\code{\link{sire}} as an example of the -minimum required information for processing data with \code{lexpand}. - -Many arguments can be supplied as a character string naming the appropriate -variable (e.g. \code{"sex"}), as a symbol (e.g. \code{sex}) or as an expression -(e.g. \code{factor(sex, 0:1, c("m", "f"))}) for flexibility. - -\strong{Breaks} - -You should define all breaks as left inclusive and right exclusive -time points (e.g.\code{[a,b)} ) -for 1-3 time dimensions so that the last member of a breaks vector -is a meaningful "final upper limit", - e.g. \code{per = c(2002,2007,2012)} -to create a last subinterval of the form \code{[2007,2012)}. - -All breaks are explicit, i.e. if \code{drop = TRUE}, -any data beyond the outermost breaks points are dropped. -If one wants to have unspecified upper / lower limits on one time scale, -use \code{Inf}: e.g. \code{breaks = list(fot = 0:5, age = c(0,45,Inf))}. -Breaks for \code{per} can also be given in -\code{Date}/\code{IDate}/\code{date} format, whereupon -they are converted to fractional years before used in splitting. - -The \code{age} time scale can additionally -be automatically split into common age grouping schemes -by naming the scheme with an appropriate character string: - -\itemize{ - \item \code{"18of5"}: age groups 0-4, 5-9, 10-14, ..., 75-79, 80-84, 85+ - \item \code{"20of5"}: age groups 0-4, 5-9, 10-14, ..., 85-89, 90-94, 95+ - \item \code{"101of1"}: age groups 0, 1, 2, ..., 98, 99, 100+ -} - -\strong{Time variables} - -If any of the given time variables -(\code{birth}, \code{entry}, \code{exit}, \code{event}) -is in any kind of date format, they are first coerced to -fractional years before splitting -using \code{\link{get.yrs}} (with \code{year.length = "actual"}). - -Sometimes in e.g. SIR/SMR calculation one may want the event time to differ -from the time of exit from follow-up, if the subject is still considered -to be at risk of the event. If \code{event} is specified, the transition to - \code{status} is moved to \code{event} from \code{exit} - using \code{\link[Epi]{cutLexis}}. See Examples. - -\strong{The status variable} - -The statuses in the expanded output (\code{lex.Cst} and \code{lex.Xst}) -are determined by using either only \code{status} or both \code{status} -and \code{entry.status}. If \code{entry.status = NULL}, the status at entry -is guessed according to the type of variable supplied via \code{status}: -For numeric variables it will be zero, for factors the first level -(\code{levels(status)[1]}) and otherwise the first unique value in alphabetical -order (\code{sort(unique(status))[1]}). - -Using numeric or factor status -variables is strongly recommended. Logical expressions are also allowed -(e.g. \code{status = my_status != 0L}) and are converted to integer internally. - -\strong{Merging population hazard information} - -To enable computing relative/net survivals with \code{\link{survtab}} -and \code{\link{relpois}}, \code{lexpand} merges an appropriate -population hazard data (\code{pophaz}) to the expanded data -before dropping rows outside the specified -time window (if \code{drop = TRUE}). \code{pophaz} must, for this reason, -contain at a minimum the variables named -\code{agegroup}, \code{year}, and \code{haz}. \code{pophaz} may contain additional variables to specify -different population hazard levels in different strata; e.g. \code{popmort} includes \code{sex}. -All the strata-defining variables must be present in the supplied \code{data}. \code{lexpand} will -automatically detect variables with common names in the two datasets and merge using them. - -Currently \code{year} must be an integer variable specifying the appropriate year. \code{agegroup} -must currently also specify one-year age groups, e.g. \code{popmort} specifies 101 age groups -of length 1 year. In both -\code{year} and \code{agegroup} variables the values are interpreted as the lower bounds of intervals -(and passed on to a \code{cut} call). The mandatory variable \code{haz} -must specify the appropriate average rate at the person-year level; -e.g. \code{haz = -log(survProb)} where \code{survProb} is a one-year conditional -survival probability will be the correct hazard specification. - -The corresponding \code{pophaz} population hazard value is merged by using the mid points -of the records after splitting as reference values. E.g. if \code{age=89.9} at the start -of a 1-year interval, then the reference age value is \code{90.4} for merging. -This way we get a "typical" population hazard level for each record. - -\strong{Computing Pohar-Perme weights} - -If \code{pp = TRUE}, Pohar-Perme weights -(the inverse of cumulative population survival) are computed. This will -create the new \code{pp} variable in the expanded data. \code{pp} is a -reserved name and \code{lexpand} throws exception if a variable with that name -exists in \code{data}. - -When a survival interval contains one or several rows per subject -(e.g. due to splitting by the \code{per} scale), -\code{pp} is cumulated from the beginning of the first record in a survival -interval for each subject to the mid-point of the remaining time within that -survival interval, and that value is given for every other record -that a given person has within the same survival interval. - -E.g. with 5 rows of duration \code{1/5} within a survival interval -\code{[0,1)]}, \code{pp} is determined for all records by a cumulative -population survival from \code{0} to \code{0.5}. Th existing accuracy is used, -so that the weight is cumulated first up to the end of the second row -and then over the remaining distance to the mid-point (first to 0.4, then to -0.5). This ensures that more accurately merged population hazards are fully -used. - -\strong{Event not at end of follow-up & overlapping time lines} - -\code{event} may be used if the event indicated by \code{status} should -occur at a time differing from \code{exit}. If \code{event} is defined, -\code{cutLexis} is used on the data set after coercing it to the \code{Lexis} -format and before splitting. Note that some values of \code{event} are allowed -to be \code{NA} as with \code{cutLexis} to accommodate observations -without an event occurring. - -Additionally, setting \code{overlapping = FALSE} ensures that (irrespective -of using \code{event}) the each subject defined by \code{id} only has one -continuous time line instead of possibly overlapping time lines if -there are multiple rows in \code{data} by \code{id}. - - -\strong{Aggregating} - -Certain analyses such as SIR/SMR calculations require tables of events and -person-years by the unique combinations (interactions) of several variables. -For this, \code{aggre} can be specified as a list of such variables -(preferably \code{factor} variables but not mandatory) - and any arbitrary functions of the -variables at one's disposal. E.g. - -\code{aggre = list(sex, agegr = cut(dg_age, 0:100))} - -would tabulate events and person-years by sex and an ad-hoc age group -variable. Every ad-hoc-created variable should be named. - -\code{fot}, \code{per}, and \code{age} are special reserved variables which, -when present in the \code{aggre} list, are output as categories of the -corresponding time scale variables by using -e.g. - -\code{cut(fot, breaks$fot, right=FALSE)}. - -This only works if -the corresponding breaks are defined in \code{breaks} or via "\code{...}". -E.g. - -\code{aggre = list(sex, fot.int = fot)} with - -\code{breaks = list(fot=0:5)}. - -The output variable \code{fot.int} in the above example will have -the lower limits of the appropriate intervals as values. - -\code{aggre} as a named list will output numbers of events and person-years -with the given new names as categorizing variable names, e.g. -\code{aggre = list(follow_up = fot, gender = sex, agegroup = age)}. - -The output table has person-years (\code{pyrs}) and event counts -(e.g. \code{from0to1}) as columns. Event counts are the numbers of transitions -(\code{lex.Cst != lex.Xst}) or the \code{lex.Xst} value at a subject's -last record (subject possibly defined by \code{id}). - -If \code{aggre.type = "unique"} (alias \code{"non-empty"}), -the above results are computed for existing -combinations of expressions given in \code{aggre}, but also for non-existing -combinations if \code{aggre.type = "cartesian"} (alias \code{"full"}). E.g. if a -factor variable has levels \code{"a", "b", "c"} but the data is limited -to only have levels \code{"a", "b"} present -(more than zero rows have these level values), the former setting only -computes results for \code{"a", "b"}, and the latter also for \code{"c"} -and any combination with other variables or expression given in \code{aggre}. -In essence, \code{"cartesian"} forces also combinations of variables used -in \code{aggre} that have no match in data to be shown in the result. - -If \code{aggre} is not \code{NULL} and \code{pophaz} has been supplied, -\code{lexpand} also aggregates the expected counts of events, which -appears in the output data by the reserved name \code{d.exp}. Additionally, -having \code{pp = TRUE} causes \code{lexpand} to also compute various -Pohar-Perme weighted figures necessary for computing Pohar-Perme net survivals -with \code{\link{survtab_ag}}. This can be slow, so consider what is really -needed. The Pohar-Perme weighted figures have the suffix \code{.pp}. -} -\examples{ -\dontrun{ -## prepare data for e.g. 5-year cohort survival calculation -x <- lexpand(sire, breaks=list(fot=seq(0, 5, by = 1/12)), - birth = bi_date, entry = dg_date, exit = ex_date, - status = status != 0, pophaz=popmort) - -## prepare data for e.g. 5-year "period analysis" for 2008-2012 -BL <- list(fot = seq(0, 5, by = 1/12), per = c("2008-01-01", "2013-01-01")) -x <- lexpand(sire, breaks = BL, - birth = bi_date, entry = dg_date, exit = ex_date, - pophaz=popmort, status = status != 0) - -## aggregating -BL <- list(fot = 0:5, per = c("2003-01-01","2008-01-01", "2013-01-01")) -ag <- lexpand(sire, breaks = BL, status = status != 0, - birth = bi_date, entry = dg_date, exit = ex_date, - aggre=list(sex, period = per, surv.int = fot)) - -## aggregating even more -ag <- lexpand(sire, breaks = BL, status = status != 0, - birth = bi_date, entry = dg_date, exit = ex_date, - aggre=list(sex, period = per, surv.int = fot), - pophaz = popmort, pp = TRUE) - -## using "..." -x <- lexpand(sire, fot=0:5, status = status != 0, - birth = bi_date, entry = dg_date, exit = ex_date, - pophaz=popmort) - -x <- lexpand(sire, fot=0:5, status = status != 0, - birth = bi_date, entry = dg_date, exit = ex_date, - aggre=list(sex, surv.int = fot)) - -## using the "event" argument: it just places the transition to given "status" -## at the "event" time instead of at the end, if possible using cutLexis -x <- lexpand(sire, status = status, event = dg_date, - birth = bi_date, entry = dg_date, exit = ex_date,) - -## aggregating with custom "event" time -## (the transition to status is moved to the "event" time) -x <- lexpand(sire, status = status, event = dg_date, - birth = bi_date, entry = dg_date, exit = ex_date, - per = 1970:2014, age = c(0:100,Inf), - aggre = list(sex, year = per, agegroup = age)) - -} - -} -\seealso{ -\code{\link[Epi]{Lexis}}, \code{\link{popmort}} - -Other splitting functions: \code{\link{splitLexisDT}}, - \code{\link{splitMulti}} - -Other aggregation functions: \code{\link{aggre}}, - \code{\link{as.aggre}}, \code{\link{setaggre}}, - \code{\link{summary.aggre}} -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lexpand.R +\name{lexpand} +\alias{lexpand} +\title{Split case-level observations} +\usage{ +lexpand(data, birth = NULL, entry = NULL, exit = NULL, event = NULL, + status = status != 0, entry.status = NULL, breaks = list(fot = c(0, + Inf)), id = NULL, overlapping = TRUE, aggre = NULL, + aggre.type = c("unique", "cartesian"), drop = TRUE, pophaz = NULL, + pp = TRUE, subset = NULL, merge = TRUE, verbose = FALSE, ...) +} +\arguments{ +\item{data}{dataset of e.g. cancer cases as rows} + +\item{birth}{birth time in date format +or fractional years; string, symbol or expression} + +\item{entry}{entry time in date format +or fractional years; string, symbol or expression} + +\item{exit}{exit from follow-up time in date +format or fractional years; string, symbol or expression} + +\item{event}{advanced: time of possible event differing from \code{exit}; +typically only used in certain SIR/SMR calculations - see Details; +string, symbol or expression} + +\item{status}{variable indicating type of event at \code{exit} or \code{event}; +e.g. \code{status = status != 0}; expression or quoted variable name} + +\item{entry.status}{input in the same way as \code{status}; +status at \code{entry}; see Details} + +\item{breaks}{a named list of vectors of time breaks; +e.g. \code{breaks = list(fot=0:5, age=c(0,45,65,Inf))}; see Details} + +\item{id}{optional; an id variable; e.g. \code{id = my_id}; +string, symbol or expression} + +\item{overlapping}{advanced, logical; if \code{FALSE} AND if \code{data} contains +multiple rows per subject, +ensures that the timelines of \code{id}-specific rows do not overlap; +this ensures e.g. that person-years are only computed once per subject +in a multi-state paradigm} + +\item{aggre}{e.g. \code{aggre = list(sex, fot)}; +a list of unquoted variables and/or expressions thereof, +which are interpreted as factors; data events and person-years will +be aggregated by the unique combinations of these; see Details} + +\item{aggre.type}{one of \code{c("unique","cartesian")}; +can be abbreviated; see Details} + +\item{drop}{logical; if \code{TRUE}, drops all resulting rows +after splitting that reside outside +the time window as defined by the given breaks (all time scales)} + +\item{pophaz}{a dataset of population hazards to merge +with split data; see Details} + +\item{pp}{logical; if \code{TRUE}, computes Pohar-Perme weights using +\code{pophaz}; adds variable with reserved name \code{pp}; +see Details for computing method} + +\item{subset}{a logical vector or any logical condition; data is subsetted +before splitting accordingly} + +\item{merge}{logical; if \code{TRUE}, retains all +original variables from the data} + +\item{verbose}{logical; if \code{TRUE}, the function is chatty and +returns some messages along the way} + +\item{...}{e.g. \code{fot = 0:5}; instead of specifying a \code{breaks} list, +correctly named breaks vectors can be given +for \code{fot}, \code{age}, and \code{per}; these override any breaks in the +\code{breaks} list; see Examples} +} +\value{ +If \code{aggre = NULL}, returns +a \code{data.table} or \code{data.frame} +(depending on \code{options("popEpi.datatable")}; see \code{?popEpi}) +object expanded to accommodate split observations with time scales as +fractional years and \code{pophaz} merged in if given. Population +hazard levels in new variable \code{pop.haz}, and Pohar-Perme +weights as new variable \code{pp} if requested. + +If \code{aggre} is defined, returns a long-format +\code{data.table}/\code{data.frame} with the variable \code{pyrs} (person-years), +and variables for the counts of transitions in state or state at end of +follow-up formatted \code{fromXtoY}, where \code{X} and \code{Y} are +the states transitioned from and to, respectively. The data may also have +the columns \code{d.exp} for expected numbers of cases and various +Pohar-Perme weighted figures as identified by the suffix \code{.pp}; see +Details. +} +\description{ +Given subject-level data, data is split +by calendar time (\code{per}), \code{age}, and follow-up +time (\code{fot}, from 0 to the end of follow-up) +into subject-time-interval rows according to +given \code{breaks} and additionally processed if requested. +} +\details{ +\strong{Basics} + +\code{\link{lexpand}} splits a given data set (with e.g. cancer diagnoses +as rows) to subintervals of time over +calendar time, age, and follow-up time with given time breaks +using \code{\link{splitMulti}}. + +The dataset must contain appropriate +\code{Date} / \code{IDate} / \code{date} format or +other numeric variables that can be used +as the time variables. + +You may take a look at a simulated cohort +\code{\link{sire}} as an example of the +minimum required information for processing data with \code{lexpand}. + +Many arguments can be supplied as a character string naming the appropriate +variable (e.g. \code{"sex"}), as a symbol (e.g. \code{sex}) or as an expression +(e.g. \code{factor(sex, 0:1, c("m", "f"))}) for flexibility. + +\strong{Breaks} + +You should define all breaks as left inclusive and right exclusive +time points (e.g.\code{[a,b)} ) +for 1-3 time dimensions so that the last member of a breaks vector +is a meaningful "final upper limit", + e.g. \code{per = c(2002,2007,2012)} +to create a last subinterval of the form \code{[2007,2012)}. + +All breaks are explicit, i.e. if \code{drop = TRUE}, +any data beyond the outermost breaks points are dropped. +If one wants to have unspecified upper / lower limits on one time scale, +use \code{Inf}: e.g. \code{breaks = list(fot = 0:5, age = c(0,45,Inf))}. +Breaks for \code{per} can also be given in +\code{Date}/\code{IDate}/\code{date} format, whereupon +they are converted to fractional years before used in splitting. + +The \code{age} time scale can additionally +be automatically split into common age grouping schemes +by naming the scheme with an appropriate character string: + +\itemize{ + \item \code{"18of5"}: age groups 0-4, 5-9, 10-14, ..., 75-79, 80-84, 85+ + \item \code{"20of5"}: age groups 0-4, 5-9, 10-14, ..., 85-89, 90-94, 95+ + \item \code{"101of1"}: age groups 0, 1, 2, ..., 98, 99, 100+ +} + +\strong{Time variables} + +If any of the given time variables +(\code{birth}, \code{entry}, \code{exit}, \code{event}) +is in any kind of date format, they are first coerced to +fractional years before splitting +using \code{\link{get.yrs}} (with \code{year.length = "actual"}). + +Sometimes in e.g. SIR/SMR calculation one may want the event time to differ +from the time of exit from follow-up, if the subject is still considered +to be at risk of the event. If \code{event} is specified, the transition to + \code{status} is moved to \code{event} from \code{exit} + using \code{\link[Epi]{cutLexis}}. See Examples. + +\strong{The status variable} + +The statuses in the expanded output (\code{lex.Cst} and \code{lex.Xst}) +are determined by using either only \code{status} or both \code{status} +and \code{entry.status}. If \code{entry.status = NULL}, the status at entry +is guessed according to the type of variable supplied via \code{status}: +For numeric variables it will be zero, for factors the first level +(\code{levels(status)[1]}) and otherwise the first unique value in alphabetical +order (\code{sort(unique(status))[1]}). + +Using numeric or factor status +variables is strongly recommended. Logical expressions are also allowed +(e.g. \code{status = my_status != 0L}) and are converted to integer internally. + +\strong{Merging population hazard information} + +To enable computing relative/net survivals with \code{\link{survtab}} +and \code{\link{relpois}}, \code{lexpand} merges an appropriate +population hazard data (\code{pophaz}) to the expanded data +before dropping rows outside the specified +time window (if \code{drop = TRUE}). \code{pophaz} must, for this reason, +contain at a minimum the variables named +\code{agegroup}, \code{year}, and \code{haz}. \code{pophaz} may contain additional variables to specify +different population hazard levels in different strata; e.g. \code{popmort} includes \code{sex}. +All the strata-defining variables must be present in the supplied \code{data}. \code{lexpand} will +automatically detect variables with common names in the two datasets and merge using them. + +Currently \code{year} must be an integer variable specifying the appropriate year. \code{agegroup} +must currently also specify one-year age groups, e.g. \code{popmort} specifies 101 age groups +of length 1 year. In both +\code{year} and \code{agegroup} variables the values are interpreted as the lower bounds of intervals +(and passed on to a \code{cut} call). The mandatory variable \code{haz} +must specify the appropriate average rate at the person-year level; +e.g. \code{haz = -log(survProb)} where \code{survProb} is a one-year conditional +survival probability will be the correct hazard specification. + +The corresponding \code{pophaz} population hazard value is merged by using the mid points +of the records after splitting as reference values. E.g. if \code{age=89.9} at the start +of a 1-year interval, then the reference age value is \code{90.4} for merging. +This way we get a "typical" population hazard level for each record. + +\strong{Computing Pohar-Perme weights} + +If \code{pp = TRUE}, Pohar-Perme weights +(the inverse of cumulative population survival) are computed. This will +create the new \code{pp} variable in the expanded data. \code{pp} is a +reserved name and \code{lexpand} throws exception if a variable with that name +exists in \code{data}. + +When a survival interval contains one or several rows per subject +(e.g. due to splitting by the \code{per} scale), +\code{pp} is cumulated from the beginning of the first record in a survival +interval for each subject to the mid-point of the remaining time within that +survival interval, and that value is given for every other record +that a given person has within the same survival interval. + +E.g. with 5 rows of duration \code{1/5} within a survival interval +\code{[0,1)]}, \code{pp} is determined for all records by a cumulative +population survival from \code{0} to \code{0.5}. Th existing accuracy is used, +so that the weight is cumulated first up to the end of the second row +and then over the remaining distance to the mid-point (first to 0.4, then to +0.5). This ensures that more accurately merged population hazards are fully +used. + +\strong{Event not at end of follow-up & overlapping time lines} + +\code{event} may be used if the event indicated by \code{status} should +occur at a time differing from \code{exit}. If \code{event} is defined, +\code{cutLexis} is used on the data set after coercing it to the \code{Lexis} +format and before splitting. Note that some values of \code{event} are allowed +to be \code{NA} as with \code{cutLexis} to accommodate observations +without an event occurring. + +Additionally, setting \code{overlapping = FALSE} ensures that (irrespective +of using \code{event}) the each subject defined by \code{id} only has one +continuous time line instead of possibly overlapping time lines if +there are multiple rows in \code{data} by \code{id}. + + +\strong{Aggregating} + +Certain analyses such as SIR/SMR calculations require tables of events and +person-years by the unique combinations (interactions) of several variables. +For this, \code{aggre} can be specified as a list of such variables +(preferably \code{factor} variables but not mandatory) + and any arbitrary functions of the +variables at one's disposal. E.g. + +\code{aggre = list(sex, agegr = cut(dg_age, 0:100))} + +would tabulate events and person-years by sex and an ad-hoc age group +variable. Every ad-hoc-created variable should be named. + +\code{fot}, \code{per}, and \code{age} are special reserved variables which, +when present in the \code{aggre} list, are output as categories of the +corresponding time scale variables by using +e.g. + +\code{cut(fot, breaks$fot, right=FALSE)}. + +This only works if +the corresponding breaks are defined in \code{breaks} or via "\code{...}". +E.g. + +\code{aggre = list(sex, fot.int = fot)} with + +\code{breaks = list(fot=0:5)}. + +The output variable \code{fot.int} in the above example will have +the lower limits of the appropriate intervals as values. + +\code{aggre} as a named list will output numbers of events and person-years +with the given new names as categorizing variable names, e.g. +\code{aggre = list(follow_up = fot, gender = sex, agegroup = age)}. + +The output table has person-years (\code{pyrs}) and event counts +(e.g. \code{from0to1}) as columns. Event counts are the numbers of transitions +(\code{lex.Cst != lex.Xst}) or the \code{lex.Xst} value at a subject's +last record (subject possibly defined by \code{id}). + +If \code{aggre.type = "unique"} (alias \code{"non-empty"}), +the above results are computed for existing +combinations of expressions given in \code{aggre}, but also for non-existing +combinations if \code{aggre.type = "cartesian"} (alias \code{"full"}). E.g. if a +factor variable has levels \code{"a", "b", "c"} but the data is limited +to only have levels \code{"a", "b"} present +(more than zero rows have these level values), the former setting only +computes results for \code{"a", "b"}, and the latter also for \code{"c"} +and any combination with other variables or expression given in \code{aggre}. +In essence, \code{"cartesian"} forces also combinations of variables used +in \code{aggre} that have no match in data to be shown in the result. + +If \code{aggre} is not \code{NULL} and \code{pophaz} has been supplied, +\code{lexpand} also aggregates the expected counts of events, which +appears in the output data by the reserved name \code{d.exp}. Additionally, +having \code{pp = TRUE} causes \code{lexpand} to also compute various +Pohar-Perme weighted figures necessary for computing Pohar-Perme net survivals +with \code{\link{survtab_ag}}. This can be slow, so consider what is really +needed. The Pohar-Perme weighted figures have the suffix \code{.pp}. +} +\examples{ +\dontrun{ +## prepare data for e.g. 5-year cohort survival calculation +x <- lexpand(sire, breaks=list(fot=seq(0, 5, by = 1/12)), + birth = bi_date, entry = dg_date, exit = ex_date, + status = status != 0, pophaz=popmort) + +## prepare data for e.g. 5-year "period analysis" for 2008-2012 +BL <- list(fot = seq(0, 5, by = 1/12), per = c("2008-01-01", "2013-01-01")) +x <- lexpand(sire, breaks = BL, + birth = bi_date, entry = dg_date, exit = ex_date, + pophaz=popmort, status = status != 0) + +## aggregating +BL <- list(fot = 0:5, per = c("2003-01-01","2008-01-01", "2013-01-01")) +ag <- lexpand(sire, breaks = BL, status = status != 0, + birth = bi_date, entry = dg_date, exit = ex_date, + aggre=list(sex, period = per, surv.int = fot)) + +## aggregating even more +ag <- lexpand(sire, breaks = BL, status = status != 0, + birth = bi_date, entry = dg_date, exit = ex_date, + aggre=list(sex, period = per, surv.int = fot), + pophaz = popmort, pp = TRUE) + +## using "..." +x <- lexpand(sire, fot=0:5, status = status != 0, + birth = bi_date, entry = dg_date, exit = ex_date, + pophaz=popmort) + +x <- lexpand(sire, fot=0:5, status = status != 0, + birth = bi_date, entry = dg_date, exit = ex_date, + aggre=list(sex, surv.int = fot)) + +## using the "event" argument: it just places the transition to given "status" +## at the "event" time instead of at the end, if possible using cutLexis +x <- lexpand(sire, status = status, event = dg_date, + birth = bi_date, entry = dg_date, exit = ex_date,) + +## aggregating with custom "event" time +## (the transition to status is moved to the "event" time) +x <- lexpand(sire, status = status, event = dg_date, + birth = bi_date, entry = dg_date, exit = ex_date, + per = 1970:2014, age = c(0:100,Inf), + aggre = list(sex, year = per, agegroup = age)) + +} + +} +\seealso{ +\code{\link[Epi]{Lexis}}, \code{\link{popmort}} + +Other splitting functions: \code{\link{splitLexisDT}}, + \code{\link{splitMulti}} + +Other aggregation functions: \code{\link{aggre}}, + \code{\link{as.aggre}}, \code{\link{setaggre}}, + \code{\link{summary.aggre}} +} +\author{ +Joonas Miettinen +} diff --git a/man/lines.sirspline.Rd b/man/lines.sirspline.Rd index af8c9ab..a176acb 100644 --- a/man/lines.sirspline.Rd +++ b/man/lines.sirspline.Rd @@ -1,42 +1,42 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/S3_definitions.R -\name{lines.sirspline} -\alias{lines.sirspline} -\title{lines method for sirspline-object} -\usage{ -\method{lines}{sirspline}(x, conf.int = TRUE, print.levels = NA, - select.spline, ...) -} -\arguments{ -\item{x}{an object returned by function sirspline} - -\item{conf.int}{logical; default TRUE draws also the 95 confidence intervals} - -\item{print.levels}{name(s) to be plotted. Default plots all levels.} - -\item{select.spline}{select which spline variable (a number or a name) is plotted.} - -\item{...}{arguments passed on to lines()} -} -\description{ -Plot SIR spline lines with R base graphics -} -\details{ -In \code{lines.sirspline} most of graphical parameters is user -adjustable. -Desired spline variable can be selected with \code{select.spline} and only one -can be plotted at a time. The spline variable can include -several levels, e.g. gender (these are the levels of \code{print} -from \code{sirspline}). All levels are printed by default, but a -specific level can be selected using argument -\code{print.levels}. Printing the levels separately enables e.g. to -give different colours for each level. -} -\seealso{ -Other sir functions: \code{\link{plot.sirspline}}, - \code{\link{sir_exp}}, \code{\link{sir_ratio}}, - \code{\link{sirspline}}, \code{\link{sir}} -} -\author{ -Matti Rantanen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/S3_definitions.R +\name{lines.sirspline} +\alias{lines.sirspline} +\title{lines method for sirspline-object} +\usage{ +\method{lines}{sirspline}(x, conf.int = TRUE, print.levels = NA, + select.spline, ...) +} +\arguments{ +\item{x}{an object returned by function sirspline} + +\item{conf.int}{logical; default TRUE draws also the 95 confidence intervals} + +\item{print.levels}{name(s) to be plotted. Default plots all levels.} + +\item{select.spline}{select which spline variable (a number or a name) is plotted.} + +\item{...}{arguments passed on to lines()} +} +\description{ +Plot SIR spline lines with R base graphics +} +\details{ +In \code{lines.sirspline} most of graphical parameters is user +adjustable. +Desired spline variable can be selected with \code{select.spline} and only one +can be plotted at a time. The spline variable can include +several levels, e.g. gender (these are the levels of \code{print} +from \code{sirspline}). All levels are printed by default, but a +specific level can be selected using argument +\code{print.levels}. Printing the levels separately enables e.g. to +give different colours for each level. +} +\seealso{ +Other sir functions: \code{\link{plot.sirspline}}, + \code{\link{sir_exp}}, \code{\link{sir_ratio}}, + \code{\link{sirspline}}, \code{\link{sir}} +} +\author{ +Matti Rantanen +} diff --git a/man/lines.survmean.Rd b/man/lines.survmean.Rd index 0d668ff..4da1311 100644 --- a/man/lines.survmean.Rd +++ b/man/lines.survmean.Rd @@ -1,35 +1,35 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/S3_definitions.R -\name{lines.survmean} -\alias{lines.survmean} -\title{Graphically Inspect Curves Used in Mean Survival Computation} -\usage{ -\method{lines}{survmean}(x, ...) -} -\arguments{ -\item{x}{a \code{survmean} object} - -\item{...}{arguments passed (ultimately) to \code{matlines}; you -may, therefore, supply e.g. \code{lwd} through this, though arguments -such as \code{lty} and \code{col} will not work} -} -\description{ -Plots the observed (with extrapolation) and expected survival -curves for all strata in an object created by \code{\link{survmean}} -} -\details{ -This function is intended to be a workhorse for \code{\link{plot.survmean}}. -If you want finer control over the plotted curves, extract the curves from -the \code{survmean} output using - -\code{attr(x, "curves")} - -where \code{x} is a \code{survmean} object. -} -\seealso{ -Other survmean functions: \code{\link{plot.survmean}}, - \code{\link{survmean}} -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/S3_definitions.R +\name{lines.survmean} +\alias{lines.survmean} +\title{Graphically Inspect Curves Used in Mean Survival Computation} +\usage{ +\method{lines}{survmean}(x, ...) +} +\arguments{ +\item{x}{a \code{survmean} object} + +\item{...}{arguments passed (ultimately) to \code{matlines}; you +may, therefore, supply e.g. \code{lwd} through this, though arguments +such as \code{lty} and \code{col} will not work} +} +\description{ +Plots the observed (with extrapolation) and expected survival +curves for all strata in an object created by \code{\link{survmean}} +} +\details{ +This function is intended to be a workhorse for \code{\link{plot.survmean}}. +If you want finer control over the plotted curves, extract the curves from +the \code{survmean} output using + +\code{attr(x, "curves")} + +where \code{x} is a \code{survmean} object. +} +\seealso{ +Other survmean functions: \code{\link{plot.survmean}}, + \code{\link{survmean}} +} +\author{ +Joonas Miettinen +} diff --git a/man/lines.survtab.Rd b/man/lines.survtab.Rd index bc30475..e060160 100644 --- a/man/lines.survtab.Rd +++ b/man/lines.survtab.Rd @@ -1,60 +1,60 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/S3_definitions.R -\name{lines.survtab} -\alias{lines.survtab} -\title{\code{lines} method for survtab objects} -\usage{ -\method{lines}{survtab}(x, y = NULL, subset = NULL, conf.int = TRUE, - col = NULL, lty = NULL, ...) -} -\arguments{ -\item{x}{a \code{survtab} output object} - -\item{y}{a variable to plot; a quoted name of a variable -in \code{x}; e.g. \code{y = "surv.obs"}; -if \code{NULL}, picks last survival variable column in order in \code{x}} - -\item{subset}{a logical condition; \code{obj} is subset accordingly -before plotting; use this for limiting to specific strata, -e.g. \code{subset = sex == "male"}} - -\item{conf.int}{logical; if \code{TRUE}, also plots any confidence intervals -present in \code{obj} for variables in \code{y}} - -\item{col}{line colour passed to \code{matlines}} - -\item{lty}{line type passed to \code{matlines}} - -\item{...}{additional arguments passed on to to a \code{matlines} call; -e.g. \code{lwd} can be defined this way} -} -\description{ -Plot \code{lines} from a \code{survtab} object -} -\examples{ -data(sire) -data(sibr) -si <- rbind(sire, sibr) -si$period <- cut(si$dg_date, as.Date(c("1993-01-01", "2004-01-01", "2013-01-01")), right = FALSE) -si$cancer <- c(rep("rectal", nrow(sire)), rep("breast", nrow(sibr))) -x <- lexpand(si, birth = bi_date, entry = dg_date, exit = ex_date, - status = status \%in\% 1:2, - fot = 0:5, aggre = list(cancer, period, fot)) -st <- survtab_ag(fot ~ cancer + period, data = x, - surv.method = "lifetable", surv.type = "surv.obs") - -plot(st, "surv.obs", subset = cancer == "breast", ylim = c(0.5, 1), col = "blue") -lines(st, "surv.obs", subset = cancer == "rectal", col = "red") - -## or -plot(st, "surv.obs", col = c(2,2,4,4), lty = c(1, 2, 1, 2)) -} -\seealso{ -Other survtab functions: \code{\link{plot.survtab}}, - \code{\link{print.survtab}}, - \code{\link{summary.survtab}}, \code{\link{survtab_ag}}, - \code{\link{survtab}} -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/S3_definitions.R +\name{lines.survtab} +\alias{lines.survtab} +\title{\code{lines} method for survtab objects} +\usage{ +\method{lines}{survtab}(x, y = NULL, subset = NULL, conf.int = TRUE, + col = NULL, lty = NULL, ...) +} +\arguments{ +\item{x}{a \code{survtab} output object} + +\item{y}{a variable to plot; a quoted name of a variable +in \code{x}; e.g. \code{y = "surv.obs"}; +if \code{NULL}, picks last survival variable column in order in \code{x}} + +\item{subset}{a logical condition; \code{obj} is subset accordingly +before plotting; use this for limiting to specific strata, +e.g. \code{subset = sex == "male"}} + +\item{conf.int}{logical; if \code{TRUE}, also plots any confidence intervals +present in \code{obj} for variables in \code{y}} + +\item{col}{line colour passed to \code{matlines}} + +\item{lty}{line type passed to \code{matlines}} + +\item{...}{additional arguments passed on to to a \code{matlines} call; +e.g. \code{lwd} can be defined this way} +} +\description{ +Plot \code{lines} from a \code{survtab} object +} +\examples{ +data(sire) +data(sibr) +si <- rbind(sire, sibr) +si$period <- cut(si$dg_date, as.Date(c("1993-01-01", "2004-01-01", "2013-01-01")), right = FALSE) +si$cancer <- c(rep("rectal", nrow(sire)), rep("breast", nrow(sibr))) +x <- lexpand(si, birth = bi_date, entry = dg_date, exit = ex_date, + status = status \%in\% 1:2, + fot = 0:5, aggre = list(cancer, period, fot)) +st <- survtab_ag(fot ~ cancer + period, data = x, + surv.method = "lifetable", surv.type = "surv.obs") + +plot(st, "surv.obs", subset = cancer == "breast", ylim = c(0.5, 1), col = "blue") +lines(st, "surv.obs", subset = cancer == "rectal", col = "red") + +## or +plot(st, "surv.obs", col = c(2,2,4,4), lty = c(1, 2, 1, 2)) +} +\seealso{ +Other survtab functions: \code{\link{plot.survtab}}, + \code{\link{print.survtab}}, + \code{\link{summary.survtab}}, \code{\link{survtab_ag}}, + \code{\link{survtab}} +} +\author{ +Joonas Miettinen +} diff --git a/man/longDF2ratetable.Rd b/man/longDF2ratetable.Rd index 67b83ec..a0fb276 100644 --- a/man/longDF2ratetable.Rd +++ b/man/longDF2ratetable.Rd @@ -1,30 +1,30 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utility_functions.R -\name{longDF2ratetable} -\alias{longDF2ratetable} -\title{\strong{Experimental}: Coerce a long-format \code{data.frame} to a \code{ratetable} object} -\usage{ -longDF2ratetable(DF, value.var = "haz", by.vars = setdiff(names(DF), - value.var)) -} -\arguments{ -\item{DF}{a \code{data.frame}} - -\item{value.var}{name of values variable in quotes} - -\item{by.vars}{names vector of variables by which to create (array) dimensions} -} -\description{ -Coerces a long-format \code{data.frame} of population hazards -to an array, and in turn to a \code{\link[survival]{ratetable}}, -which can be used in e.g. \pkg{survival}'s expected survival computations -and \pkg{relsurv}'s relative survival computations. -} -\seealso{ -\code{\link[survival]{ratetable}}, -\code{\link{as.data.table.ratetable}}, -\code{\link{as.data.frame.ratetable}} -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utility_functions.R +\name{longDF2ratetable} +\alias{longDF2ratetable} +\title{\strong{Experimental}: Coerce a long-format \code{data.frame} to a \code{ratetable} object} +\usage{ +longDF2ratetable(DF, value.var = "haz", by.vars = setdiff(names(DF), + value.var)) +} +\arguments{ +\item{DF}{a \code{data.frame}} + +\item{value.var}{name of values variable in quotes} + +\item{by.vars}{names vector of variables by which to create (array) dimensions} +} +\description{ +Coerces a long-format \code{data.frame} of population hazards +to an array, and in turn to a \code{\link[survival]{ratetable}}, +which can be used in e.g. \pkg{survival}'s expected survival computations +and \pkg{relsurv}'s relative survival computations. +} +\seealso{ +\code{\link[survival]{ratetable}}, +\code{\link{as.data.table.ratetable}}, +\code{\link{as.data.frame.ratetable}} +} +\author{ +Joonas Miettinen +} diff --git a/man/lower_bound.Rd b/man/lower_bound.Rd index a499c88..c08cb92 100644 --- a/man/lower_bound.Rd +++ b/man/lower_bound.Rd @@ -1,18 +1,18 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utility_functions.R -\name{lower_bound} -\alias{lower_bound} -\title{Return lower_bound value from char string (20,30]} -\usage{ -lower_bound(cut) -} -\arguments{ -\item{cut}{is a character vector of elements "(20,60]"} -} -\description{ -selects lowest values of each factor after cut() based -on that the value starts from index 2 and end in comma ",". -} -\author{ -Matti Rantanen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utility_functions.R +\name{lower_bound} +\alias{lower_bound} +\title{Return lower_bound value from char string (20,30]} +\usage{ +lower_bound(cut) +} +\arguments{ +\item{cut}{is a character vector of elements "(20,60]"} +} +\description{ +selects lowest values of each factor after cut() based +on that the value starts from index 2 and end in comma ",". +} +\author{ +Matti Rantanen +} diff --git a/man/ltable.Rd b/man/ltable.Rd index f8de41c..c31eb46 100644 --- a/man/ltable.Rd +++ b/man/ltable.Rd @@ -1,145 +1,145 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ltable.R -\name{ltable} -\alias{ltable} -\alias{expr.by.cj} -\title{Tabulate Counts and Other Functions by Multiple Variables into a -Long-Format Table} -\usage{ -ltable(data, by.vars = NULL, expr = list(obs = .N), subset = NULL, - use.levels = TRUE, na.rm = FALSE, robust = TRUE) - -expr.by.cj(data, by.vars = NULL, expr = list(obs = .N), subset = NULL, - use.levels = FALSE, na.rm = FALSE, robust = FALSE, .SDcols = NULL, - enclos = parent.frame(1L), ...) -} -\arguments{ -\item{data}{a \code{data.table}/\code{data.frame}} - -\item{by.vars}{names of variables that are used for categorization, -as a character vector, e.g. \code{c('sex','agegroup')}} - -\item{expr}{object or a list of objects where each object is a function -of a variable (see: details)} - -\item{subset}{a logical condition; data is limited accordingly before -evaluating \code{expr} - but the result of \code{expr} is also -returned as \code{NA} for levels not existing in the subset. See Examples.} - -\item{use.levels}{logical; if \code{TRUE}, uses factor levels of given -variables if present; if you want e.g. counts for levels -that actually have zero observations but are levels in a factor variable, -use this} - -\item{na.rm}{logical; if \code{TRUE}, drops rows in table that have -\code{NA} as values in any of \code{by.vars} columns} - -\item{robust}{logical; if \code{TRUE}, runs the output data's -\code{by.vars} columns through \code{robust_values} before outputting} - -\item{.SDcols}{advanced; a character vector of column names -passed to inside the data.table's brackets -\code{DT[, , ...]}; see \code{\link{data.table}}; if \code{NULL}, -uses all appropriate columns. See Examples for usage.} - -\item{enclos}{advanced; an environment; the enclosing -environment of the data.} - -\item{...}{advanced; other arguments passed to inside the -data.table's brackets \code{DT[, , ...]}; see \code{\link{data.table}}} -} -\description{ -\code{ltable} makes use of \code{data.table} -capabilities to tabulate frequencies or -arbitrary functions of given variables into a long format -\code{data.table}/\code{data.frame}. \code{expr.by.cj} is the -equivalent for more advanced users. -} -\details{ -Returns \code{expr} for each unique combination of given \code{by.vars}. - -By default makes use of any and all \code{\link{levels}} present for -each variable in \code{by.vars}. This is useful, -because even if a subset of the data does not contain observations -for e.g. a specific age group, those age groups are -nevertheless presented in the resulting table; e.g. with the default -\code{expr = list(obs = .N)} all age group levels -are represented by a row and can have \code{obs = 0}. - -The function differs from the -vanilla \code{\link{table}} by giving a long format table of values -regardless of the number of \code{by.vars} given. -Make use of e.g. \code{\link{cast_simple}} if data needs to be -presented in a wide format (e.g. a two-way table). - -The rows of the long-format table are effectively Cartesian products -of the levels of each variable in \code{by.vars}, -e.g. with \code{by.vars = c("sex", "area")} all levels of -\code{area} are repeated for both levels of \code{sex} -in the table. - -The \code{expr} allows the user to apply any function(s) on all -levels defined by \code{by.vars}. Here are some examples: -\itemize{ - \item .N or list(.N) is a function used inside a \code{data.table} to - calculate counts in each group - \item list(obs = .N), same as above but user assigned variable name - \item list(sum(obs), sum(pyrs), mean(dg_age)), multiple objects in a list - \item list(obs = sum(obs), pyrs = sum(pyrs)), same as above with user - defined variable names -} - -If \code{use.levels = FALSE}, no \code{levels} information will - be used. This means that if e.g. the \code{agegroup} -variable is a factor and has 18 levels defined, but only 15 levels - are present in the data, no rows for the missing -levels will be shown in the table. - -\code{na.rm} simply drops any rows from the resulting table where -any of the \code{by.vars} values was \code{NA}. -} -\section{Functions}{ -\itemize{ -\item \code{expr.by.cj}: Somewhat more streamlined \code{ltable} with -defaults for speed. Explicit determination of enclosing environment -of data. -}} - -\examples{ -data("sire", package = "popEpi") -sr <- sire -sr$agegroup <- cut(sr$dg_age, breaks=c(0,45,60,75,85,Inf)) -## counts by default -ltable(sr, "agegroup") - -## any expression can be given -ltable(sr, "agegroup", list(mage = mean(dg_age))) -ltable(sr, "agegroup", list(mage = mean(dg_age), vage = var(dg_age))) - -## also returns levels where there are zero rows (expressions as NA) -ltable(sr, "agegroup", list(obs = .N, - minage = min(dg_age), - maxage = max(dg_age)), - subset = dg_age < 85) - -#### expr.by.cj -expr.by.cj(sr, "agegroup") - -## any arbitrary expression can be given -expr.by.cj(sr, "agegroup", list(mage = mean(dg_age))) -expr.by.cj(sr, "agegroup", list(mage = mean(dg_age), vage = var(dg_age))) - -## only uses levels of by.vars present in data -expr.by.cj(sr, "agegroup", list(mage = mean(dg_age), vage = var(dg_age)), - subset = dg_age < 70) - -## .SDcols trick -expr.by.cj(sr, "agegroup", lapply(.SD, mean), - subset = dg_age < 70, .SDcols = c("dg_age", "status")) -} -\seealso{ -\code{\link{table}}, \code{\link{cast_simple}}, \code{\link{melt}} -} -\author{ -Joonas Miettinen, Matti Rantanen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ltable.R +\name{ltable} +\alias{ltable} +\alias{expr.by.cj} +\title{Tabulate Counts and Other Functions by Multiple Variables into a +Long-Format Table} +\usage{ +ltable(data, by.vars = NULL, expr = list(obs = .N), subset = NULL, + use.levels = TRUE, na.rm = FALSE, robust = TRUE) + +expr.by.cj(data, by.vars = NULL, expr = list(obs = .N), subset = NULL, + use.levels = FALSE, na.rm = FALSE, robust = FALSE, .SDcols = NULL, + enclos = parent.frame(1L), ...) +} +\arguments{ +\item{data}{a \code{data.table}/\code{data.frame}} + +\item{by.vars}{names of variables that are used for categorization, +as a character vector, e.g. \code{c('sex','agegroup')}} + +\item{expr}{object or a list of objects where each object is a function +of a variable (see: details)} + +\item{subset}{a logical condition; data is limited accordingly before +evaluating \code{expr} - but the result of \code{expr} is also +returned as \code{NA} for levels not existing in the subset. See Examples.} + +\item{use.levels}{logical; if \code{TRUE}, uses factor levels of given +variables if present; if you want e.g. counts for levels +that actually have zero observations but are levels in a factor variable, +use this} + +\item{na.rm}{logical; if \code{TRUE}, drops rows in table that have +\code{NA} as values in any of \code{by.vars} columns} + +\item{robust}{logical; if \code{TRUE}, runs the output data's +\code{by.vars} columns through \code{robust_values} before outputting} + +\item{.SDcols}{advanced; a character vector of column names +passed to inside the data.table's brackets +\code{DT[, , ...]}; see \code{\link{data.table}}; if \code{NULL}, +uses all appropriate columns. See Examples for usage.} + +\item{enclos}{advanced; an environment; the enclosing +environment of the data.} + +\item{...}{advanced; other arguments passed to inside the +data.table's brackets \code{DT[, , ...]}; see \code{\link{data.table}}} +} +\description{ +\code{ltable} makes use of \code{data.table} +capabilities to tabulate frequencies or +arbitrary functions of given variables into a long format +\code{data.table}/\code{data.frame}. \code{expr.by.cj} is the +equivalent for more advanced users. +} +\details{ +Returns \code{expr} for each unique combination of given \code{by.vars}. + +By default makes use of any and all \code{\link{levels}} present for +each variable in \code{by.vars}. This is useful, +because even if a subset of the data does not contain observations +for e.g. a specific age group, those age groups are +nevertheless presented in the resulting table; e.g. with the default +\code{expr = list(obs = .N)} all age group levels +are represented by a row and can have \code{obs = 0}. + +The function differs from the +vanilla \code{\link{table}} by giving a long format table of values +regardless of the number of \code{by.vars} given. +Make use of e.g. \code{\link{cast_simple}} if data needs to be +presented in a wide format (e.g. a two-way table). + +The rows of the long-format table are effectively Cartesian products +of the levels of each variable in \code{by.vars}, +e.g. with \code{by.vars = c("sex", "area")} all levels of +\code{area} are repeated for both levels of \code{sex} +in the table. + +The \code{expr} allows the user to apply any function(s) on all +levels defined by \code{by.vars}. Here are some examples: +\itemize{ + \item .N or list(.N) is a function used inside a \code{data.table} to + calculate counts in each group + \item list(obs = .N), same as above but user assigned variable name + \item list(sum(obs), sum(pyrs), mean(dg_age)), multiple objects in a list + \item list(obs = sum(obs), pyrs = sum(pyrs)), same as above with user + defined variable names +} + +If \code{use.levels = FALSE}, no \code{levels} information will + be used. This means that if e.g. the \code{agegroup} +variable is a factor and has 18 levels defined, but only 15 levels + are present in the data, no rows for the missing +levels will be shown in the table. + +\code{na.rm} simply drops any rows from the resulting table where +any of the \code{by.vars} values was \code{NA}. +} +\section{Functions}{ +\itemize{ +\item \code{expr.by.cj}: Somewhat more streamlined \code{ltable} with +defaults for speed. Explicit determination of enclosing environment +of data. +}} + +\examples{ +data("sire", package = "popEpi") +sr <- sire +sr$agegroup <- cut(sr$dg_age, breaks=c(0,45,60,75,85,Inf)) +## counts by default +ltable(sr, "agegroup") + +## any expression can be given +ltable(sr, "agegroup", list(mage = mean(dg_age))) +ltable(sr, "agegroup", list(mage = mean(dg_age), vage = var(dg_age))) + +## also returns levels where there are zero rows (expressions as NA) +ltable(sr, "agegroup", list(obs = .N, + minage = min(dg_age), + maxage = max(dg_age)), + subset = dg_age < 85) + +#### expr.by.cj +expr.by.cj(sr, "agegroup") + +## any arbitrary expression can be given +expr.by.cj(sr, "agegroup", list(mage = mean(dg_age))) +expr.by.cj(sr, "agegroup", list(mage = mean(dg_age), vage = var(dg_age))) + +## only uses levels of by.vars present in data +expr.by.cj(sr, "agegroup", list(mage = mean(dg_age), vage = var(dg_age)), + subset = dg_age < 70) + +## .SDcols trick +expr.by.cj(sr, "agegroup", lapply(.SD, mean), + subset = dg_age < 70, .SDcols = c("dg_age", "status")) +} +\seealso{ +\code{\link{table}}, \code{\link{cast_simple}}, \code{\link{melt}} +} +\author{ +Joonas Miettinen, Matti Rantanen +} diff --git a/man/makeWeightsDT.Rd b/man/makeWeightsDT.Rd index 8a2e890..dcbaf0b 100644 --- a/man/makeWeightsDT.Rd +++ b/man/makeWeightsDT.Rd @@ -1,153 +1,153 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/weighted_table.R -\name{makeWeightsDT} -\alias{makeWeightsDT} -\title{Make a \code{data.table} of Tabulated, Aggregated Values and Weights} -\usage{ -makeWeightsDT(data, values = NULL, print = NULL, adjust = NULL, - formula = NULL, Surv.response = TRUE, by.other = NULL, - custom.levels = NULL, custom.levels.cut.low = NULL, weights = NULL, - internal.weights.values = NULL, enclos = NULL, NA.text = NULL) -} -\arguments{ -\item{data}{DF/DT; passed to \code{envir} in \code{eval}} - -\item{values}{values to tabulate. Anything \code{evalPopArg} can evaluate.} - -\item{print}{variables to tabulate by and include in \code{prVars} in attributes} - -\item{adjust}{variables to tabulate by and include in \code{adVars} in attributes} - -\item{formula}{a formula such as \code{fot ~ sex} or \code{Surv(fot, lex.Xst) ~ sex}} - -\item{Surv.response}{logical, if \code{TRUE} throws error if response in -\code{formula} is not a \code{Surv} object and vice versa} - -\item{by.other}{other variables to tabulate by and include -in \code{boVars} in attributes} - -\item{custom.levels}{a named list of values. When "inflating" the data -in the cross-join / cartesian join sense (try e.g. \code{merge(1:5, 1:2)}), -one can supply the levels to inflate by using this to ensure inflation is full. -E.g. data might only have levels present to do inflation analogous to -\code{merge(2:5, 1:2)} although \code{merge(1:5, 1:2)} is intended and -needed.} - -\item{custom.levels.cut.low}{a character string vector of variable names. -These variables mentioned in \code{custom.levels} and existing in data -or first modified (in data) using \code{cutLow()} (essentially -\code{cut()} with \code{right = FALSE} and returning the lower bounds -as values). Handy for aggregating data e.g. to survival intervals. -\strong{NOTE}: the appropriate elements in \code{custom.levels} for these -variables must exceptionally contain an extra value as the roof used in -cutting, which will not be used in "inflating" the table using a merge. -See Examples.} - -\item{weights}{a named list or long-form data.frame of weights. See Examples.} - -\item{internal.weights.values}{the variable to use to compute internal -weights; only used if \code{weights = "internal"}.} - -\item{enclos}{the enclosing environment passed on to \code{eval}. Variables -not found in \code{data} or searched for here.} - -\item{NA.text}{a character string to display in a \code{warning} -if there are any rows with missing \code{values} or \code{adjust} values. -\strong{special:} key phrase \code{\%\%NA_COUNT\%\%} in text is replaced -with the count of missing observations. -E.g. \code{"Missing \%\%NA_COUNTS\%\% observations due to derpness."}} -} -\description{ -An internal function that aggregates a table -and merges in weights. -} -\examples{ -library(survival) -library(data.table) - -makeWeightsDT <- popEpi:::makeWeightsDT ## this avoids errors during tests - -sire <- copy(popEpi::sire) -set.seed(1L) -sire$sex <- rbinom(nrow(sire), 1, 0.5) -ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", - status = status \%in\% 1:2, pophaz = popmort, pp = FALSE, - aggre = list(sex, agegr = cut(dg_age, c(0,50,75,Inf)), fot), - fot = seq(0, 5, 1/12)) -ps <- quote(list(sex, fot)) -as <- quote(list(agegr)) -vs <- list(quote(list(pyrs, at.risk))) -ws <- list(agegr = c(0.2,0.4,0.4)) - -#### custom.levels usage -fb <- seq(0, 5-1/12, 1/12) ## exclude 5 as no row has that value -ag2 <- ag[fot > 0.5,] -# repeats fot intervals < 0.5 as empty rows -# may be the safest way to do this -dt <- makeWeightsDT(ag2, print = ps, adjust = as, - values = vs, weights = ws, - custom.levels = list(fot = fb)) -## aggregate from intervals seq(0, 5, 1/12) to 0:5 -fb2 <- 0:5 ## (this time we include 5 as the roof) -dt <- makeWeightsDT(ag2, print = ps, adjust = as, - values = vs, weights = ws, - custom.levels = list(fot = fb2), - custom.levels.cut.low = "fot") - - -#### use of enclos -TF <- environment() -gender <- factor(ag$sex) -dt <- makeWeightsDT(ag, print = quote(gender), adjust = as, - values = vs, weights = ws, enclos = TF) -## or NULL: uses calling frame by default. -dt <- makeWeightsDT(ag, print = quote(gender), adjust = as, - values = vs, weights = ws, - enclos = NULL) -## passing parent.fram(1) is the same thing (as below), -## but won't pass in testing these examples somehow (but work in real life) -# dt <- makeWeightsDT(ag, print = quote(gender), adjust = as, -# values = vs, weights = ws, -# enclos = NULL) - -#### formula usage -form <- Surv(fot, factor(from0to1))~gender -dt <- makeWeightsDT(ag, formula = form, Surv.response = TRUE, - adjust = as, values = vs, weights = ws, - enclos = NULL) - -## or -form <- Surv(fot, factor(from0to1))~gender + adjust(agegr) -dt <- makeWeightsDT(ag, formula = form, Surv.response = TRUE, - adjust = NULL, values = vs, weights = ws, - enclos = NULL) - -## or -form <- from0to1 ~ fot + gender + adjust(agegr) -dt <- makeWeightsDT(ag, formula = form, Surv.response = FALSE, - adjust = NULL, values = vs, weights = ws, - enclos = NULL) - -form <- from0to1 ~ fot + adjust(agegr) + adjust(sex) -ws2 <- list(agegr = c(0.33, 0.33, 0.33), sex = c(0.5, 0.5)) -dt <- makeWeightsDT(ag, formula = form, Surv.response = FALSE, - adjust = NULL, values = vs, weights = ws2, - enclos = NULL) - -## international standard pops -ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", - status = status \%in\% 1:2, pophaz = popmort, pp = FALSE, - aggre = list(sex, agegr = cut(dg_age, c(seq(0, 85, 5), Inf)), fot), - fot = seq(0, 5, 1/12)) - -form <- from0to1 ~ fot + adjust(agegr) -dt <- makeWeightsDT(ag, formula = form, Surv.response = FALSE, - adjust = NULL, values = vs, weights = "world_1966_18of5", - enclos = NULL) - -form <- from0to1 ~ fot + adjust(agegr, sex) -dt <- makeWeightsDT(ag, formula = form, Surv.response = FALSE, - adjust = NULL, values = vs, - weights = list(agegr = "nordic_2000_18of5", sex=c(1,1)), - enclos = NULL) -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/weighted_table.R +\name{makeWeightsDT} +\alias{makeWeightsDT} +\title{Make a \code{data.table} of Tabulated, Aggregated Values and Weights} +\usage{ +makeWeightsDT(data, values = NULL, print = NULL, adjust = NULL, + formula = NULL, Surv.response = TRUE, by.other = NULL, + custom.levels = NULL, custom.levels.cut.low = NULL, weights = NULL, + internal.weights.values = NULL, enclos = NULL, NA.text = NULL) +} +\arguments{ +\item{data}{DF/DT; passed to \code{envir} in \code{eval}} + +\item{values}{values to tabulate. Anything \code{evalPopArg} can evaluate.} + +\item{print}{variables to tabulate by and include in \code{prVars} in attributes} + +\item{adjust}{variables to tabulate by and include in \code{adVars} in attributes} + +\item{formula}{a formula such as \code{fot ~ sex} or \code{Surv(fot, lex.Xst) ~ sex}} + +\item{Surv.response}{logical, if \code{TRUE} throws error if response in +\code{formula} is not a \code{Surv} object and vice versa} + +\item{by.other}{other variables to tabulate by and include +in \code{boVars} in attributes} + +\item{custom.levels}{a named list of values. When "inflating" the data +in the cross-join / cartesian join sense (try e.g. \code{merge(1:5, 1:2)}), +one can supply the levels to inflate by using this to ensure inflation is full. +E.g. data might only have levels present to do inflation analogous to +\code{merge(2:5, 1:2)} although \code{merge(1:5, 1:2)} is intended and +needed.} + +\item{custom.levels.cut.low}{a character string vector of variable names. +These variables mentioned in \code{custom.levels} and existing in data +or first modified (in data) using \code{cutLow()} (essentially +\code{cut()} with \code{right = FALSE} and returning the lower bounds +as values). Handy for aggregating data e.g. to survival intervals. +\strong{NOTE}: the appropriate elements in \code{custom.levels} for these +variables must exceptionally contain an extra value as the roof used in +cutting, which will not be used in "inflating" the table using a merge. +See Examples.} + +\item{weights}{a named list or long-form data.frame of weights. See Examples.} + +\item{internal.weights.values}{the variable to use to compute internal +weights; only used if \code{weights = "internal"}.} + +\item{enclos}{the enclosing environment passed on to \code{eval}. Variables +not found in \code{data} or searched for here.} + +\item{NA.text}{a character string to display in a \code{warning} +if there are any rows with missing \code{values} or \code{adjust} values. +\strong{special:} key phrase \code{\%\%NA_COUNT\%\%} in text is replaced +with the count of missing observations. +E.g. \code{"Missing \%\%NA_COUNTS\%\% observations due to derpness."}} +} +\description{ +An internal function that aggregates a table +and merges in weights. +} +\examples{ +library(survival) +library(data.table) + +makeWeightsDT <- popEpi:::makeWeightsDT ## this avoids errors during tests + +sire <- copy(popEpi::sire) +set.seed(1L) +sire$sex <- rbinom(nrow(sire), 1, 0.5) +ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", + status = status \%in\% 1:2, pophaz = popmort, pp = FALSE, + aggre = list(sex, agegr = cut(dg_age, c(0,50,75,Inf)), fot), + fot = seq(0, 5, 1/12)) +ps <- quote(list(sex, fot)) +as <- quote(list(agegr)) +vs <- list(quote(list(pyrs, at.risk))) +ws <- list(agegr = c(0.2,0.4,0.4)) + +#### custom.levels usage +fb <- seq(0, 5-1/12, 1/12) ## exclude 5 as no row has that value +ag2 <- ag[fot > 0.5,] +# repeats fot intervals < 0.5 as empty rows +# may be the safest way to do this +dt <- makeWeightsDT(ag2, print = ps, adjust = as, + values = vs, weights = ws, + custom.levels = list(fot = fb)) +## aggregate from intervals seq(0, 5, 1/12) to 0:5 +fb2 <- 0:5 ## (this time we include 5 as the roof) +dt <- makeWeightsDT(ag2, print = ps, adjust = as, + values = vs, weights = ws, + custom.levels = list(fot = fb2), + custom.levels.cut.low = "fot") + + +#### use of enclos +TF <- environment() +gender <- factor(ag$sex) +dt <- makeWeightsDT(ag, print = quote(gender), adjust = as, + values = vs, weights = ws, enclos = TF) +## or NULL: uses calling frame by default. +dt <- makeWeightsDT(ag, print = quote(gender), adjust = as, + values = vs, weights = ws, + enclos = NULL) +## passing parent.fram(1) is the same thing (as below), +## but won't pass in testing these examples somehow (but work in real life) +# dt <- makeWeightsDT(ag, print = quote(gender), adjust = as, +# values = vs, weights = ws, +# enclos = NULL) + +#### formula usage +form <- Surv(fot, factor(from0to1))~gender +dt <- makeWeightsDT(ag, formula = form, Surv.response = TRUE, + adjust = as, values = vs, weights = ws, + enclos = NULL) + +## or +form <- Surv(fot, factor(from0to1))~gender + adjust(agegr) +dt <- makeWeightsDT(ag, formula = form, Surv.response = TRUE, + adjust = NULL, values = vs, weights = ws, + enclos = NULL) + +## or +form <- from0to1 ~ fot + gender + adjust(agegr) +dt <- makeWeightsDT(ag, formula = form, Surv.response = FALSE, + adjust = NULL, values = vs, weights = ws, + enclos = NULL) + +form <- from0to1 ~ fot + adjust(agegr) + adjust(sex) +ws2 <- list(agegr = c(0.33, 0.33, 0.33), sex = c(0.5, 0.5)) +dt <- makeWeightsDT(ag, formula = form, Surv.response = FALSE, + adjust = NULL, values = vs, weights = ws2, + enclos = NULL) + +## international standard pops +ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", + status = status \%in\% 1:2, pophaz = popmort, pp = FALSE, + aggre = list(sex, agegr = cut(dg_age, c(seq(0, 85, 5), Inf)), fot), + fot = seq(0, 5, 1/12)) + +form <- from0to1 ~ fot + adjust(agegr) +dt <- makeWeightsDT(ag, formula = form, Surv.response = FALSE, + adjust = NULL, values = vs, weights = "world_1966_18of5", + enclos = NULL) + +form <- from0to1 ~ fot + adjust(agegr, sex) +dt <- makeWeightsDT(ag, formula = form, Surv.response = FALSE, + adjust = NULL, values = vs, + weights = list(agegr = "nordic_2000_18of5", sex=c(1,1)), + enclos = NULL) +} diff --git a/man/meanpop_fi.Rd b/man/meanpop_fi.Rd index d3a41ba..16ad4eb 100644 --- a/man/meanpop_fi.Rd +++ b/man/meanpop_fi.Rd @@ -1,29 +1,29 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data_document.R -\name{meanpop_fi} -\alias{meanpop_fi} -\title{Mean population counts in Finland year, sex, and age group.} -\format{\code{data.table} with columns -\itemize{ - \item \code{sex} gender coded as male, female (0, 1) - \item \code{year} calendar year 1981-2016 - \item \code{agegroup} - coded 0 to 100; one-year age groups - \item \code{meanpop} the mean population count; that is, the mean of the - annual population counts of two consecutive years; e.g. for 1990 - \code{meanpop} is the mean of population counts for 1990 and 1991 - (counted at 1990-01-01 and 1991-01-01, respectively) -}} -\source{ -Statistics Finland -} -\description{ -Mean population counts in Finland year, sex, and age group. -} -\seealso{ -\href{http://pxnet2.stat.fi/PXWeb/pxweb/fi/StatFin/StatFin__vrm__vaerak/071_vaerak_tau_109.px/?rxid=81efcb98-00c6-46ba-9f8f-8bc6f110895f}{Table on the web} - -Other popEpi data: \code{\link{ICSS}}, - \code{\link{popmort}}, \code{\link{sibr}}, - \code{\link{sire}}, \code{\link{stdpop101}}, - \code{\link{stdpop18}} -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_document.R +\name{meanpop_fi} +\alias{meanpop_fi} +\title{Mean population counts in Finland year, sex, and age group.} +\format{\code{data.table} with columns +\itemize{ + \item \code{sex} gender coded as male, female (0, 1) + \item \code{year} calendar year 1981-2016 + \item \code{agegroup} - coded 0 to 100; one-year age groups + \item \code{meanpop} the mean population count; that is, the mean of the + annual population counts of two consecutive years; e.g. for 1990 + \code{meanpop} is the mean of population counts for 1990 and 1991 + (counted at 1990-01-01 and 1991-01-01, respectively) +}} +\source{ +Statistics Finland +} +\description{ +Mean population counts in Finland year, sex, and age group. +} +\seealso{ +\href{http://pxnet2.stat.fi/PXWeb/pxweb/fi/StatFin/StatFin__vrm__vaerak/071_vaerak_tau_109.px/?rxid=81efcb98-00c6-46ba-9f8f-8bc6f110895f}{Table on the web} + +Other popEpi data: \code{\link{ICSS}}, + \code{\link{popmort}}, \code{\link{sibr}}, + \code{\link{sire}}, \code{\link{stdpop101}}, + \code{\link{stdpop18}} +} diff --git a/man/na2zero.Rd b/man/na2zero.Rd index 58b81f8..1a8f8f5 100644 --- a/man/na2zero.Rd +++ b/man/na2zero.Rd @@ -1,27 +1,27 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utility_functions.R -\name{na2zero} -\alias{na2zero} -\title{Convert NA's to zero in data.table} -\usage{ -na2zero(DT, vars = NULL) -} -\arguments{ -\item{DT}{\code{data.table} object} - -\item{vars}{a character string vector of variables names in \code{DT}; -if \code{NULL}, uses all variable names in \code{DT}} -} -\description{ -Given a \code{data.table DT}, replaces any \code{NA} values -in the variables given in \code{vars} in \code{DT}. Takes a copy of the -original data and returns the modified copy. -} -\details{ -Given a \code{data.table} object, converts \code{NA} values -to numeric (double) zeros for all variables named in \code{vars} or -all variables if \code{vars = NULL}. -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utility_functions.R +\name{na2zero} +\alias{na2zero} +\title{Convert NA's to zero in data.table} +\usage{ +na2zero(DT, vars = NULL) +} +\arguments{ +\item{DT}{\code{data.table} object} + +\item{vars}{a character string vector of variables names in \code{DT}; +if \code{NULL}, uses all variable names in \code{DT}} +} +\description{ +Given a \code{data.table DT}, replaces any \code{NA} values +in the variables given in \code{vars} in \code{DT}. Takes a copy of the +original data and returns the modified copy. +} +\details{ +Given a \code{data.table} object, converts \code{NA} values +to numeric (double) zeros for all variables named in \code{vars} or +all variables if \code{vars = NULL}. +} +\author{ +Joonas Miettinen +} diff --git a/man/plot.rate.Rd b/man/plot.rate.Rd index 7a066f1..43ca390 100644 --- a/man/plot.rate.Rd +++ b/man/plot.rate.Rd @@ -1,33 +1,33 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/S3_definitions.R -\name{plot.rate} -\alias{plot.rate} -\title{plot method for rate object} -\usage{ -\method{plot}{rate}(x, conf.int = TRUE, eps = 0.2, left.margin, xlim, ...) -} -\arguments{ -\item{x}{a rate object (see \code{\link{rate}})} - -\item{conf.int}{logical; default TRUE draws the confidence intervals} - -\item{eps}{is the height of the ending of the error bars} - -\item{left.margin}{set a custom left margin for long variable names. Function -tries to do it by default.} - -\item{xlim}{change the x-axis location} - -\item{...}{arguments passed on to graphical functions points and segment -(e.g. \code{col}, \code{lwd}, \code{pch} and \code{cex})} -} -\description{ -Plot rate estimates with confidence intervals lines using R base graphics -} -\details{ -This is limited explanatory tool but most graphical -parameters are user adjustable. -} -\author{ -Matti Rantanen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/S3_definitions.R +\name{plot.rate} +\alias{plot.rate} +\title{plot method for rate object} +\usage{ +\method{plot}{rate}(x, conf.int = TRUE, eps = 0.2, left.margin, xlim, ...) +} +\arguments{ +\item{x}{a rate object (see \code{\link{rate}})} + +\item{conf.int}{logical; default TRUE draws the confidence intervals} + +\item{eps}{is the height of the ending of the error bars} + +\item{left.margin}{set a custom left margin for long variable names. Function +tries to do it by default.} + +\item{xlim}{change the x-axis location} + +\item{...}{arguments passed on to graphical functions points and segment +(e.g. \code{col}, \code{lwd}, \code{pch} and \code{cex})} +} +\description{ +Plot rate estimates with confidence intervals lines using R base graphics +} +\details{ +This is limited explanatory tool but most graphical +parameters are user adjustable. +} +\author{ +Matti Rantanen +} diff --git a/man/plot.sir.Rd b/man/plot.sir.Rd index 1143f26..99c4927 100644 --- a/man/plot.sir.Rd +++ b/man/plot.sir.Rd @@ -1,70 +1,70 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/S3_definitions.R -\name{plot.sir} -\alias{plot.sir} -\title{Plot method for sir-object} -\usage{ -\method{plot}{sir}(x, conf.int = TRUE, ylab, xlab, xlim, main, eps = 0.2, - abline = TRUE, log = FALSE, left.margin, ...) -} -\arguments{ -\item{x}{an object returned by function \code{sir}} - -\item{conf.int}{default TRUE draws confidence intervals} - -\item{ylab}{overwrites default y-axis label} - -\item{xlab}{overwrites default x-axis label} - -\item{xlim}{x-axis minimum and maximum values} - -\item{main}{optional plot title} - -\item{eps}{error bar vertical bar height (works only in 'model' or 'univariate')} - -\item{abline}{logical; draws a grey line in SIR = 1} - -\item{log}{logical; SIR is not in log scale by default} - -\item{left.margin}{adjust left marginal of the plot to fit long variable names} - -\item{...}{arguments passed on to plot(), segment and lines()} -} -\description{ -Plot SIR estimates with error bars -} -\details{ -Plot SIR estimates and confidence intervals -\itemize{ - \item univariate - plots SIR with univariate confidence intervals - \item model - plots SIR with Poisson modelled confidence intervals -} - -\strong{Customize} -Normal plot parameters can be passed to \code{plot}. These can be a vector when plotting error bars: -\itemize{ - \item \code{pch} - point type - \item \code{lty} - line type - \item \code{col} - line/point colour - \item \code{lwd} - point/line size -} - -\strong{Tips for plotting splines} -It's possible to use \code{plot} to first draw the -confidence intervals using specific line type or colour and then plotting -again the estimate using \code{lines(... , conf.int = FALSE)} with different -settings. This works only when \code{plot.type} is 'splines'. -} -\examples{ -\dontrun{ -# Plot SIR estimates -# plot(sir.by.gender, col = c(4,2), log=FALSE, eps=0.2, lty=1, lwd=2, pch=19, -# main = 'SIR by gender', abline=TRUE) -} -} -\seealso{ -\code{\link{sir}}, \code{\link{sirspline}} -} -\author{ -Matti Rantanen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/S3_definitions.R +\name{plot.sir} +\alias{plot.sir} +\title{Plot method for sir-object} +\usage{ +\method{plot}{sir}(x, conf.int = TRUE, ylab, xlab, xlim, main, eps = 0.2, + abline = TRUE, log = FALSE, left.margin, ...) +} +\arguments{ +\item{x}{an object returned by function \code{sir}} + +\item{conf.int}{default TRUE draws confidence intervals} + +\item{ylab}{overwrites default y-axis label} + +\item{xlab}{overwrites default x-axis label} + +\item{xlim}{x-axis minimum and maximum values} + +\item{main}{optional plot title} + +\item{eps}{error bar vertical bar height (works only in 'model' or 'univariate')} + +\item{abline}{logical; draws a grey line in SIR = 1} + +\item{log}{logical; SIR is not in log scale by default} + +\item{left.margin}{adjust left marginal of the plot to fit long variable names} + +\item{...}{arguments passed on to plot(), segment and lines()} +} +\description{ +Plot SIR estimates with error bars +} +\details{ +Plot SIR estimates and confidence intervals +\itemize{ + \item univariate - plots SIR with univariate confidence intervals + \item model - plots SIR with Poisson modelled confidence intervals +} + +\strong{Customize} +Normal plot parameters can be passed to \code{plot}. These can be a vector when plotting error bars: +\itemize{ + \item \code{pch} - point type + \item \code{lty} - line type + \item \code{col} - line/point colour + \item \code{lwd} - point/line size +} + +\strong{Tips for plotting splines} +It's possible to use \code{plot} to first draw the +confidence intervals using specific line type or colour and then plotting +again the estimate using \code{lines(... , conf.int = FALSE)} with different +settings. This works only when \code{plot.type} is 'splines'. +} +\examples{ +\dontrun{ +# Plot SIR estimates +# plot(sir.by.gender, col = c(4,2), log=FALSE, eps=0.2, lty=1, lwd=2, pch=19, +# main = 'SIR by gender', abline=TRUE) +} +} +\seealso{ +\code{\link{sir}}, \code{\link{sirspline}} +} +\author{ +Matti Rantanen +} diff --git a/man/plot.sirspline.Rd b/man/plot.sirspline.Rd index a3e2fe2..918ef5f 100644 --- a/man/plot.sirspline.Rd +++ b/man/plot.sirspline.Rd @@ -1,46 +1,46 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/S3_definitions.R -\name{plot.sirspline} -\alias{plot.sirspline} -\title{\code{plot} method for sirspline-object} -\usage{ -\method{plot}{sirspline}(x, conf.int = TRUE, abline = TRUE, log = FALSE, - type, ylab, xlab, ...) -} -\arguments{ -\item{x}{an object returned by function sirspline} - -\item{conf.int}{logical; default TRUE draws also the 95 confidence intervals} - -\item{abline}{logical; draws a reference line where SIR = 1} - -\item{log}{logical; default FALSE. Should the y-axis be in log scale} - -\item{type}{select \code{type = 'n'} to plot only figure frames} - -\item{ylab}{overwrites default y-axis label; can be a vector if multiple splines fitted} - -\item{xlab}{overwrites default x-axis label; can be a vector if multiple splines fitted} - -\item{...}{arguments passed on to plot()} -} -\description{ -Plot SIR splines using R base graphics. -} -\details{ -In \code{plot.sirspline} almost every graphical parameter are user -adjustable, such as \code{ylim}, \code{xlim}. -\code{plot.sirsplines} calls \code{lines.splines} to add lines. - -The plot axis without lines can be plotted using option \code{type = 'n'}. -On top of the frame it's then possible to add a \code{grid}, -\code{abline} or text before plotting the lines (see: \code{sirspline}). -} -\seealso{ -Other sir functions: \code{\link{lines.sirspline}}, - \code{\link{sir_exp}}, \code{\link{sir_ratio}}, - \code{\link{sirspline}}, \code{\link{sir}} -} -\author{ -Matti Rantanen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/S3_definitions.R +\name{plot.sirspline} +\alias{plot.sirspline} +\title{\code{plot} method for sirspline-object} +\usage{ +\method{plot}{sirspline}(x, conf.int = TRUE, abline = TRUE, log = FALSE, + type, ylab, xlab, ...) +} +\arguments{ +\item{x}{an object returned by function sirspline} + +\item{conf.int}{logical; default TRUE draws also the 95 confidence intervals} + +\item{abline}{logical; draws a reference line where SIR = 1} + +\item{log}{logical; default FALSE. Should the y-axis be in log scale} + +\item{type}{select \code{type = 'n'} to plot only figure frames} + +\item{ylab}{overwrites default y-axis label; can be a vector if multiple splines fitted} + +\item{xlab}{overwrites default x-axis label; can be a vector if multiple splines fitted} + +\item{...}{arguments passed on to plot()} +} +\description{ +Plot SIR splines using R base graphics. +} +\details{ +In \code{plot.sirspline} almost every graphical parameter are user +adjustable, such as \code{ylim}, \code{xlim}. +\code{plot.sirsplines} calls \code{lines.splines} to add lines. + +The plot axis without lines can be plotted using option \code{type = 'n'}. +On top of the frame it's then possible to add a \code{grid}, +\code{abline} or text before plotting the lines (see: \code{sirspline}). +} +\seealso{ +Other sir functions: \code{\link{lines.sirspline}}, + \code{\link{sir_exp}}, \code{\link{sir_ratio}}, + \code{\link{sirspline}}, \code{\link{sir}} +} +\author{ +Matti Rantanen +} diff --git a/man/plot.survmean.Rd b/man/plot.survmean.Rd index 668b430..4adb9d4 100644 --- a/man/plot.survmean.Rd +++ b/man/plot.survmean.Rd @@ -1,38 +1,38 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/S3_definitions.R -\name{plot.survmean} -\alias{plot.survmean} -\title{Graphically Inspect Curves Used in Mean Survival Computation} -\usage{ -\method{plot}{survmean}(x, ...) -} -\arguments{ -\item{x}{a \code{survmean} object} - -\item{...}{arguments passed (ultimately) to \code{matlines}; you -may, therefore, supply e.g. \code{xlab} through this, though arguments -such as \code{lty} and \code{col} will not work} -} -\description{ -Plots the observed (with extrapolation) and expected survival -curves for all strata in an object created by \code{\link{survmean}} -} -\details{ -For examples see \code{\link{survmean}}. This function is intended only -for graphically inspecting that the observed survival curves with extrapolation -and the expected survival curves have been sensibly computed in \code{survmean}. - -If you want finer control over the plotted curves, extract the curves from -the \code{survmean} output using - -\code{attr(x, "curves")} - -where \code{x} is a \code{survmean} object. -} -\seealso{ -Other survmean functions: \code{\link{lines.survmean}}, - \code{\link{survmean}} -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/S3_definitions.R +\name{plot.survmean} +\alias{plot.survmean} +\title{Graphically Inspect Curves Used in Mean Survival Computation} +\usage{ +\method{plot}{survmean}(x, ...) +} +\arguments{ +\item{x}{a \code{survmean} object} + +\item{...}{arguments passed (ultimately) to \code{matlines}; you +may, therefore, supply e.g. \code{xlab} through this, though arguments +such as \code{lty} and \code{col} will not work} +} +\description{ +Plots the observed (with extrapolation) and expected survival +curves for all strata in an object created by \code{\link{survmean}} +} +\details{ +For examples see \code{\link{survmean}}. This function is intended only +for graphically inspecting that the observed survival curves with extrapolation +and the expected survival curves have been sensibly computed in \code{survmean}. + +If you want finer control over the plotted curves, extract the curves from +the \code{survmean} output using + +\code{attr(x, "curves")} + +where \code{x} is a \code{survmean} object. +} +\seealso{ +Other survmean functions: \code{\link{lines.survmean}}, + \code{\link{survmean}} +} +\author{ +Joonas Miettinen +} diff --git a/man/plot.survtab.Rd b/man/plot.survtab.Rd index 556044f..a20f2f3 100644 --- a/man/plot.survtab.Rd +++ b/man/plot.survtab.Rd @@ -1,63 +1,63 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/S3_definitions.R -\name{plot.survtab} -\alias{plot.survtab} -\title{\code{plot} method for survtab objects} -\usage{ -\method{plot}{survtab}(x, y = NULL, subset = NULL, conf.int = TRUE, - col = NULL, lty = NULL, ylab = NULL, xlab = NULL, ...) -} -\arguments{ -\item{x}{a \code{survtab} output object} - -\item{y}{survival a character vector of a variable names to plot; -e.g. \code{y = "r.e2"}} - -\item{subset}{a logical condition; \code{obj} is subset accordingly -before plotting; use this for limiting to specific strata, -e.g. \code{subset = sex == "male"}} - -\item{conf.int}{logical; if \code{TRUE}, also plots any confidence intervals -present in \code{obj} for variables in \code{y}} - -\item{col}{line colour; one value for each stratum; will be recycled} - -\item{lty}{line type; one value for each stratum; will be recycled} - -\item{ylab}{label for Y-axis} - -\item{xlab}{label for X-axis} - -\item{...}{additional arguments passed on to \code{plot} and -\code{lines.survtab}; e.g. \code{ylim} can be defined this way} -} -\description{ -Plotting for \code{survtab} objects -} -\examples{ -data(sire) -data(sibr) -si <- rbind(sire, sibr) -si$period <- cut(si$dg_date, as.Date(c("1993-01-01", "2004-01-01", "2013-01-01")), right = FALSE) -si$cancer <- c(rep("rectal", nrow(sire)), rep("breast", nrow(sibr))) -x <- lexpand(si, birth = bi_date, entry = dg_date, exit = ex_date, - status = status \%in\% 1:2, - fot = 0:5, aggre = list(cancer, period, fot)) -st <- survtab_ag(fot ~ cancer + period, data = x, - surv.method = "lifetable", surv.type = "surv.obs") - -plot(st, "surv.obs", subset = cancer == "breast", ylim = c(0.5, 1), col = "blue") -lines(st, "surv.obs", subset = cancer == "rectal", col = "red") - -## or -plot(st, "surv.obs", col = c(2,2,4,4), lty = c(1, 2, 1, 2)) -} -\seealso{ -Other survtab functions: \code{\link{lines.survtab}}, - \code{\link{print.survtab}}, - \code{\link{summary.survtab}}, \code{\link{survtab_ag}}, - \code{\link{survtab}} -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/S3_definitions.R +\name{plot.survtab} +\alias{plot.survtab} +\title{\code{plot} method for survtab objects} +\usage{ +\method{plot}{survtab}(x, y = NULL, subset = NULL, conf.int = TRUE, + col = NULL, lty = NULL, ylab = NULL, xlab = NULL, ...) +} +\arguments{ +\item{x}{a \code{survtab} output object} + +\item{y}{survival a character vector of a variable names to plot; +e.g. \code{y = "r.e2"}} + +\item{subset}{a logical condition; \code{obj} is subset accordingly +before plotting; use this for limiting to specific strata, +e.g. \code{subset = sex == "male"}} + +\item{conf.int}{logical; if \code{TRUE}, also plots any confidence intervals +present in \code{obj} for variables in \code{y}} + +\item{col}{line colour; one value for each stratum; will be recycled} + +\item{lty}{line type; one value for each stratum; will be recycled} + +\item{ylab}{label for Y-axis} + +\item{xlab}{label for X-axis} + +\item{...}{additional arguments passed on to \code{plot} and +\code{lines.survtab}; e.g. \code{ylim} can be defined this way} +} +\description{ +Plotting for \code{survtab} objects +} +\examples{ +data(sire) +data(sibr) +si <- rbind(sire, sibr) +si$period <- cut(si$dg_date, as.Date(c("1993-01-01", "2004-01-01", "2013-01-01")), right = FALSE) +si$cancer <- c(rep("rectal", nrow(sire)), rep("breast", nrow(sibr))) +x <- lexpand(si, birth = bi_date, entry = dg_date, exit = ex_date, + status = status \%in\% 1:2, + fot = 0:5, aggre = list(cancer, period, fot)) +st <- survtab_ag(fot ~ cancer + period, data = x, + surv.method = "lifetable", surv.type = "surv.obs") + +plot(st, "surv.obs", subset = cancer == "breast", ylim = c(0.5, 1), col = "blue") +lines(st, "surv.obs", subset = cancer == "rectal", col = "red") + +## or +plot(st, "surv.obs", col = c(2,2,4,4), lty = c(1, 2, 1, 2)) +} +\seealso{ +Other survtab functions: \code{\link{lines.survtab}}, + \code{\link{print.survtab}}, + \code{\link{summary.survtab}}, \code{\link{survtab_ag}}, + \code{\link{survtab}} +} +\author{ +Joonas Miettinen +} diff --git a/man/poisson.ci.Rd b/man/poisson.ci.Rd index b2af4a7..92813c8 100644 --- a/man/poisson.ci.Rd +++ b/man/poisson.ci.Rd @@ -1,26 +1,26 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utility_functions.R -\name{poisson.ci} -\alias{poisson.ci} -\title{Get rate and exact Poisson confidence intervals} -\usage{ -poisson.ci(x, pt = 1, conf.level = 0.95) -} -\arguments{ -\item{x}{observed} - -\item{pt}{expected} - -\item{conf.level}{alpha level} -} -\description{ -Computes confidence intervals for Poisson rates -} -\examples{ - -poisson.ci(x = 4, pt = 5, conf.level = 0.95) - -} -\author{ -epitools -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utility_functions.R +\name{poisson.ci} +\alias{poisson.ci} +\title{Get rate and exact Poisson confidence intervals} +\usage{ +poisson.ci(x, pt = 1, conf.level = 0.95) +} +\arguments{ +\item{x}{observed} + +\item{pt}{expected} + +\item{conf.level}{alpha level} +} +\description{ +Computes confidence intervals for Poisson rates +} +\examples{ + +poisson.ci(x = 4, pt = 5, conf.level = 0.95) + +} +\author{ +epitools +} diff --git a/man/popEpi.Rd b/man/popEpi.Rd index 401b929..6830a90 100644 --- a/man/popEpi.Rd +++ b/man/popEpi.Rd @@ -1,34 +1,34 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/popEpi_package.r -\docType{package} -\name{popEpi} -\alias{popEpi} -\alias{popEpi-package} -\title{popEpi: Functions for large-scale epidemiological analysis} -\description{ -\pkg{popEpi} is built for the needs of registry-based (large-scale) -epidemiological analysis. This is in most part enabled by the -efficient \pkg{data.table} package for handling and aggregating large data sets. - -\pkg{popEpi} currently supplies some utility functions such as \code{\link{splitMulti}} -and \code{\link{get.yrs}} for preparing large data sets for epidemiological analysis. -Included are also a a few functions that can be used in -epidemiological analysis such as \code{\link{sir}} for estimating -standardized incidence/mortality ratios (SIRs/SMRs) and \code{\link{survtab}} for -estimating observed and relative/net survival as well as cumulative incidence -functions (CIFs). - -Since there are many benefits to using \code{data.tables}, \pkg{popEpi} returns -outputs by default in the \code{data.table} format where appropriate. -Since \code{data.table} -objects are usually modified by reference, this may have surprising side -effects for users uninitiated in using \code{data.table}. To ensure -that appropriate outputs are in the \code{data.frame} format, set -\code{options("popEpi.datatable" = FALSE)}. However, \code{data.table} -usage is recommended due to better performance and testing coverage. -\code{data.table} is used -by most functions internally in both cases. -} -\details{ -popEpi -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/popEpi_package.r +\docType{package} +\name{popEpi} +\alias{popEpi} +\alias{popEpi-package} +\title{popEpi: Functions for large-scale epidemiological analysis} +\description{ +\pkg{popEpi} is built for the needs of registry-based (large-scale) +epidemiological analysis. This is in most part enabled by the +efficient \pkg{data.table} package for handling and aggregating large data sets. + +\pkg{popEpi} currently supplies some utility functions such as \code{\link{splitMulti}} +and \code{\link{get.yrs}} for preparing large data sets for epidemiological analysis. +Included are also a a few functions that can be used in +epidemiological analysis such as \code{\link{sir}} for estimating +standardized incidence/mortality ratios (SIRs/SMRs) and \code{\link{survtab}} for +estimating observed and relative/net survival as well as cumulative incidence +functions (CIFs). + +Since there are many benefits to using \code{data.tables}, \pkg{popEpi} returns +outputs by default in the \code{data.table} format where appropriate. +Since \code{data.table} +objects are usually modified by reference, this may have surprising side +effects for users uninitiated in using \code{data.table}. To ensure +that appropriate outputs are in the \code{data.frame} format, set +\code{options("popEpi.datatable" = FALSE)}. However, \code{data.table} +usage is recommended due to better performance and testing coverage. +\code{data.table} is used +by most functions internally in both cases. +} +\details{ +popEpi +} diff --git a/man/pophaz.Rd b/man/pophaz.Rd index 78935f9..62b9352 100644 --- a/man/pophaz.Rd +++ b/man/pophaz.Rd @@ -1,57 +1,57 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pophaz.R -\name{pophaz} -\alias{pophaz} -\title{Expected / Population Hazard Data Sets Usage in \pkg{popEpi}} -\description{ -Several functions in \pkg{popEpi} make use of population or expected -hazards in computing the intended estimates (e.g. \code{\link{survtab}}). -This document explains using such data sets in this package. -} -\details{ -Population hazard data sets (pophaz for short) in \pkg{popEpi} should -be \code{data.frame}s in the "long" format where one of the columns must be -named \code{haz} (for hazard), and other columns define the values or -levels in variables relating to subjects in your data. For example, -\code{\link{popmort}} contains Finnish population mortality hazards -by sex, calendar year, and 1-year age group. - -\tabular{rrrr}{ -\code{sex} \tab \code{year} \tab \code{agegroup} \tab \code{haz} \cr -0 \tab 1951 \tab 0 \tab 0.036363176\cr -0 \tab 1951 \tab 1 \tab 0.003616547\cr -0 \tab 1951 \tab 2 \tab 0.002172384\cr -0 \tab 1951 \tab 3 \tab 0.001581249\cr -0 \tab 1951 \tab 4 \tab 0.001180690\cr -0 \tab 1951 \tab 5 \tab 0.001070595 -} - -The names of the columns should match to the names of the variables -that you have in your subject-level data. Time variables in your pophaz -may also correspond to \code{Lexis} time scales; see -\code{\link{survtab}}. - -Any time variables (as they usually have) should be coded consistently: -When using fractional years in your data, the time variables in your pophaz -must also be coded in fractional years. When using e.g. \code{Date}s in your -data, ensure that the pophaz time variables are coded at the level of days -(or \code{Date}s for calendar time). - -The \code{haz} variable in your pophaz should also be coded consistently -with the used time variables. E.g. \code{haz} values in life-tables -reported as deaths per person-year should be multiplied by 365.25 when -using day-level time variables. - -If you have your population hazards in a \code{ratetable} object -usable by functions in \pkg{survival} and \pkg{relsurv}, you may -transform them to long-format \code{data.frame}s using -\code{\link{as.data.frame.ratetable}}. Ensure, however, that the -created \code{haz} column is coded at the right level (events per -days or years typically). - -National statistical institutions, the WHO, and e.g. the Human -Life-Table Database supply life-table data. -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pophaz.R +\name{pophaz} +\alias{pophaz} +\title{Expected / Population Hazard Data Sets Usage in \pkg{popEpi}} +\description{ +Several functions in \pkg{popEpi} make use of population or expected +hazards in computing the intended estimates (e.g. \code{\link{survtab}}). +This document explains using such data sets in this package. +} +\details{ +Population hazard data sets (pophaz for short) in \pkg{popEpi} should +be \code{data.frame}s in the "long" format where one of the columns must be +named \code{haz} (for hazard), and other columns define the values or +levels in variables relating to subjects in your data. For example, +\code{\link{popmort}} contains Finnish population mortality hazards +by sex, calendar year, and 1-year age group. + +\tabular{rrrr}{ +\code{sex} \tab \code{year} \tab \code{agegroup} \tab \code{haz} \cr +0 \tab 1951 \tab 0 \tab 0.036363176\cr +0 \tab 1951 \tab 1 \tab 0.003616547\cr +0 \tab 1951 \tab 2 \tab 0.002172384\cr +0 \tab 1951 \tab 3 \tab 0.001581249\cr +0 \tab 1951 \tab 4 \tab 0.001180690\cr +0 \tab 1951 \tab 5 \tab 0.001070595 +} + +The names of the columns should match to the names of the variables +that you have in your subject-level data. Time variables in your pophaz +may also correspond to \code{Lexis} time scales; see +\code{\link{survtab}}. + +Any time variables (as they usually have) should be coded consistently: +When using fractional years in your data, the time variables in your pophaz +must also be coded in fractional years. When using e.g. \code{Date}s in your +data, ensure that the pophaz time variables are coded at the level of days +(or \code{Date}s for calendar time). + +The \code{haz} variable in your pophaz should also be coded consistently +with the used time variables. E.g. \code{haz} values in life-tables +reported as deaths per person-year should be multiplied by 365.25 when +using day-level time variables. + +If you have your population hazards in a \code{ratetable} object +usable by functions in \pkg{survival} and \pkg{relsurv}, you may +transform them to long-format \code{data.frame}s using +\code{\link{as.data.frame.ratetable}}. Ensure, however, that the +created \code{haz} column is coded at the right level (events per +days or years typically). + +National statistical institutions, the WHO, and e.g. the Human +Life-Table Database supply life-table data. +} +\author{ +Joonas Miettinen +} diff --git a/man/popmort.Rd b/man/popmort.Rd index a0980a0..06dfd5d 100644 --- a/man/popmort.Rd +++ b/man/popmort.Rd @@ -1,28 +1,28 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data_document.R -\name{popmort} -\alias{popmort} -\title{Population mortality rates in Finland 1951 - 2013 in 101 age groups and -by gender} -\format{\code{data.table} with columns -\itemize{ - \item \code{sex} gender coded as male, female (0, 1) - \item \code{year} calendar year - \item \code{agegroup} - coded 0 to 100; one-year age groups - \item \code{haz} the average population mortality rate per person year -}} -\source{ -Statistics Finland -} -\description{ -Population mortality rates in Finland 1951 - 2013 in 101 age groups and -by gender -} -\seealso{ -\code{\link{pophaz}} - -Other popEpi data: \code{\link{ICSS}}, - \code{\link{meanpop_fi}}, \code{\link{sibr}}, - \code{\link{sire}}, \code{\link{stdpop101}}, - \code{\link{stdpop18}} -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_document.R +\name{popmort} +\alias{popmort} +\title{Population mortality rates in Finland 1951 - 2013 in 101 age groups and +by gender} +\format{\code{data.table} with columns +\itemize{ + \item \code{sex} gender coded as male, female (0, 1) + \item \code{year} calendar year + \item \code{agegroup} - coded 0 to 100; one-year age groups + \item \code{haz} the average population mortality rate per person year +}} +\source{ +Statistics Finland +} +\description{ +Population mortality rates in Finland 1951 - 2013 in 101 age groups and +by gender +} +\seealso{ +\code{\link{pophaz}} + +Other popEpi data: \code{\link{ICSS}}, + \code{\link{meanpop_fi}}, \code{\link{sibr}}, + \code{\link{sire}}, \code{\link{stdpop101}}, + \code{\link{stdpop18}} +} diff --git a/man/prepExpo.Rd b/man/prepExpo.Rd index 30a7f05..273e7d1 100644 --- a/man/prepExpo.Rd +++ b/man/prepExpo.Rd @@ -1,105 +1,105 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/splitting_utility_functions.R -\name{prepExpo} -\alias{prepExpo} -\title{Prepare Exposure Data for Aggregation} -\usage{ -prepExpo(lex, freezeScales = "work", cutScale = "per", - entry = min(get(cutScale)), exit = max(get(cutScale)), by = "lex.id", - breaks = NULL, freezeDummy = NULL, subset = NULL, verbose = FALSE, - ...) -} -\arguments{ -\item{lex}{a \code{\link[Epi]{Lexis}} object with ONLY periods of exposure -as rows; one or multiple rows per subject allowed} - -\item{freezeScales}{a character vector naming \code{Lexis} time scales of exposure -which should be frozen in periods where no exposure occurs (in the gap -time periods)} - -\item{cutScale}{the \code{Lexis} time scale along which the subject-specific -ultimate entry and exit times are specified} - -\item{entry}{an expression; the time of entry to follow-up which may be earlier, at, or after -the first time of exposure in \code{freezeScales}; evaluated separately -for each unique combination of \code{by}, so e.g. with -\code{entry = min(Var1)} and \code{by = "lex.id"} it -sets the \code{lex.id}-specific minima of \code{Var1} to be the original times -of entry for each \code{lex.id}} - -\item{exit}{the same as \code{entry} but for the ultimate exit time per unique -combination of \code{by}} - -\item{by}{a character vector indicating variable names in \code{lex}, -the unique combinations of which identify separate subjects for which -to fill gaps in the records from \code{entry} to \code{exit}; -for novices of \code{{\link{data.table}}}, this is passed to a -\code{data.table}'s \code{by} argument.} - -\item{breaks}{a named list of breaks; -e.g. \code{list(work = 0:20,per = 1995:2015)}; passed on to -\code{\link{splitMulti}} so see that function's help for more details} - -\item{freezeDummy}{a character string; specifies the name for a dummy variable -that this function will create and add to output which -identifies rows where the \code{freezeScales} are frozen and where not -(\code{0} implies not frozen, \code{1} implies frozen); -if \code{NULL}, no dummy is created} - -\item{subset}{a logical condition to subset data by before computations; -e.g. \code{subset = sex == "male"}} - -\item{verbose}{logical; if \code{TRUE}, the function is chatty and returns -some messages and timings during its run.} - -\item{...}{additional arguments passed on to \code{\link{splitMulti}}} -} -\value{ -Returns a \code{Lexis} object that has been split if \code{breaks} is specified. -The resulting time is also a \code{data.table} if -\code{options("popEpi.datatable") == TRUE} (see: \code{?popEpi}) -} -\description{ -\code{prepExpo} uses a \code{Lexis} object of periods of exposure -to fill gaps between the periods and overall entry and exit times without -accumulating exposure time in periods of no exposure, and splits the -result if requested. -} -\details{ -\code{prepExpo} is a convenience function for the purpose of eventually aggregating -person-time and events in categories of not only normally progressing -\code{Lexis} time scales but also some time scales which should not -progress sometimes. For example a person may work at a production facility -only intermittently, meaning exposure time (to work-related substances -for example) should not progress outside of periods of work. This allows for -e.g. a correct aggregation of person-time and events by categories of cumulative -time of exposure. - -Given a \code{Lexis} object containing rows (time lines) -where a subject is exposed to something (and NO periods without exposure), -fills any gaps between exposure periods for each unique combination of \code{by} -and the subject-specific "ultimate" \code{entry} and \code{exit} times, -"freezes" the cumulative exposure times in periods of no exposure, -and splits data using \code{breaks} passed to \code{\link{splitMulti}} -if requested. Results in a (split) \code{Lexis} object where \code{freezeScales} -do not progress in time periods where no exposure was recorded in \code{lex}. - -This function assumes that \code{entry} and \code{exit} arguments are the -same for each row within a unique combination of variables named in \code{by}. -E.g. with \code{by = "lex.id"} only each \code{lex.id} has a unique value -for \code{entry} and \code{exit} at most. - -The supplied \code{breaks} split the data using \code{splitMulti}, with -the exception that breaks supplied concerning any frozen time scales -ONLY split the rows where the time scales are not frozen. E.g. -with \code{freezeScales = "work"}, -\code{breaks = list(work = 0:10, cal = 1995:2010)} splits all rows over -\code{"cal"} but only non-frozen rows over \code{"work"}. - -Only supports frozen time scales that advance and freeze contemporaneously: -e.g. it would not currently be possible to take into account the cumulative -time working at a facility and the cumulative time doing a single task -at the facility, if the two are not exactly the same. On the other hand -one might use the same time scale for different exposure types, supply them -as separate rows, and identify the different exposures using a dummy variable. -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splitting_utility_functions.R +\name{prepExpo} +\alias{prepExpo} +\title{Prepare Exposure Data for Aggregation} +\usage{ +prepExpo(lex, freezeScales = "work", cutScale = "per", + entry = min(get(cutScale)), exit = max(get(cutScale)), by = "lex.id", + breaks = NULL, freezeDummy = NULL, subset = NULL, verbose = FALSE, + ...) +} +\arguments{ +\item{lex}{a \code{\link[Epi]{Lexis}} object with ONLY periods of exposure +as rows; one or multiple rows per subject allowed} + +\item{freezeScales}{a character vector naming \code{Lexis} time scales of exposure +which should be frozen in periods where no exposure occurs (in the gap +time periods)} + +\item{cutScale}{the \code{Lexis} time scale along which the subject-specific +ultimate entry and exit times are specified} + +\item{entry}{an expression; the time of entry to follow-up which may be earlier, at, or after +the first time of exposure in \code{freezeScales}; evaluated separately +for each unique combination of \code{by}, so e.g. with +\code{entry = min(Var1)} and \code{by = "lex.id"} it +sets the \code{lex.id}-specific minima of \code{Var1} to be the original times +of entry for each \code{lex.id}} + +\item{exit}{the same as \code{entry} but for the ultimate exit time per unique +combination of \code{by}} + +\item{by}{a character vector indicating variable names in \code{lex}, +the unique combinations of which identify separate subjects for which +to fill gaps in the records from \code{entry} to \code{exit}; +for novices of \code{{\link{data.table}}}, this is passed to a +\code{data.table}'s \code{by} argument.} + +\item{breaks}{a named list of breaks; +e.g. \code{list(work = 0:20,per = 1995:2015)}; passed on to +\code{\link{splitMulti}} so see that function's help for more details} + +\item{freezeDummy}{a character string; specifies the name for a dummy variable +that this function will create and add to output which +identifies rows where the \code{freezeScales} are frozen and where not +(\code{0} implies not frozen, \code{1} implies frozen); +if \code{NULL}, no dummy is created} + +\item{subset}{a logical condition to subset data by before computations; +e.g. \code{subset = sex == "male"}} + +\item{verbose}{logical; if \code{TRUE}, the function is chatty and returns +some messages and timings during its run.} + +\item{...}{additional arguments passed on to \code{\link{splitMulti}}} +} +\value{ +Returns a \code{Lexis} object that has been split if \code{breaks} is specified. +The resulting time is also a \code{data.table} if +\code{options("popEpi.datatable") == TRUE} (see: \code{?popEpi}) +} +\description{ +\code{prepExpo} uses a \code{Lexis} object of periods of exposure +to fill gaps between the periods and overall entry and exit times without +accumulating exposure time in periods of no exposure, and splits the +result if requested. +} +\details{ +\code{prepExpo} is a convenience function for the purpose of eventually aggregating +person-time and events in categories of not only normally progressing +\code{Lexis} time scales but also some time scales which should not +progress sometimes. For example a person may work at a production facility +only intermittently, meaning exposure time (to work-related substances +for example) should not progress outside of periods of work. This allows for +e.g. a correct aggregation of person-time and events by categories of cumulative +time of exposure. + +Given a \code{Lexis} object containing rows (time lines) +where a subject is exposed to something (and NO periods without exposure), +fills any gaps between exposure periods for each unique combination of \code{by} +and the subject-specific "ultimate" \code{entry} and \code{exit} times, +"freezes" the cumulative exposure times in periods of no exposure, +and splits data using \code{breaks} passed to \code{\link{splitMulti}} +if requested. Results in a (split) \code{Lexis} object where \code{freezeScales} +do not progress in time periods where no exposure was recorded in \code{lex}. + +This function assumes that \code{entry} and \code{exit} arguments are the +same for each row within a unique combination of variables named in \code{by}. +E.g. with \code{by = "lex.id"} only each \code{lex.id} has a unique value +for \code{entry} and \code{exit} at most. + +The supplied \code{breaks} split the data using \code{splitMulti}, with +the exception that breaks supplied concerning any frozen time scales +ONLY split the rows where the time scales are not frozen. E.g. +with \code{freezeScales = "work"}, +\code{breaks = list(work = 0:10, cal = 1995:2010)} splits all rows over +\code{"cal"} but only non-frozen rows over \code{"work"}. + +Only supports frozen time scales that advance and freeze contemporaneously: +e.g. it would not currently be possible to take into account the cumulative +time working at a facility and the cumulative time doing a single task +at the facility, if the two are not exactly the same. On the other hand +one might use the same time scale for different exposure types, supply them +as separate rows, and identify the different exposures using a dummy variable. +} diff --git a/man/print.aggre.Rd b/man/print.aggre.Rd index 12a9565..e7ed140 100644 --- a/man/print.aggre.Rd +++ b/man/print.aggre.Rd @@ -1,27 +1,27 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/S3_definitions.R -\name{print.aggre} -\alias{print.aggre} -\title{Print an \code{aggre} Object} -\usage{ -\method{print}{aggre}(x, subset = NULL, ...) -} -\arguments{ -\item{x}{an \code{aggre} object} - -\item{subset}{a logical condition to subset results table by -before printing; use this to limit to a certain stratum. E.g. -\code{subset = sex == "male"}} - -\item{...}{arguments passed to \code{print.data.table}; try e.g. -\code{top = 2} for numbers of rows in head and tail printed -if the table is large, -\code{nrow = 100} for number of rows to print, etc.} -} -\description{ -Print method function for \code{aggre} objects; see -\code{\link{as.aggre}} and \code{\link{aggre}}. -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/S3_definitions.R +\name{print.aggre} +\alias{print.aggre} +\title{Print an \code{aggre} Object} +\usage{ +\method{print}{aggre}(x, subset = NULL, ...) +} +\arguments{ +\item{x}{an \code{aggre} object} + +\item{subset}{a logical condition to subset results table by +before printing; use this to limit to a certain stratum. E.g. +\code{subset = sex == "male"}} + +\item{...}{arguments passed to \code{print.data.table}; try e.g. +\code{top = 2} for numbers of rows in head and tail printed +if the table is large, +\code{nrow = 100} for number of rows to print, etc.} +} +\description{ +Print method function for \code{aggre} objects; see +\code{\link{as.aggre}} and \code{\link{aggre}}. +} +\author{ +Joonas Miettinen +} diff --git a/man/print.rate.Rd b/man/print.rate.Rd index 1f09a83..2680bcb 100644 --- a/man/print.rate.Rd +++ b/man/print.rate.Rd @@ -1,24 +1,24 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/S3_definitions.R -\name{print.rate} -\alias{print.rate} -\title{Print an rate object} -\usage{ -\method{print}{rate}(x, subset = NULL, ...) -} -\arguments{ -\item{x}{an \code{rate} object} - -\item{subset}{a logical condition to subset results table by -before printing; use this to limit to a certain stratum. E.g. -\code{subset = sex == "female"}} - -\item{...}{arguments for data.tables print method, e.g. row.names = FALSE suppresses row numbers.} -} -\description{ -Print method function for \code{rate} objects; see -\code{\link{rate}}. -} -\author{ -Matti Rantanen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/S3_definitions.R +\name{print.rate} +\alias{print.rate} +\title{Print an rate object} +\usage{ +\method{print}{rate}(x, subset = NULL, ...) +} +\arguments{ +\item{x}{an \code{rate} object} + +\item{subset}{a logical condition to subset results table by +before printing; use this to limit to a certain stratum. E.g. +\code{subset = sex == "female"}} + +\item{...}{arguments for data.tables print method, e.g. row.names = FALSE suppresses row numbers.} +} +\description{ +Print method function for \code{rate} objects; see +\code{\link{rate}}. +} +\author{ +Matti Rantanen +} diff --git a/man/print.survtab.Rd b/man/print.survtab.Rd index acae664..29d57f9 100644 --- a/man/print.survtab.Rd +++ b/man/print.survtab.Rd @@ -1,33 +1,33 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/S3_definitions.R -\name{print.survtab} -\alias{print.survtab} -\title{Print a survtab Object} -\usage{ -\method{print}{survtab}(x, subset = NULL, ...) -} -\arguments{ -\item{x}{a \code{survtab} object} - -\item{subset}{a logical condition to subset results table by -before printing; use this to limit to a certain stratum. E.g. -\code{subset = sex == "male"}} - -\item{...}{arguments passed to \code{print.data.table}; try e.g. -\code{top = 2} for numbers of rows in head and tail printed -if the table is large, -\code{nrow = 100} for number of rows to print, etc.} -} -\description{ -Print method function for \code{survtab} objects; see -\code{\link{survtab_ag}}. -} -\seealso{ -Other survtab functions: \code{\link{lines.survtab}}, - \code{\link{plot.survtab}}, - \code{\link{summary.survtab}}, \code{\link{survtab_ag}}, - \code{\link{survtab}} -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/S3_definitions.R +\name{print.survtab} +\alias{print.survtab} +\title{Print a survtab Object} +\usage{ +\method{print}{survtab}(x, subset = NULL, ...) +} +\arguments{ +\item{x}{a \code{survtab} object} + +\item{subset}{a logical condition to subset results table by +before printing; use this to limit to a certain stratum. E.g. +\code{subset = sex == "male"}} + +\item{...}{arguments passed to \code{print.data.table}; try e.g. +\code{top = 2} for numbers of rows in head and tail printed +if the table is large, +\code{nrow = 100} for number of rows to print, etc.} +} +\description{ +Print method function for \code{survtab} objects; see +\code{\link{survtab_ag}}. +} +\seealso{ +Other survtab functions: \code{\link{lines.survtab}}, + \code{\link{plot.survtab}}, + \code{\link{summary.survtab}}, \code{\link{survtab_ag}}, + \code{\link{survtab}} +} +\author{ +Joonas Miettinen +} diff --git a/man/rate.Rd b/man/rate.Rd index b7ba7d9..098baac 100644 --- a/man/rate.Rd +++ b/man/rate.Rd @@ -1,104 +1,108 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/incidence_rates.R -\name{rate} -\alias{rate} -\title{Direct-Standardised Incidence/Mortality Rates} -\usage{ -rate(data, obs = NULL, pyrs = NULL, print = NULL, adjust = NULL, - weights = NULL, subset = NULL) -} -\arguments{ -\item{data}{aggregated data (see e.g. \code{\link{lexpand}}, -\code{\link{aggre}} if you have subject-level data)} - -\item{obs}{observations variable name in data. -\link[=flexible_argument]{Flexible input}, typically e.g. -\code{obs = obs}.} - -\item{pyrs}{person-years variable name in data. -\link[=flexible_argument]{Flexible input}, typically e.g. -\code{pyrs = pyrs}.} - -\item{print}{variable name to stratify the rates. -\link[=flexible_argument]{Flexible input}, typically e.g. -\code{print = sex} or \code{print = list(sex, area)}.} - -\item{adjust}{variable for adjusting the rates. -\link[=flexible_argument]{Flexible input}, typically e.g. -\code{adjust = agegroup}.} - -\item{weights}{typically a list of weights or a \code{character} string -specifying an age group standardization scheme; see -the \link[=direct_standardization]{dedicated help page} -and examples.} - -\item{subset}{a logical expression to subset data.} -} -\value{ -Returns a \code{data.table} with observations, person-years, rates and -adjusted rates, if available. Results are stratified by \code{print}. -Adjusted rates are identified with suffix \code{.adj} and -\code{.lo} and \code{.hi} are for confidence intervals lower and upper -95\% bounds, respectively. -The prefix \code{SE.} stands for standard error. -} -\description{ -\code{rate} calculates adjusted rates using -preloaded weights data or user specified weights. -} -\details{ -Input data needs to be in aggregated format with observations -and person-years. For individual data use \code{\link{lexpand}}, or -\code{\link{ltable}} and merge person-years manually. -} -\examples{ -## Prepare data with lexpand and then reformat agegroup. -data(sibr) -x <- lexpand(sibr, birth = bi_date, entry = dg_date, exit = ex_date, - breaks = list(per = c(1990,2000,2010,2020), age = c(0:17*5,Inf)), - aggre = list(agegroup = age, year.cat = per), - status = status != 0) - -x$agegroup <- cut(x$agegroup, c(0:17*5,Inf), right = FALSE) - -## calculate rates for selected periods with Nordic 2000 weights: -r1 <- rate( data = x, obs = from0to1, pyrs = pyrs, print = year.cat, - adjust = agegroup, weights = 'nordic') -r1 - -## use total person-years by stratum as weights (some have zero) -w <- ltable(x, by.vars = "agegroup", expr = sum(pyrs)) -w[is.na(w$V1),]$V1 <- 0 - -r2 <- rate( data = x, obs = from0to1, pyrs = pyrs, print = year.cat, - adjust = agegroup, - weights = w$V1) -r2 - -## use data.frame of weights: -names(w) <- c("agegroup", "weights") -r2 <- rate( data = x, obs = from0to1, pyrs = pyrs, print = year.cat, - adjust = agegroup, - weights = w) -r2 - -## internal weights (same result as above) -r3 <- rate( data = x, obs = from0to1, pyrs = pyrs, print = year.cat, - adjust = agegroup, - weights = "internal") -r3 - -} -\seealso{ -\code{\link{lexpand}}, \code{\link{ltable}} - -Other main functions: \code{\link{relpois_ag}}, - \code{\link{relpois}}, \code{\link{sirspline}}, - \code{\link{sir}}, \code{\link{survmean}}, - \code{\link{survtab_ag}}, \code{\link{survtab}} - -Other rate functions: \code{\link{rate_ratio}} -} -\author{ -Matti Rantanen, Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/incidence_rates.R +\name{rate} +\alias{rate} +\title{Direct-Standardised Incidence/Mortality Rates} +\usage{ +rate(data, obs = NULL, pyrs = NULL, print = NULL, adjust = NULL, + weights = NULL, subset = NULL) +} +\arguments{ +\item{data}{aggregated data (see e.g. \code{\link{lexpand}}, +\code{\link{aggre}} if you have subject-level data)} + +\item{obs}{observations variable name in data. +\link[=flexible_argument]{Flexible input}, typically e.g. +\code{obs = obs}.} + +\item{pyrs}{person-years variable name in data. +\link[=flexible_argument]{Flexible input}, typically e.g. +\code{pyrs = pyrs}.} + +\item{print}{variable name to stratify the rates. +\link[=flexible_argument]{Flexible input}, typically e.g. +\code{print = sex} or \code{print = list(sex, area)}.} + +\item{adjust}{variable for adjusting the rates. +\link[=flexible_argument]{Flexible input}, typically e.g. +\code{adjust = agegroup}.} + +\item{weights}{typically a list of weights or a \code{character} string +specifying an age group standardization scheme; see +the \link[=direct_standardization]{dedicated help page} +and examples.} + +\item{subset}{a logical expression to subset data.} +} +\value{ +Returns a \code{data.table} with observations, person-years, rates and +adjusted rates, if available. Results are stratified by \code{print}. +Adjusted rates are identified with suffix \code{.adj} and +\code{.lo} and \code{.hi} are for confidence intervals lower and upper +95\% bounds, respectively. +The prefix \code{SE.} stands for standard error. +} +\description{ +\code{rate} calculates adjusted rates using +preloaded weights data or user specified weights. +} +\details{ +Input data needs to be in aggregated format with observations +and person-years. For individual data use \code{\link{lexpand}}, or +\code{\link{ltable}} and merge person-years manually. + +The confidence intervals are based on the normal approximation of the logarithm of the rate. +The variance of the log rate that is used to derive the confidence intervals +is derived using the delta method. +} +\examples{ +## Prepare data with lexpand and then reformat agegroup. +data(sibr) +x <- lexpand(sibr, birth = bi_date, entry = dg_date, exit = ex_date, + breaks = list(per = c(1990,2000,2010,2020), age = c(0:17*5,Inf)), + aggre = list(agegroup = age, year.cat = per), + status = status != 0) + +x$agegroup <- cut(x$agegroup, c(0:17*5,Inf), right = FALSE) + +## calculate rates for selected periods with Nordic 2000 weights: +r1 <- rate( data = x, obs = from0to1, pyrs = pyrs, print = year.cat, + adjust = agegroup, weights = 'nordic') +r1 + +## use total person-years by stratum as weights (some have zero) +w <- ltable(x, by.vars = "agegroup", expr = sum(pyrs)) +w[is.na(w$V1),]$V1 <- 0 + +r2 <- rate( data = x, obs = from0to1, pyrs = pyrs, print = year.cat, + adjust = agegroup, + weights = w$V1) +r2 + +## use data.frame of weights: +names(w) <- c("agegroup", "weights") +r2 <- rate( data = x, obs = from0to1, pyrs = pyrs, print = year.cat, + adjust = agegroup, + weights = w) +r2 + +## internal weights (same result as above) +r3 <- rate( data = x, obs = from0to1, pyrs = pyrs, print = year.cat, + adjust = agegroup, + weights = "internal") +r3 + +} +\seealso{ +\code{\link{lexpand}}, \code{\link{ltable}} + +Other main functions: \code{\link{relpois_ag}}, + \code{\link{relpois}}, \code{\link{sirspline}}, + \code{\link{sir}}, \code{\link{survmean}}, + \code{\link{survtab_ag}}, \code{\link{survtab}} + +Other rate functions: \code{\link{rate_ratio}} +} +\author{ +Matti Rantanen, Joonas Miettinen +} diff --git a/man/rate_ratio.Rd b/man/rate_ratio.Rd index 48da4d3..6539585 100644 --- a/man/rate_ratio.Rd +++ b/man/rate_ratio.Rd @@ -1,70 +1,70 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/incidence_rates_utils.R -\name{rate_ratio} -\alias{rate_ratio} -\title{Confidence intervals for the rate ratios} -\usage{ -rate_ratio(x, y, crude = FALSE, SE.method = TRUE) -} -\arguments{ -\item{x}{a rate-object, vector of two; rate and standard error or observed and person-years.} - -\item{y}{a rate-object, vector of two; rate and standard error or observed and person-years.} - -\item{crude}{set TRUE to use crude rates; default is FALSE.} - -\item{SE.method}{default TRUE; if \code{x} and \code{y} are vectors of observed and -person-years, this must be changed to FALSE.} -} -\value{ -A vector length of three: rate_ratio, and lower and upper confidence intervals. -} -\description{ -Calculate rate ratio with confidence intervals for rate objects or observations and person-years. -} -\details{ -Calculate rate ratio of two age standardized rate objects (see \code{\link{rate}}). -Multiple rates for each objects is supported if there are an equal number of rates. -Another option is to set \code{x} and \code{y} as a vector of two. -\enumerate{ - \item rate and its standard error, and set \code{SE.method = TRUE}. - \item observations and person-year, and set \code{SE.method = FALSE}. -} -See examples. -} -\examples{ -\dontrun{ -# two rate ratios; silly example with female rectal / breast cancer -## mortality rates -data("sire", package = "popEpi") -data("sibr", package = "popEpi") - -BL <- list(per = 2000:2005) - -re <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", - status = status == 1, breaks = BL, aggre = list(per)) -br <- lexpand(sibr, birth = "bi_date", entry = "dg_date", exit = "ex_date", - status = status == 1, breaks = BL, aggre = list(per)) - -r_re <- rate(re, obs = "from0to1", pyrs = "pyrs") -r_br <- rate(br, obs = "from0to1", pyrs = "pyrs") - -rate_ratio(r_re, r_br, SE.method = TRUE) -} - -# manually set rates (0.003 and 0.005) and SEs (0.001 and 0.002) -# so that x = y = c('rate', 'SE') -rate_ratio(x= c(0.003, 0.001), y= c(0.005, 0.002), SE.method = TRUE) - -# observed numbers (10 and 20) and person-years (30000 and 40000): -rate_ratio(x = c(10, 30000), y = c(20, 40000), SE.method = FALSE) - -} -\seealso{ -\code{\link{rate}} - -Other rate functions: \code{\link{rate}} -} -\author{ -Matti Rantanen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/incidence_rates_utils.R +\name{rate_ratio} +\alias{rate_ratio} +\title{Confidence intervals for the rate ratios} +\usage{ +rate_ratio(x, y, crude = FALSE, SE.method = TRUE) +} +\arguments{ +\item{x}{a rate-object, vector of two; rate and standard error or observed and person-years.} + +\item{y}{a rate-object, vector of two; rate and standard error or observed and person-years.} + +\item{crude}{set TRUE to use crude rates; default is FALSE.} + +\item{SE.method}{default TRUE; if \code{x} and \code{y} are vectors of observed and +person-years, this must be changed to FALSE.} +} +\value{ +A vector length of three: rate_ratio, and lower and upper confidence intervals. +} +\description{ +Calculate rate ratio with confidence intervals for rate objects or observations and person-years. +} +\details{ +Calculate rate ratio of two age standardized rate objects (see \code{\link{rate}}). +Multiple rates for each objects is supported if there are an equal number of rates. +Another option is to set \code{x} and \code{y} as a vector of two. +\enumerate{ + \item rate and its standard error, and set \code{SE.method = TRUE}. + \item observations and person-year, and set \code{SE.method = FALSE}. +} +See examples. +} +\examples{ +\dontrun{ +# two rate ratios; silly example with female rectal / breast cancer +## mortality rates +data("sire", package = "popEpi") +data("sibr", package = "popEpi") + +BL <- list(per = 2000:2005) + +re <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", + status = status == 1, breaks = BL, aggre = list(per)) +br <- lexpand(sibr, birth = "bi_date", entry = "dg_date", exit = "ex_date", + status = status == 1, breaks = BL, aggre = list(per)) + +r_re <- rate(re, obs = "from0to1", pyrs = "pyrs") +r_br <- rate(br, obs = "from0to1", pyrs = "pyrs") + +rate_ratio(r_re, r_br, SE.method = TRUE) +} + +# manually set rates (0.003 and 0.005) and SEs (0.001 and 0.002) +# so that x = y = c('rate', 'SE') +rate_ratio(x= c(0.003, 0.001), y= c(0.005, 0.002), SE.method = TRUE) + +# observed numbers (10 and 20) and person-years (30000 and 40000): +rate_ratio(x = c(10, 30000), y = c(20, 40000), SE.method = FALSE) + +} +\seealso{ +\code{\link{rate}} + +Other rate functions: \code{\link{rate}} +} +\author{ +Matti Rantanen +} diff --git a/man/relpois.Rd b/man/relpois.Rd index 0efa3d9..2ddc6a5 100644 --- a/man/relpois.Rd +++ b/man/relpois.Rd @@ -1,117 +1,117 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/relative_poisson.R -\name{relpois} -\alias{relpois} -\title{Excess hazard Poisson model} -\usage{ -relpois(data, formula, fot.breaks = NULL, subset = NULL, check = TRUE, - ...) -} -\arguments{ -\item{data}{a dataset split with e.g. \code{\link{lexpand}}; -must have expected hazard merged within} - -\item{formula}{a formula which is passed on to \code{glm}; see Details} - -\item{fot.breaks}{optional; a numeric vector of [a,b) breaks to specify -survival intervals over the follow-up time; if \code{NULL}, the -existing breaks along the mandatory \code{fot} time scale in \code{data} -are used (e.g. the breaks for \code{fot} supplied to \code{lexpand})} - -\item{subset}{a logical vector or condition; e.g. \code{subset = sex == 1}; -limits the data before estimation} - -\item{check}{logical; if \code{TRUE}, tabulates excess cases by all -factor variables in the formula to check for negative / \code{NA} -excess cases before fitting the GLM} - -\item{...}{any argument passed on to \code{glm}} -} -\value{ -A \code{glm} object created using a custom Poisson family construct. Some -\code{glm} methods are applicable. -} -\description{ -Estimate a Poisson piecewise constant excess -hazards model -} -\details{ -\strong{Basics} - -\code{relpois} employs a custom link function of the Poisson variety -to estimate piecewise constant parametric excess hazards. The pieces -are determined by \code{fot.breaks}. A \code{log(person-years)} offset -is passed automatically to the \code{glm} call. - -\strong{Formula usage} - -The formula can be used like any ordinary \code{glm} formula. The user must -define the outcome in some manner, which is usually \code{lex.Xst} after splitting -with e.g. \code{lexpand}. The exception is the possibility of including -the baseline excess hazard terms by including the -reserved term \code{FOT} in the formula. - -For example, \code{lex.Xst != 0 ~ FOT + agegr} estimates a model with constant -excess hazards at the follow-up intervals as specified by -the pertinent breaks used in splitting \code{data}, -as well as for the different age groups. -\code{FOT} is created ad hoc if it is used in the formula. -If you leave out \code{FOT}, the hazard is effectively -assumed to be constant across the whole follow-up time. - -You can also simply use your own follow-up time interval variable that -you have created before calling \code{relpois}. However, when using -\code{FOT}, \code{relpois} automatically checks for e.g. -negative excess cases in follow-up intervals, -allowing for quickly finding splitting breaks -where model estimation is possible. It also drops any data outside the -follow-up time window. - -\strong{Splitting and merging population hazard} - -The easiest way to both split and to include population hazard information is -by using \code{\link{lexpand}}. You may also fairly easily do it by hand -by splitting first and then merging in your population hazard information. - - -\strong{Data requirements} - -The population hazard information must be available for each record and named -\code{pop.haz}. The follow-up time variable must be named \code{"fot"} e.g. -as a result of using \code{lexpand}. The \code{lex.dur} variable must also -be present, containing person-year information. -} -\examples{ -## use the simulated rectal cancer cohort -data("sire", package = "popEpi") -sire$agegr <- cut(sire$dg_age, c(0,45,60,Inf), right=FALSE) - -## usable straight away after splitting -fb <- c(0,3/12,6/12,1,2,3,4,5) -x <- lexpand(sire, birth = bi_date, entry = dg_date, - exit = ex_date, status=status, - breaks = list(fot=fb), pophaz=popmort) -rpm <- relpois(x, formula = lex.Xst \%in\% 1:2 ~ FOT + agegr) - -## some methods for glm work. e.g. test for interaction -\dontrun{ -rpm2 <- relpois(x, formula = lex.Xst \%in\% 1:2 ~ FOT*agegr) -anova(rpm, rpm2, test="LRT") -AIC(rpm, rpm2) -## update won't work currently -} -} -\seealso{ -\code{\link{lexpand}}, \code{\link{poisson}}, \code{\link{glm}} - -Other main functions: \code{\link{rate}}, - \code{\link{relpois_ag}}, \code{\link{sirspline}}, - \code{\link{sir}}, \code{\link{survmean}}, - \code{\link{survtab_ag}}, \code{\link{survtab}} - -Other relpois functions: \code{\link{RPL}}, - \code{\link{relpois_ag}}, \code{\link{rpcurve}} -} -\author{ -Joonas Miettinen, Karri Seppa -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/relative_poisson.R +\name{relpois} +\alias{relpois} +\title{Excess hazard Poisson model} +\usage{ +relpois(data, formula, fot.breaks = NULL, subset = NULL, check = TRUE, + ...) +} +\arguments{ +\item{data}{a dataset split with e.g. \code{\link{lexpand}}; +must have expected hazard merged within} + +\item{formula}{a formula which is passed on to \code{glm}; see Details} + +\item{fot.breaks}{optional; a numeric vector of [a,b) breaks to specify +survival intervals over the follow-up time; if \code{NULL}, the +existing breaks along the mandatory \code{fot} time scale in \code{data} +are used (e.g. the breaks for \code{fot} supplied to \code{lexpand})} + +\item{subset}{a logical vector or condition; e.g. \code{subset = sex == 1}; +limits the data before estimation} + +\item{check}{logical; if \code{TRUE}, tabulates excess cases by all +factor variables in the formula to check for negative / \code{NA} +excess cases before fitting the GLM} + +\item{...}{any argument passed on to \code{glm}} +} +\value{ +A \code{glm} object created using a custom Poisson family construct. Some +\code{glm} methods are applicable. +} +\description{ +Estimate a Poisson piecewise constant excess +hazards model +} +\details{ +\strong{Basics} + +\code{relpois} employs a custom link function of the Poisson variety +to estimate piecewise constant parametric excess hazards. The pieces +are determined by \code{fot.breaks}. A \code{log(person-years)} offset +is passed automatically to the \code{glm} call. + +\strong{Formula usage} + +The formula can be used like any ordinary \code{glm} formula. The user must +define the outcome in some manner, which is usually \code{lex.Xst} after splitting +with e.g. \code{lexpand}. The exception is the possibility of including +the baseline excess hazard terms by including the +reserved term \code{FOT} in the formula. + +For example, \code{lex.Xst != 0 ~ FOT + agegr} estimates a model with constant +excess hazards at the follow-up intervals as specified by +the pertinent breaks used in splitting \code{data}, +as well as for the different age groups. +\code{FOT} is created ad hoc if it is used in the formula. +If you leave out \code{FOT}, the hazard is effectively +assumed to be constant across the whole follow-up time. + +You can also simply use your own follow-up time interval variable that +you have created before calling \code{relpois}. However, when using +\code{FOT}, \code{relpois} automatically checks for e.g. +negative excess cases in follow-up intervals, +allowing for quickly finding splitting breaks +where model estimation is possible. It also drops any data outside the +follow-up time window. + +\strong{Splitting and merging population hazard} + +The easiest way to both split and to include population hazard information is +by using \code{\link{lexpand}}. You may also fairly easily do it by hand +by splitting first and then merging in your population hazard information. + + +\strong{Data requirements} + +The population hazard information must be available for each record and named +\code{pop.haz}. The follow-up time variable must be named \code{"fot"} e.g. +as a result of using \code{lexpand}. The \code{lex.dur} variable must also +be present, containing person-year information. +} +\examples{ +## use the simulated rectal cancer cohort +data("sire", package = "popEpi") +sire$agegr <- cut(sire$dg_age, c(0,45,60,Inf), right=FALSE) + +## usable straight away after splitting +fb <- c(0,3/12,6/12,1,2,3,4,5) +x <- lexpand(sire, birth = bi_date, entry = dg_date, + exit = ex_date, status=status, + breaks = list(fot=fb), pophaz=popmort) +rpm <- relpois(x, formula = lex.Xst \%in\% 1:2 ~ FOT + agegr) + +## some methods for glm work. e.g. test for interaction +\dontrun{ +rpm2 <- relpois(x, formula = lex.Xst \%in\% 1:2 ~ FOT*agegr) +anova(rpm, rpm2, test="LRT") +AIC(rpm, rpm2) +## update won't work currently +} +} +\seealso{ +\code{\link{lexpand}}, \code{\link{poisson}}, \code{\link{glm}} + +Other main functions: \code{\link{rate}}, + \code{\link{relpois_ag}}, \code{\link{sirspline}}, + \code{\link{sir}}, \code{\link{survmean}}, + \code{\link{survtab_ag}}, \code{\link{survtab}} + +Other relpois functions: \code{\link{RPL}}, + \code{\link{relpois_ag}}, \code{\link{rpcurve}} +} +\author{ +Joonas Miettinen, Karri Seppa +} diff --git a/man/relpois_ag.Rd b/man/relpois_ag.Rd index 2928a89..4267c52 100644 --- a/man/relpois_ag.Rd +++ b/man/relpois_ag.Rd @@ -1,94 +1,94 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/relative_poisson.R -\name{relpois_ag} -\alias{relpois_ag} -\title{Excess hazard Poisson model} -\usage{ -relpois_ag(formula, data, d.exp, offset = NULL, breaks = NULL, - subset = NULL, piecewise = TRUE, check = TRUE, ...) -} -\arguments{ -\item{formula}{a formula with the counts of events as the response. -Passed on to \code{glm}. May contain usage of the \code{offset()} function -instead of supplying the offset for the Poisson model via the argument -\code{offset}.} - -\item{data}{an \code{aggre} object (an aggregated data set; -see \code{\link{as.aggre}} and \code{\link{aggre}})} - -\item{d.exp}{the counts of expected cases. Mandatory. -E.g. \code{d.exp = EXC_CASES}, where \code{EXC_CASES} is a column in data.} - -\item{offset}{the offset for the Poisson model, supplied as e.g. -\code{offset = log(PTIME)}, where \code{PTIME} is a subject-time -variable in data. Not mandatory, but almost always should be supplied.} - -\item{breaks}{optional; a numeric vector of [a,b) breaks to specify -survival intervals over the follow-up time; if \code{NULL}, the -existing breaks along the mandatory time scale mentioned in \code{formula} -are used} - -\item{subset}{a logical vector or condition; e.g. \code{subset = sex == 1}; -limits the data before estimation} - -\item{piecewise}{\code{logical}; if \code{TRUE}, and if any time scale -from data is used (mentioned) in the formula, the time scale is -transformed into a factor variable indicating intervals on the time scale. -Otherwise the time scale left as it is, usually a numeric variable. -E.g. if \code{formula = counts ~ TS1*VAR1}, \code{TS1} is transformed -into a factor before fitting model.} - -\item{check}{\code{logical}; if \code{TRUE}, performs check on the -negativity excess cases by factor-like covariates in formula - -negative excess cases will very likely lead to non-converging model} - -\item{...}{any other argument passed on to \code{\link[stats]{glm}} such as -\code{control} or \code{weights}} -} -\value{ -A \code{relpois} object created using a custom Poisson family construct. -} -\description{ -Estimate a Poisson Piecewise Constant Excess -Hazards Model -} -\examples{ -## use the simulated rectal cancer cohort -data(sire, package = "popEpi") -sire$agegr <- cut(sire$dg_age, c(0,45,60,Inf), right=FALSE) - -## create aggregated example data -fb <- c(0,3/12,6/12,1,2,3,4,5) -x <- lexpand(sire, birth = bi_date, entry = dg_date, - exit = ex_date, status=status \%in\% 1:2, - breaks = list(fot=fb), - pophaz=popmort, pp = FALSE, - aggre = list(agegr, fot)) - -## fit model using aggregated data -rpm <- relpois_ag(formula = from0to1 ~ fot + agegr, data = x, - d.exp = d.exp, offset = log(pyrs)) -summary(rpm) - -## the usual functions for handling glm models work -rpm2 <- update(rpm, . ~ fot*agegr) -anova(rpm, rpm2, test="LRT") -AIC(rpm, rpm2) - -## other features such as residuals or predicting are not guaranteed -## to work as intended. -} -\seealso{ -\code{\link{lexpand}}, \code{\link{poisson}}, \code{\link{glm}} - -Other main functions: \code{\link{rate}}, - \code{\link{relpois}}, \code{\link{sirspline}}, - \code{\link{sir}}, \code{\link{survmean}}, - \code{\link{survtab_ag}}, \code{\link{survtab}} - -Other relpois functions: \code{\link{RPL}}, - \code{\link{relpois}}, \code{\link{rpcurve}} -} -\author{ -Joonas Miettinen, Karri Seppa -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/relative_poisson.R +\name{relpois_ag} +\alias{relpois_ag} +\title{Excess hazard Poisson model} +\usage{ +relpois_ag(formula, data, d.exp, offset = NULL, breaks = NULL, + subset = NULL, piecewise = TRUE, check = TRUE, ...) +} +\arguments{ +\item{formula}{a formula with the counts of events as the response. +Passed on to \code{glm}. May contain usage of the \code{offset()} function +instead of supplying the offset for the Poisson model via the argument +\code{offset}.} + +\item{data}{an \code{aggre} object (an aggregated data set; +see \code{\link{as.aggre}} and \code{\link{aggre}})} + +\item{d.exp}{the counts of expected cases. Mandatory. +E.g. \code{d.exp = EXC_CASES}, where \code{EXC_CASES} is a column in data.} + +\item{offset}{the offset for the Poisson model, supplied as e.g. +\code{offset = log(PTIME)}, where \code{PTIME} is a subject-time +variable in data. Not mandatory, but almost always should be supplied.} + +\item{breaks}{optional; a numeric vector of [a,b) breaks to specify +survival intervals over the follow-up time; if \code{NULL}, the +existing breaks along the mandatory time scale mentioned in \code{formula} +are used} + +\item{subset}{a logical vector or condition; e.g. \code{subset = sex == 1}; +limits the data before estimation} + +\item{piecewise}{\code{logical}; if \code{TRUE}, and if any time scale +from data is used (mentioned) in the formula, the time scale is +transformed into a factor variable indicating intervals on the time scale. +Otherwise the time scale left as it is, usually a numeric variable. +E.g. if \code{formula = counts ~ TS1*VAR1}, \code{TS1} is transformed +into a factor before fitting model.} + +\item{check}{\code{logical}; if \code{TRUE}, performs check on the +negativity excess cases by factor-like covariates in formula - +negative excess cases will very likely lead to non-converging model} + +\item{...}{any other argument passed on to \code{\link[stats]{glm}} such as +\code{control} or \code{weights}} +} +\value{ +A \code{relpois} object created using a custom Poisson family construct. +} +\description{ +Estimate a Poisson Piecewise Constant Excess +Hazards Model +} +\examples{ +## use the simulated rectal cancer cohort +data(sire, package = "popEpi") +sire$agegr <- cut(sire$dg_age, c(0,45,60,Inf), right=FALSE) + +## create aggregated example data +fb <- c(0,3/12,6/12,1,2,3,4,5) +x <- lexpand(sire, birth = bi_date, entry = dg_date, + exit = ex_date, status=status \%in\% 1:2, + breaks = list(fot=fb), + pophaz=popmort, pp = FALSE, + aggre = list(agegr, fot)) + +## fit model using aggregated data +rpm <- relpois_ag(formula = from0to1 ~ fot + agegr, data = x, + d.exp = d.exp, offset = log(pyrs)) +summary(rpm) + +## the usual functions for handling glm models work +rpm2 <- update(rpm, . ~ fot*agegr) +anova(rpm, rpm2, test="LRT") +AIC(rpm, rpm2) + +## other features such as residuals or predicting are not guaranteed +## to work as intended. +} +\seealso{ +\code{\link{lexpand}}, \code{\link{poisson}}, \code{\link{glm}} + +Other main functions: \code{\link{rate}}, + \code{\link{relpois}}, \code{\link{sirspline}}, + \code{\link{sir}}, \code{\link{survmean}}, + \code{\link{survtab_ag}}, \code{\link{survtab}} + +Other relpois functions: \code{\link{RPL}}, + \code{\link{relpois}}, \code{\link{rpcurve}} +} +\author{ +Joonas Miettinen, Karri Seppa +} diff --git a/man/robust_values.Rd b/man/robust_values.Rd index 2739879..3af3dcd 100644 --- a/man/robust_values.Rd +++ b/man/robust_values.Rd @@ -1,45 +1,45 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utility_functions.R -\name{robust_values} -\alias{robust_values} -\title{Convert values to numeric robustly} -\usage{ -robust_values(num.values, force = FALSE, messages = TRUE) -} -\arguments{ -\item{num.values}{values to convert to numeric} - -\item{force}{logical; if \code{TRUE}, returns a vector of values where values that cannot be interpreted as numeric are -set to \code{NA}; if \code{FALSE}, returns the original vector and gives a warning if any value cannot be interpreted as -numeric.} - -\item{messages}{logical; if \code{TRUE}, returns a message of what was done with the \code{num.values}} -} -\description{ -Brute force solution for ensuring a variable is numeric by -coercing a variable of any type first to factor and then to numeric -} -\note{ -Returns \code{NULL} if given \code{num.values} is \code{NULL}. -} -\examples{ -## this works -values <- c("1", "3", "5") -values <- robust_values(values) - -## this works -values <- c("1", "3", "5", NA) -values <- robust_values(values) - -## this returns originals -values <- c("1", "3", "5", "a") -values <- robust_values(values) - -## this forces "a" to NA and works otherwise -values <- c("1", "3", "5", "a") -values <- robust_values(values, force=TRUE) - -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utility_functions.R +\name{robust_values} +\alias{robust_values} +\title{Convert values to numeric robustly} +\usage{ +robust_values(num.values, force = FALSE, messages = TRUE) +} +\arguments{ +\item{num.values}{values to convert to numeric} + +\item{force}{logical; if \code{TRUE}, returns a vector of values where values that cannot be interpreted as numeric are +set to \code{NA}; if \code{FALSE}, returns the original vector and gives a warning if any value cannot be interpreted as +numeric.} + +\item{messages}{logical; if \code{TRUE}, returns a message of what was done with the \code{num.values}} +} +\description{ +Brute force solution for ensuring a variable is numeric by +coercing a variable of any type first to factor and then to numeric +} +\note{ +Returns \code{NULL} if given \code{num.values} is \code{NULL}. +} +\examples{ +## this works +values <- c("1", "3", "5") +values <- robust_values(values) + +## this works +values <- c("1", "3", "5", NA) +values <- robust_values(values) + +## this returns originals +values <- c("1", "3", "5", "a") +values <- robust_values(values) + +## this forces "a" to NA and works otherwise +values <- c("1", "3", "5", "a") +values <- robust_values(values, force=TRUE) + +} +\author{ +Joonas Miettinen +} diff --git a/man/rpcurve.Rd b/man/rpcurve.Rd index 101a194..fb0b2d3 100644 --- a/man/rpcurve.Rd +++ b/man/rpcurve.Rd @@ -1,71 +1,71 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/relative_poisson_net_survival.R -\name{rpcurve} -\alias{rpcurve} -\title{Marginal piecewise parametric relative survival curve} -\usage{ -rpcurve(object = NULL) -} -\arguments{ -\item{object}{a \code{relpois} object} -} -\description{ -Fit a marginal relative survival curve based on a \code{relpois} fit -} -\details{ -\pkg{popEpi} version 0.2.1 supported confidence intervals but due to lack -of testing this is disabled until the intervals are subjected to more rigorous testing. - -Currently only estimates a marginal curve, i.e. the average of all -possible individual curves. - -Only supported when the reserved \code{FOT} variable was used in \code{relpois}. -Computes a curve for each unique combination of covariates (e.g. 4 sets) -and returns a weighted average curve based on the counts -of subjects for each combination (e.g. 1000, 125, 50, 25 respectively). -Fairly fast when only factor variables have been used, otherwise -go get a cup of coffee. - -If delayed entry is present in data due to period analysis limiting, -the marginal curve is constructed only for those whose follow-up started -in the respective period. -} -\examples{ -\dontrun{ -## use the simulated rectal cancer cohort -data("sire", package = "popEpi") -ab <- c(0,45,55,65,70,Inf) -sire$agegr <- cut(sire$dg_age, breaks = ab, right = FALSE) - -BL <- list(fot= seq(0,10,1/12)) -pm <- data.frame(popEpi::popmort) -x <- lexpand(sire, breaks=BL, pophaz=pm, - birth = bi_date, - entry = dg_date, exit = ex_date, - status = status \%in\% 1:2) - -rpm <- relpois(x, formula = lex.Xst \%in\% 1:2 ~ -1+ FOT + agegr, - fot.breaks=c(0,0.25,0.5,1:8,10)) -pmc <- rpcurve(rpm) - -## compare with non-parametric estimates -names(pm) <- c("sex", "per", "age", "haz") -x$agegr <- cut(x$dg_age, c(0,45,55,65,75,Inf), right = FALSE) -st <- survtab(fot ~ adjust(agegr), data = x, weights = "internal", - pophaz = pm) - - -plot(st, y = "r.e2.as") -lines(y = pmc$est, x = pmc$Tstop, col="red") -} - - - -} -\seealso{ -Other relpois functions: \code{\link{RPL}}, - \code{\link{relpois_ag}}, \code{\link{relpois}} -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/relative_poisson_net_survival.R +\name{rpcurve} +\alias{rpcurve} +\title{Marginal piecewise parametric relative survival curve} +\usage{ +rpcurve(object = NULL) +} +\arguments{ +\item{object}{a \code{relpois} object} +} +\description{ +Fit a marginal relative survival curve based on a \code{relpois} fit +} +\details{ +\pkg{popEpi} version 0.2.1 supported confidence intervals but due to lack +of testing this is disabled until the intervals are subjected to more rigorous testing. + +Currently only estimates a marginal curve, i.e. the average of all +possible individual curves. + +Only supported when the reserved \code{FOT} variable was used in \code{relpois}. +Computes a curve for each unique combination of covariates (e.g. 4 sets) +and returns a weighted average curve based on the counts +of subjects for each combination (e.g. 1000, 125, 50, 25 respectively). +Fairly fast when only factor variables have been used, otherwise +go get a cup of coffee. + +If delayed entry is present in data due to period analysis limiting, +the marginal curve is constructed only for those whose follow-up started +in the respective period. +} +\examples{ +\dontrun{ +## use the simulated rectal cancer cohort +data("sire", package = "popEpi") +ab <- c(0,45,55,65,70,Inf) +sire$agegr <- cut(sire$dg_age, breaks = ab, right = FALSE) + +BL <- list(fot= seq(0,10,1/12)) +pm <- data.frame(popEpi::popmort) +x <- lexpand(sire, breaks=BL, pophaz=pm, + birth = bi_date, + entry = dg_date, exit = ex_date, + status = status \%in\% 1:2) + +rpm <- relpois(x, formula = lex.Xst \%in\% 1:2 ~ -1+ FOT + agegr, + fot.breaks=c(0,0.25,0.5,1:8,10)) +pmc <- rpcurve(rpm) + +## compare with non-parametric estimates +names(pm) <- c("sex", "per", "age", "haz") +x$agegr <- cut(x$dg_age, c(0,45,55,65,75,Inf), right = FALSE) +st <- survtab(fot ~ adjust(agegr), data = x, weights = "internal", + pophaz = pm) + + +plot(st, y = "r.e2.as") +lines(y = pmc$est, x = pmc$Tstop, col="red") +} + + + +} +\seealso{ +Other relpois functions: \code{\link{RPL}}, + \code{\link{relpois_ag}}, \code{\link{relpois}} +} +\author{ +Joonas Miettinen +} diff --git a/man/setaggre.Rd b/man/setaggre.Rd index c75286f..cf49016 100644 --- a/man/setaggre.Rd +++ b/man/setaggre.Rd @@ -1,52 +1,52 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/aggregating.R -\name{setaggre} -\alias{setaggre} -\title{Set \code{aggre} attributes to an object by modifying in place} -\usage{ -setaggre(x, values = NULL, by = NULL, breaks = NULL) -} -\arguments{ -\item{x}{a \code{data.frame} or \code{data.table}} - -\item{values}{a character string vector; the names of value variables} - -\item{by}{a character string vector; the names of variables by which -\code{values} have been tabulated} - -\item{breaks}{a list of breaks, where each element is a breaks vector -as usually passed to e.g. \code{\link{splitLexisDT}}. The list must be -fully named, with the names corresponding to time scales at the aggregate -level in your data. Every unique value in a time scale variable in data must -also exist in the corresponding vector in the breaks list.} -} -\description{ -Coerces an R object to an \code{aggre} object, identifying -the object as one containing aggregated counts, person-years and other -information. \code{setaggre} modifies in place without taking any copies. -Retains all other attributes. -} -\details{ -\code{setaggre} sets \code{x} to the \code{aggre} class in place -without taking a copy as e.g. \code{as.data.frame.XXX} functions do; see e.g. -\code{\link[data.table]{setDT}}. -} -\examples{ -df <- data.frame(sex = rep(c("male", "female"), each = 5), - obs = rpois(10, rep(7,5, each=5)), - pyrs = rpois(10, lambda = 10000)) -## without any breaks -setaggre(df, values = c("obs", "pyrs"), by = "sex") -df <- data.frame(df) -df$FUT <- 0:4 -## with breaks list -setaggre(df, values = c("obs", "pyrs"), by = "sex", breaks = list(FUT = 0:5)) -} -\seealso{ -Other aggregation functions: \code{\link{aggre}}, - \code{\link{as.aggre}}, \code{\link{lexpand}}, - \code{\link{summary.aggre}} -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/aggregating.R +\name{setaggre} +\alias{setaggre} +\title{Set \code{aggre} attributes to an object by modifying in place} +\usage{ +setaggre(x, values = NULL, by = NULL, breaks = NULL) +} +\arguments{ +\item{x}{a \code{data.frame} or \code{data.table}} + +\item{values}{a character string vector; the names of value variables} + +\item{by}{a character string vector; the names of variables by which +\code{values} have been tabulated} + +\item{breaks}{a list of breaks, where each element is a breaks vector +as usually passed to e.g. \code{\link{splitLexisDT}}. The list must be +fully named, with the names corresponding to time scales at the aggregate +level in your data. Every unique value in a time scale variable in data must +also exist in the corresponding vector in the breaks list.} +} +\description{ +Coerces an R object to an \code{aggre} object, identifying +the object as one containing aggregated counts, person-years and other +information. \code{setaggre} modifies in place without taking any copies. +Retains all other attributes. +} +\details{ +\code{setaggre} sets \code{x} to the \code{aggre} class in place +without taking a copy as e.g. \code{as.data.frame.XXX} functions do; see e.g. +\code{\link[data.table]{setDT}}. +} +\examples{ +df <- data.frame(sex = rep(c("male", "female"), each = 5), + obs = rpois(10, rep(7,5, each=5)), + pyrs = rpois(10, lambda = 10000)) +## without any breaks +setaggre(df, values = c("obs", "pyrs"), by = "sex") +df <- data.frame(df) +df$FUT <- 0:4 +## with breaks list +setaggre(df, values = c("obs", "pyrs"), by = "sex", breaks = list(FUT = 0:5)) +} +\seealso{ +Other aggregation functions: \code{\link{aggre}}, + \code{\link{as.aggre}}, \code{\link{lexpand}}, + \code{\link{summary.aggre}} +} +\author{ +Joonas Miettinen +} diff --git a/man/setclass.Rd b/man/setclass.Rd index f8f50c9..94e7767 100644 --- a/man/setclass.Rd +++ b/man/setclass.Rd @@ -1,27 +1,27 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utility_functions.R -\name{setclass} -\alias{setclass} -\title{Set the class of an object (convenience function for - \code{setattr(obj, "class", CLASS)}); can add instead of replace} -\usage{ -setclass(obj, cl, add = FALSE, add.place = "first") -} -\arguments{ -\item{obj}{and object for which to set class} - -\item{cl}{class to set} - -\item{add}{if \code{TRUE}, adds \code{cl} to the -classes of the \code{obj}; otherwise replaces the class information} - -\item{add.place}{\code{"first"} or \code{"last"}; adds \code{cl} -to the front or to the back of the \code{obj}'s class vector} -} -\description{ -Sets the class of an object in place to \code{cl} -by replacing or adding -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utility_functions.R +\name{setclass} +\alias{setclass} +\title{Set the class of an object (convenience function for + \code{setattr(obj, "class", CLASS)}); can add instead of replace} +\usage{ +setclass(obj, cl, add = FALSE, add.place = "first") +} +\arguments{ +\item{obj}{and object for which to set class} + +\item{cl}{class to set} + +\item{add}{if \code{TRUE}, adds \code{cl} to the +classes of the \code{obj}; otherwise replaces the class information} + +\item{add.place}{\code{"first"} or \code{"last"}; adds \code{cl} +to the front or to the back of the \code{obj}'s class vector} +} +\description{ +Sets the class of an object in place to \code{cl} +by replacing or adding +} +\author{ +Joonas Miettinen +} diff --git a/man/setcolsnull.Rd b/man/setcolsnull.Rd index 639b4bf..3803278 100644 --- a/man/setcolsnull.Rd +++ b/man/setcolsnull.Rd @@ -1,32 +1,32 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utility_functions.R -\name{setcolsnull} -\alias{setcolsnull} -\title{Delete \code{data.table} columns if there} -\usage{ -setcolsnull(DT = NULL, delete = NULL, keep = NULL, colorder = FALSE, - soft = TRUE) -} -\arguments{ -\item{DT}{a \code{data.table}} - -\item{delete}{a character vector of column names to be deleted} - -\item{keep}{a character vector of column names to keep; -the rest will be removed; \code{keep} overrides \code{delete}} - -\item{colorder}{logical; if \code{TRUE}, also does \code{setcolorder} using -\code{keep}} - -\item{soft}{logical; if \code{TRUE}, does not cause an error if any variable -name in \code{keep} or \code{delete} is missing; \code{soft = FALSE} useful -for programming sometimes} -} -\description{ -Deletes columns in a \code{data.table} conveniently. -May only delete columns that are found silently. Sometimes useful in e.g. -\code{on.exit} expressions. -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utility_functions.R +\name{setcolsnull} +\alias{setcolsnull} +\title{Delete \code{data.table} columns if there} +\usage{ +setcolsnull(DT = NULL, delete = NULL, keep = NULL, colorder = FALSE, + soft = TRUE) +} +\arguments{ +\item{DT}{a \code{data.table}} + +\item{delete}{a character vector of column names to be deleted} + +\item{keep}{a character vector of column names to keep; +the rest will be removed; \code{keep} overrides \code{delete}} + +\item{colorder}{logical; if \code{TRUE}, also does \code{setcolorder} using +\code{keep}} + +\item{soft}{logical; if \code{TRUE}, does not cause an error if any variable +name in \code{keep} or \code{delete} is missing; \code{soft = FALSE} useful +for programming sometimes} +} +\description{ +Deletes columns in a \code{data.table} conveniently. +May only delete columns that are found silently. Sometimes useful in e.g. +\code{on.exit} expressions. +} +\author{ +Joonas Miettinen +} diff --git a/man/sibr.Rd b/man/sibr.Rd index 589aea1..deaa8f1 100644 --- a/man/sibr.Rd +++ b/man/sibr.Rd @@ -1,38 +1,38 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data_document.R -\name{sibr} -\alias{sibr} -\title{sibr - a simulated cohort of Finnish female breast cancer patients} -\format{data.table with columns -\itemize{ - \item sex - gender of the patient (1 = female) - \item bi_date - date of birth - \item dg_date - date of cancer diagnosis - \item ex_date - date of exit from follow-up (death or censoring) - \item status - status of the person at exit; 0 alive; 1 dead due to pertinent cancer; 2 dead due to other causes - \item dg_age - age at diagnosis expressed as fractional years -}} -\source{ -The Finnish Cancer Registry -} -\description{ -\code{sibr} is a simulated cohort pertaining female Finnish breast cancer patients -diagnosed between 1993-2012. Instead of actual original dates, the dates are masked -via modest randomization within several time windows. The dataset is additionally -a random sample of 10 000 cases from the pertaining time window. -} -\details{ -The closing date for the pertinent data was 2012-12-31, meaning status information was -available only up to that point --- hence the maximum possible \code{ex_date} is \code{2012-12-31}. -} -\seealso{ -Other popEpi data: \code{\link{ICSS}}, - \code{\link{meanpop_fi}}, \code{\link{popmort}}, - \code{\link{sire}}, \code{\link{stdpop101}}, - \code{\link{stdpop18}} - -Other survival data: \code{\link{sire}} -} -\author{ -Karri Seppa -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_document.R +\name{sibr} +\alias{sibr} +\title{sibr - a simulated cohort of Finnish female breast cancer patients} +\format{data.table with columns +\itemize{ + \item sex - gender of the patient (1 = female) + \item bi_date - date of birth + \item dg_date - date of cancer diagnosis + \item ex_date - date of exit from follow-up (death or censoring) + \item status - status of the person at exit; 0 alive; 1 dead due to pertinent cancer; 2 dead due to other causes + \item dg_age - age at diagnosis expressed as fractional years +}} +\source{ +The Finnish Cancer Registry +} +\description{ +\code{sibr} is a simulated cohort pertaining female Finnish breast cancer patients +diagnosed between 1993-2012. Instead of actual original dates, the dates are masked +via modest randomization within several time windows. The dataset is additionally +a random sample of 10 000 cases from the pertaining time window. +} +\details{ +The closing date for the pertinent data was 2012-12-31, meaning status information was +available only up to that point --- hence the maximum possible \code{ex_date} is \code{2012-12-31}. +} +\seealso{ +Other popEpi data: \code{\link{ICSS}}, + \code{\link{meanpop_fi}}, \code{\link{popmort}}, + \code{\link{sire}}, \code{\link{stdpop101}}, + \code{\link{stdpop18}} + +Other survival data: \code{\link{sire}} +} +\author{ +Karri Seppa +} diff --git a/man/sir.Rd b/man/sir.Rd index 3878e83..235ef09 100644 --- a/man/sir.Rd +++ b/man/sir.Rd @@ -1,194 +1,194 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/sir.R -\name{sir} -\alias{sir} -\title{Calculate SIR or SMR} -\usage{ -sir(coh.data, coh.obs, coh.pyrs, ref.data = NULL, ref.obs = NULL, - ref.pyrs = NULL, ref.rate = NULL, subset = NULL, print = NULL, - adjust = NULL, mstate = NULL, test.type = "homogeneity", - conf.type = "profile", conf.level = 0.95, EAR = FALSE) -} -\arguments{ -\item{coh.data}{aggregated cohort data, see e.g. \code{\link{lexpand}}} - -\item{coh.obs}{variable name for observed cases; quoted or unquoted. A vector when using \code{mstata}.} - -\item{coh.pyrs}{variable name for person years in cohort data; quoted or unquoted} - -\item{ref.data}{population data. Can be left NULL if \code{coh.data} is stratified in \code{print}.} - -\item{ref.obs}{variable name for observed cases; quoted or unquoted} - -\item{ref.pyrs}{variable name for person-years in population data; quoted or unquoted} - -\item{ref.rate}{population rate variable (cases/person-years). Overwrites arguments -\code{ref.pyrs} and \code{ref.obs}; quoted or unquoted} - -\item{subset}{logical condition to select data from \code{coh.data} before any computations} - -\item{print}{variable names to stratify results; quoted vector or unquoted named list with functions} - -\item{adjust}{variable names for adjusting without stratifying output; quoted vector or unquoted list} - -\item{mstate}{set column names for cause specific observations; quoted or unquoted. Relevant only -when \code{coh.obs} length is two or more. See details.} - -\item{test.type}{Test for equal SIRs. Test available are 'homogeneity' and 'trend'.} - -\item{conf.type}{Confidence interval type: 'profile'(=default), 'wald' or 'univariate'.} - -\item{conf.level}{Level of type-I error in confidence intervals, default 0.05 is 95\% CI.} - -\item{EAR}{logical; TRUE calculates Excess Absolute Risks for univariate SIRs. -(see details)} -} -\value{ -A sir-object that is a \code{data.table} with meta information in the attributes. -} -\description{ -Poisson modelled standardised incidence or mortality ratios (SIRs / SMRs) i.e. -indirect method for calculating standardised rates. SIR is a ratio of observed and expected cases. -Expected cases are derived by multiplying the strata-specific population rate with the -corresponding person-years of the cohort. -} -\details{ -\code{sir} is a comprehensive tool for modelling SIRs/SMRs with flexible -options to adjust and print SIRs, test homogeneity and utilize -multi-state data. The cohort data and the variable names for observation -counts and person-years are required. -The reference data is optional, since the cohort data -can be stratified (\code{print}) and compared to total. - - -\strong{Adjust and print} - -A SIR can be adjusted or standardised using the covariates found in both \code{coh.data} and \code{ref.data}. -Variable to adjust are given in \code{adjust}. -Variable names needs to match in both \code{coh.data} and \code{ref.data}. -Typical variables to adjust by are gender, age group and calendar period. - -\code{print} is used to stratify the SIR output. In other words, the variables -assigned to \code{print} are the covariates of the Poisson model. -Variable levels are treated as categorical. -Variables can be assigned in both \code{print} and \code{adjust}. -This means the output it adjusted and printed by these variables. - -\code{print} can also be a list of expressions. This enables changing variable -names or transforming variables with functions such as \code{cut} and \code{round}. -For example, the existing variables \code{agegroup} and \code{year} could be -transformed to new levels using \code{cut} by - -\code{print = list( age.category = cut(agegroup, breaks = c(0,50,75,100)), -year.cat = cut(year, seq(1950,2010,20)))} - - -\strong{ref.rate or ref.obs & ref.pyrs} - -The population rate variable can be given to the \code{ref.rate} parameter. -That is, when using e.g. the \code{popmort} or a comparable data file, one may -supply \code{ref.rate} instead of \code{ref.obs} and \code{ref.pyrs}, which -will be ignored if \code{ref.rate} is supplied. - - -Note that if all the stratifying variables in -\code{ref.data} are not listed in \code{adjust}, -or when the categories are otherwise combined, -the (unweighted) mean of rates is used for computing expected cases. -This might incur a small bias in comparison to when exact numbers of observations -and person-years are available. - - - -\strong{mstate} - -E.g. using \code{lexpand} it's possible to compute counts for several outcomes -so that the population at risk is same for each -outcome such as a certain kind of cancer. -The transition counts are in wide data format, -and the relevant columns can be supplied to \code{sir} -in a vector via the \code{coh.obs} argument. -The name of the corresponding new column in \code{ref.data} is given in -\code{mstate}. It's recommended to include the \code{mstate} variable in \code{adjust}, -so the corresponding information should also be available in \code{ref.data}. -More examples in sir-vignette. - -This approach is analogous to where SIRs are calculated separately their -own function calls. - - -\strong{Other parameters} - -\code{univariate} confidence intervals are calculated using exact -Poisson intervals (\code{poisson.ci}). The options \code{profile} and \code{wald} are -is based on a Poisson regression model: profile-likelihood confidence intervals -or Wald's normal-approximation. P-value is Poisson model based \code{conf.type} -or calculated using the method described by Breslow and Day. Function automatically -switches to another \code{conf.type} if calculation is not possible with a message. -Usually model fit fails if there is print stratum with zero expected values. - - -The LRT p-value tests the levels of \code{print}. The test can be either -\code{"homogeneity"}, a likelihood ratio test where the model variables defined in -\code{print} (factor) is compared to the constant model. -Option \code{"trend"} tests if the linear trend of the continuous variable in -\code{print} is significant (using model comparison). - - -\strong{EAR: Excess Absolute Risk} - -Excess Absolute Risk is a simple way to quantify the absolute difference between cohort risk and -population risk. -Make sure that the person-years are calculated accordingly before using EAR. (when using mstate) - -Formula for EAR: -\deqn{EAR = \frac{observed - expected}{person years} \times 1000.}{EAR = (obs - exp)/pyrs * 1000.} - -\strong{Data format} - -The data should be given in tabulated format. That is the number of observations -and person-years are represented for each stratum. -Note that also individual data is allowed as long as each observations, -person-years, and print and adjust variables are presented in columns. -The extra variables and levels are reduced automatically before estimating SIRs. -Example of data format: - -\tabular{rrrrr}{ - sex \tab age \tab period \tab obs \tab pyrs \cr - 0 \tab 1 \tab 2010 \tab 0 \tab 390 \cr - 0 \tab 2 \tab 2010 \tab 5 \tab 385 \cr - 1 \tab 1 \tab 2010 \tab 3 \tab 308 \cr - 1 \tab 2 \tab 2010 \tab 12 \tab 315 -} -} -\examples{ -data(popmort) -data(sire) -c <- lexpand( sire, status = status, birth = bi_date, exit = ex_date, entry = dg_date, - breaks = list(per = 1950:2013, age = 1:100, fot = c(0,10,20,Inf)), - aggre = list(fot, agegroup = age, year = per, sex) ) -## SMR due other causes: status = 2 -se <- sir( coh.data = c, coh.obs = 'from0to2', coh.pyrs = 'pyrs', - ref.data = popmort, ref.rate = 'haz', - adjust = c('agegroup', 'year', 'sex'), print = 'fot') -se -## for examples see: vignette('sir') - - -} -\seealso{ -\code{\link{lexpand}} -\href{../doc/sir.html}{A SIR calculation vignette} - -Other sir functions: \code{\link{lines.sirspline}}, - \code{\link{plot.sirspline}}, \code{\link{sir_exp}}, - \code{\link{sir_ratio}}, \code{\link{sirspline}} - -Other main functions: \code{\link{rate}}, - \code{\link{relpois_ag}}, \code{\link{relpois}}, - \code{\link{sirspline}}, \code{\link{survmean}}, - \code{\link{survtab_ag}}, \code{\link{survtab}} -} -\author{ -Matti Rantanen, Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sir.R +\name{sir} +\alias{sir} +\title{Calculate SIR or SMR} +\usage{ +sir(coh.data, coh.obs, coh.pyrs, ref.data = NULL, ref.obs = NULL, + ref.pyrs = NULL, ref.rate = NULL, subset = NULL, print = NULL, + adjust = NULL, mstate = NULL, test.type = "homogeneity", + conf.type = "profile", conf.level = 0.95, EAR = FALSE) +} +\arguments{ +\item{coh.data}{aggregated cohort data, see e.g. \code{\link{lexpand}}} + +\item{coh.obs}{variable name for observed cases; quoted or unquoted. A vector when using \code{mstata}.} + +\item{coh.pyrs}{variable name for person years in cohort data; quoted or unquoted} + +\item{ref.data}{population data. Can be left NULL if \code{coh.data} is stratified in \code{print}.} + +\item{ref.obs}{variable name for observed cases; quoted or unquoted} + +\item{ref.pyrs}{variable name for person-years in population data; quoted or unquoted} + +\item{ref.rate}{population rate variable (cases/person-years). Overwrites arguments +\code{ref.pyrs} and \code{ref.obs}; quoted or unquoted} + +\item{subset}{logical condition to select data from \code{coh.data} before any computations} + +\item{print}{variable names to stratify results; quoted vector or unquoted named list with functions} + +\item{adjust}{variable names for adjusting without stratifying output; quoted vector or unquoted list} + +\item{mstate}{set column names for cause specific observations; quoted or unquoted. Relevant only +when \code{coh.obs} length is two or more. See details.} + +\item{test.type}{Test for equal SIRs. Test available are 'homogeneity' and 'trend'.} + +\item{conf.type}{Confidence interval type: 'profile'(=default), 'wald' or 'univariate'.} + +\item{conf.level}{Level of type-I error in confidence intervals, default 0.05 is 95\% CI.} + +\item{EAR}{logical; TRUE calculates Excess Absolute Risks for univariate SIRs. +(see details)} +} +\value{ +A sir-object that is a \code{data.table} with meta information in the attributes. +} +\description{ +Poisson modelled standardised incidence or mortality ratios (SIRs / SMRs) i.e. +indirect method for calculating standardised rates. SIR is a ratio of observed and expected cases. +Expected cases are derived by multiplying the strata-specific population rate with the +corresponding person-years of the cohort. +} +\details{ +\code{sir} is a comprehensive tool for modelling SIRs/SMRs with flexible +options to adjust and print SIRs, test homogeneity and utilize +multi-state data. The cohort data and the variable names for observation +counts and person-years are required. +The reference data is optional, since the cohort data +can be stratified (\code{print}) and compared to total. + + +\strong{Adjust and print} + +A SIR can be adjusted or standardised using the covariates found in both \code{coh.data} and \code{ref.data}. +Variable to adjust are given in \code{adjust}. +Variable names needs to match in both \code{coh.data} and \code{ref.data}. +Typical variables to adjust by are gender, age group and calendar period. + +\code{print} is used to stratify the SIR output. In other words, the variables +assigned to \code{print} are the covariates of the Poisson model. +Variable levels are treated as categorical. +Variables can be assigned in both \code{print} and \code{adjust}. +This means the output it adjusted and printed by these variables. + +\code{print} can also be a list of expressions. This enables changing variable +names or transforming variables with functions such as \code{cut} and \code{round}. +For example, the existing variables \code{agegroup} and \code{year} could be +transformed to new levels using \code{cut} by + +\code{print = list( age.category = cut(agegroup, breaks = c(0,50,75,100)), +year.cat = cut(year, seq(1950,2010,20)))} + + +\strong{ref.rate or ref.obs & ref.pyrs} + +The population rate variable can be given to the \code{ref.rate} parameter. +That is, when using e.g. the \code{popmort} or a comparable data file, one may +supply \code{ref.rate} instead of \code{ref.obs} and \code{ref.pyrs}, which +will be ignored if \code{ref.rate} is supplied. + + +Note that if all the stratifying variables in +\code{ref.data} are not listed in \code{adjust}, +or when the categories are otherwise combined, +the (unweighted) mean of rates is used for computing expected cases. +This might incur a small bias in comparison to when exact numbers of observations +and person-years are available. + + + +\strong{mstate} + +E.g. using \code{lexpand} it's possible to compute counts for several outcomes +so that the population at risk is same for each +outcome such as a certain kind of cancer. +The transition counts are in wide data format, +and the relevant columns can be supplied to \code{sir} +in a vector via the \code{coh.obs} argument. +The name of the corresponding new column in \code{ref.data} is given in +\code{mstate}. It's recommended to include the \code{mstate} variable in \code{adjust}, +so the corresponding information should also be available in \code{ref.data}. +More examples in sir-vignette. + +This approach is analogous to where SIRs are calculated separately their +own function calls. + + +\strong{Other parameters} + +\code{univariate} confidence intervals are calculated using exact +Poisson intervals (\code{poisson.ci}). The options \code{profile} and \code{wald} are +is based on a Poisson regression model: profile-likelihood confidence intervals +or Wald's normal-approximation. P-value is Poisson model based \code{conf.type} +or calculated using the method described by Breslow and Day. Function automatically +switches to another \code{conf.type} if calculation is not possible with a message. +Usually model fit fails if there is print stratum with zero expected values. + + +The LRT p-value tests the levels of \code{print}. The test can be either +\code{"homogeneity"}, a likelihood ratio test where the model variables defined in +\code{print} (factor) is compared to the constant model. +Option \code{"trend"} tests if the linear trend of the continuous variable in +\code{print} is significant (using model comparison). + + +\strong{EAR: Excess Absolute Risk} + +Excess Absolute Risk is a simple way to quantify the absolute difference between cohort risk and +population risk. +Make sure that the person-years are calculated accordingly before using EAR. (when using mstate) + +Formula for EAR: +\deqn{EAR = \frac{observed - expected}{person years} \times 1000.}{EAR = (obs - exp)/pyrs * 1000.} + +\strong{Data format} + +The data should be given in tabulated format. That is the number of observations +and person-years are represented for each stratum. +Note that also individual data is allowed as long as each observations, +person-years, and print and adjust variables are presented in columns. +The extra variables and levels are reduced automatically before estimating SIRs. +Example of data format: + +\tabular{rrrrr}{ + sex \tab age \tab period \tab obs \tab pyrs \cr + 0 \tab 1 \tab 2010 \tab 0 \tab 390 \cr + 0 \tab 2 \tab 2010 \tab 5 \tab 385 \cr + 1 \tab 1 \tab 2010 \tab 3 \tab 308 \cr + 1 \tab 2 \tab 2010 \tab 12 \tab 315 +} +} +\examples{ +data(popmort) +data(sire) +c <- lexpand( sire, status = status, birth = bi_date, exit = ex_date, entry = dg_date, + breaks = list(per = 1950:2013, age = 1:100, fot = c(0,10,20,Inf)), + aggre = list(fot, agegroup = age, year = per, sex) ) +## SMR due other causes: status = 2 +se <- sir( coh.data = c, coh.obs = 'from0to2', coh.pyrs = 'pyrs', + ref.data = popmort, ref.rate = 'haz', + adjust = c('agegroup', 'year', 'sex'), print = 'fot') +se +## for examples see: vignette('sir') + + +} +\seealso{ +\code{\link{lexpand}} +\href{../doc/sir.html}{A SIR calculation vignette} + +Other sir functions: \code{\link{lines.sirspline}}, + \code{\link{plot.sirspline}}, \code{\link{sir_exp}}, + \code{\link{sir_ratio}}, \code{\link{sirspline}} + +Other main functions: \code{\link{rate}}, + \code{\link{relpois_ag}}, \code{\link{relpois}}, + \code{\link{sirspline}}, \code{\link{survmean}}, + \code{\link{survtab_ag}}, \code{\link{survtab}} +} +\author{ +Matti Rantanen, Joonas Miettinen +} diff --git a/man/sir_exp.Rd b/man/sir_exp.Rd index 4f7d62a..59c68ae 100644 --- a/man/sir_exp.Rd +++ b/man/sir_exp.Rd @@ -1,102 +1,102 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/sir.R -\name{sir_exp} -\alias{sir_exp} -\alias{sir_lex} -\alias{sir_ag} -\title{Calculate SMR} -\usage{ -sir_exp(x, obs, exp, pyrs = NULL, print = NULL, conf.type = "profile", - test.type = "homogeneity", conf.level = 0.95, subset = NULL) - -sir_lex(x, print = NULL, breaks = NULL, ...) - -sir_ag(x, obs = "from0to1", print = attr(x, "aggre.meta")$by, - exp = "d.exp", pyrs = "pyrs", ...) -} -\arguments{ -\item{x}{Data set e.g. \code{aggre} or \code{Lexis} object -(see: \code{\link{lexpand}})} - -\item{obs}{Variable name of the observed cases in the data set} - -\item{exp}{Variable name or expression for expected cases} - -\item{pyrs}{Variable name for person-years (optional)} - -\item{print}{Variables or expression to stratify the results} - -\item{conf.type}{select confidence interval type: (default=) `profile`, `wald`, `univariate`} - -\item{test.type}{Test for equal SIRs. Test available are 'homogeneity' and 'trend'} - -\item{conf.level}{Level of type-I error in confidence intervals, default 0.05 is 95\% CI} - -\item{subset}{a logical vector for subsetting data} - -\item{breaks}{a named list to split age group (age), period (per) or follow-up (fot).} - -\item{...}{pass arguments to \code{sir_exp}} -} -\value{ -A sir object -} -\description{ -Calculate Standardized Mortality Ratios (SMRs) using -a single data set that includes -observed and expected cases and additionally person-years. - -\code{sir_lex} solves SMR from a \code{\link{Lexis}} object -calculated with \code{lexpand}. - -\code{sir_ag} solves SMR from a \code{\link{aggre}} object -calculated using \code{\link{lexpand}}. -} -\details{ -These functions are intended to calculate SMRs from a single data set -that includes both observed and expected number of cases. For example utilizing the -argument \code{pop.haz} of the \code{\link{lexpand}}. - -\code{sir_lex} automatically exports the transition \code{fromXtoY} using the first -state in \code{lex.Str} as \code{0} and all other as \code{1}. No missing values -is allowed in observed, pop.haz or person-years. -} -\section{Functions}{ -\itemize{ -\item \code{sir_lex}: - -\item \code{sir_ag}: -}} - -\examples{ - -\dontrun{ -BL <- list(fot = 0:5, per = c("2003-01-01","2008-01-01", "2013-01-01")) - -## Aggregated data -x1 <- lexpand(sire, breaks = BL, status = status != 0, - birth = bi_date, entry = dg_date, exit = ex_date, - pophaz=popmort, - aggre=list(sex, period = per, surv.int = fot)) -sir_ag(x1, print = 'period') - - -# no aggreate or breaks -x2 <- lexpand(sire, status = status != 0, - birth = bi_date, entry = dg_date, exit = ex_date, - pophaz=popmort) -sir_lex(x2, breaks = BL, print = 'per') -} - -} -\seealso{ -\code{\link{lexpand}} -\href{../doc/sir.html}{A SIR calculation vignette} - -Other sir functions: \code{\link{lines.sirspline}}, - \code{\link{plot.sirspline}}, \code{\link{sir_ratio}}, - \code{\link{sirspline}}, \code{\link{sir}} -} -\author{ -Matti Rantanen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sir.R +\name{sir_exp} +\alias{sir_exp} +\alias{sir_lex} +\alias{sir_ag} +\title{Calculate SMR} +\usage{ +sir_exp(x, obs, exp, pyrs = NULL, print = NULL, conf.type = "profile", + test.type = "homogeneity", conf.level = 0.95, subset = NULL) + +sir_lex(x, print = NULL, breaks = NULL, ...) + +sir_ag(x, obs = "from0to1", print = attr(x, "aggre.meta")$by, + exp = "d.exp", pyrs = "pyrs", ...) +} +\arguments{ +\item{x}{Data set e.g. \code{aggre} or \code{Lexis} object +(see: \code{\link{lexpand}})} + +\item{obs}{Variable name of the observed cases in the data set} + +\item{exp}{Variable name or expression for expected cases} + +\item{pyrs}{Variable name for person-years (optional)} + +\item{print}{Variables or expression to stratify the results} + +\item{conf.type}{select confidence interval type: (default=) `profile`, `wald`, `univariate`} + +\item{test.type}{Test for equal SIRs. Test available are 'homogeneity' and 'trend'} + +\item{conf.level}{Level of type-I error in confidence intervals, default 0.05 is 95\% CI} + +\item{subset}{a logical vector for subsetting data} + +\item{breaks}{a named list to split age group (age), period (per) or follow-up (fot).} + +\item{...}{pass arguments to \code{sir_exp}} +} +\value{ +A sir object +} +\description{ +Calculate Standardized Mortality Ratios (SMRs) using +a single data set that includes +observed and expected cases and additionally person-years. + +\code{sir_lex} solves SMR from a \code{\link{Lexis}} object +calculated with \code{lexpand}. + +\code{sir_ag} solves SMR from a \code{\link{aggre}} object +calculated using \code{\link{lexpand}}. +} +\details{ +These functions are intended to calculate SMRs from a single data set +that includes both observed and expected number of cases. For example utilizing the +argument \code{pop.haz} of the \code{\link{lexpand}}. + +\code{sir_lex} automatically exports the transition \code{fromXtoY} using the first +state in \code{lex.Str} as \code{0} and all other as \code{1}. No missing values +is allowed in observed, pop.haz or person-years. +} +\section{Functions}{ +\itemize{ +\item \code{sir_lex}: + +\item \code{sir_ag}: +}} + +\examples{ + +\dontrun{ +BL <- list(fot = 0:5, per = c("2003-01-01","2008-01-01", "2013-01-01")) + +## Aggregated data +x1 <- lexpand(sire, breaks = BL, status = status != 0, + birth = bi_date, entry = dg_date, exit = ex_date, + pophaz=popmort, + aggre=list(sex, period = per, surv.int = fot)) +sir_ag(x1, print = 'period') + + +# no aggreate or breaks +x2 <- lexpand(sire, status = status != 0, + birth = bi_date, entry = dg_date, exit = ex_date, + pophaz=popmort) +sir_lex(x2, breaks = BL, print = 'per') +} + +} +\seealso{ +\code{\link{lexpand}} +\href{../doc/sir.html}{A SIR calculation vignette} + +Other sir functions: \code{\link{lines.sirspline}}, + \code{\link{plot.sirspline}}, \code{\link{sir_ratio}}, + \code{\link{sirspline}}, \code{\link{sir}} +} +\author{ +Matti Rantanen +} diff --git a/man/sir_ratio.Rd b/man/sir_ratio.Rd index 209dcde..ef38cbe 100644 --- a/man/sir_ratio.Rd +++ b/man/sir_ratio.Rd @@ -1,81 +1,81 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/sir_utils.R -\name{sir_ratio} -\alias{sir_ratio} -\title{Confidence intervals for the ratio of two SIRs/SMRs} -\usage{ -sir_ratio(x, y, digits = 3, alternative = "two.sided", conf.level = 0.95, - type = "exact") -} -\arguments{ -\item{x}{a sir-object or a vector of two; observed and expected cases.} - -\item{y}{a sir-object or a vector of two; observed and expected cases.} - -\item{digits}{number of digits in the output} - -\item{alternative}{The null-hypothesis test: (default:) \code{two.sided}, \code{less}, \code{greater}} - -\item{conf.level}{the type-I error in confidence intervals, default 0.95 for 95\% CI.} - -\item{type}{How the binomial confidence intervals are calculated (default:) \code{exact} or \code{asymptotic}.} -} -\value{ -A vector length of three: sir_ratio, and lower and upper confidence intervals. -} -\description{ -Calculate ratio of two SIRs/SMRs and the confidence intervals of the ratio. -} -\details{ -Function works with pooled sir-objects i.e. the \code{print} argument in \code{sir} is ignored. -Also \code{x} and \code{y} can be a vector of two where first index is the -observed cases and second is expected cases (see examples). -Note that the ratio of two SIR's is only applicable when the age distributions are similar -in both populations. - -\strong{Formula} - -The observed number of first sir \code{O1} is considered as a Binomial variable with sample -size of \code{O1+O2}. The confidence intervals for Binomial proportion \code{A} -is solved using \code{exact} or \code{asymptotic} -method. Now the CI for ratio \code{O1/O2} is \code{B = A/(1 - A)}. And further the CI for SIR/SMR -is B*E2/E1. (Ederer and Mantel) -} -\note{ -Parameter \code{alternative} is always \code{two.sided} when parameter -\code{type} is set to \code{asymptotic}. -} -\examples{ -## Ratio for sir-object and the same values given manually: - - -## create example dataset -dt1 <- data.frame(obs = rep(c(5,7), 10), - pyrs = rep(c(250,300,350,400), 5), - var = 1:20) -Ref <- data.frame(obs = rep(c(50,70,80,100), 5), - pyrs = rep(c(2500,3000,3500,4000), 5), - var = 1:20) -## sir using the function -s1 <- sir(coh.data = dt1, coh.obs = obs, coh.pyrs = pyrs, - ref.data = Ref, ref.obs = obs, ref.pyrs = pyrs, - adjust = var) - -## Ratio is simply 1: -sir_ratio(s1, c(120, 150)) - -} -\references{ -Statistics with Confidence: Confidence Intervals and Statistical Guidelines, Douglas Altman -} -\seealso{ -\code{\link{sir}} -\href{../doc/sir.html}{A SIR calculation vignette} - -Other sir functions: \code{\link{lines.sirspline}}, - \code{\link{plot.sirspline}}, \code{\link{sir_exp}}, - \code{\link{sirspline}}, \code{\link{sir}} -} -\author{ -Matti Rantanen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sir_utils.R +\name{sir_ratio} +\alias{sir_ratio} +\title{Confidence intervals for the ratio of two SIRs/SMRs} +\usage{ +sir_ratio(x, y, digits = 3, alternative = "two.sided", conf.level = 0.95, + type = "exact") +} +\arguments{ +\item{x}{a sir-object or a vector of two; observed and expected cases.} + +\item{y}{a sir-object or a vector of two; observed and expected cases.} + +\item{digits}{number of digits in the output} + +\item{alternative}{The null-hypothesis test: (default:) \code{two.sided}, \code{less}, \code{greater}} + +\item{conf.level}{the type-I error in confidence intervals, default 0.95 for 95\% CI.} + +\item{type}{How the binomial confidence intervals are calculated (default:) \code{exact} or \code{asymptotic}.} +} +\value{ +A vector length of three: sir_ratio, and lower and upper confidence intervals. +} +\description{ +Calculate ratio of two SIRs/SMRs and the confidence intervals of the ratio. +} +\details{ +Function works with pooled sir-objects i.e. the \code{print} argument in \code{sir} is ignored. +Also \code{x} and \code{y} can be a vector of two where first index is the +observed cases and second is expected cases (see examples). +Note that the ratio of two SIR's is only applicable when the age distributions are similar +in both populations. + +\strong{Formula} + +The observed number of first sir \code{O1} is considered as a Binomial variable with sample +size of \code{O1+O2}. The confidence intervals for Binomial proportion \code{A} +is solved using \code{exact} or \code{asymptotic} +method. Now the CI for ratio \code{O1/O2} is \code{B = A/(1 - A)}. And further the CI for SIR/SMR +is B*E2/E1. (Ederer and Mantel) +} +\note{ +Parameter \code{alternative} is always \code{two.sided} when parameter +\code{type} is set to \code{asymptotic}. +} +\examples{ +## Ratio for sir-object and the same values given manually: + + +## create example dataset +dt1 <- data.frame(obs = rep(c(5,7), 10), + pyrs = rep(c(250,300,350,400), 5), + var = 1:20) +Ref <- data.frame(obs = rep(c(50,70,80,100), 5), + pyrs = rep(c(2500,3000,3500,4000), 5), + var = 1:20) +## sir using the function +s1 <- sir(coh.data = dt1, coh.obs = obs, coh.pyrs = pyrs, + ref.data = Ref, ref.obs = obs, ref.pyrs = pyrs, + adjust = var) + +## Ratio is simply 1: +sir_ratio(s1, c(120, 150)) + +} +\references{ +Statistics with Confidence: Confidence Intervals and Statistical Guidelines, Douglas Altman +} +\seealso{ +\code{\link{sir}} +\href{../doc/sir.html}{A SIR calculation vignette} + +Other sir functions: \code{\link{lines.sirspline}}, + \code{\link{plot.sirspline}}, \code{\link{sir_exp}}, + \code{\link{sirspline}}, \code{\link{sir}} +} +\author{ +Matti Rantanen +} diff --git a/man/sire.Rd b/man/sire.Rd index e663b70..2c33c58 100644 --- a/man/sire.Rd +++ b/man/sire.Rd @@ -1,37 +1,37 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data_document.R -\name{sire} -\alias{sire} -\title{sire - a simulated cohort of Finnish female rectal cancer patients} -\format{data.table with columns -\itemize{ - \item sex - gender of the patient (1 = female) - \item bi_date - date of birth - \item dg_date - date of cancer diagnosis - \item ex_date - date of exit from follow-up (death or censoring) - \item status - status of the person at exit; 0 alive; 1 dead due to pertinent cancer; 2 dead due to other causes - \item dg_age - age at diagnosis expressed as fractional years -}} -\source{ -The Finnish Cancer Registry -} -\description{ -\code{sire} is a simulated cohort pertaining female Finnish rectal cancer patients -diagnosed between 1993-2012. Instead of actual original dates, the dates are masked -via modest randomization within several time windows. -} -\details{ -The closing date for the pertinent data was 2012-12-31, meaning status information was -available only up to that point --- hence the maximum possible \code{ex_date} is \code{2012-12-31}. -} -\seealso{ -Other popEpi data: \code{\link{ICSS}}, - \code{\link{meanpop_fi}}, \code{\link{popmort}}, - \code{\link{sibr}}, \code{\link{stdpop101}}, - \code{\link{stdpop18}} - -Other survival data: \code{\link{sibr}} -} -\author{ -Karri Seppa -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_document.R +\name{sire} +\alias{sire} +\title{sire - a simulated cohort of Finnish female rectal cancer patients} +\format{data.table with columns +\itemize{ + \item sex - gender of the patient (1 = female) + \item bi_date - date of birth + \item dg_date - date of cancer diagnosis + \item ex_date - date of exit from follow-up (death or censoring) + \item status - status of the person at exit; 0 alive; 1 dead due to pertinent cancer; 2 dead due to other causes + \item dg_age - age at diagnosis expressed as fractional years +}} +\source{ +The Finnish Cancer Registry +} +\description{ +\code{sire} is a simulated cohort pertaining female Finnish rectal cancer patients +diagnosed between 1993-2012. Instead of actual original dates, the dates are masked +via modest randomization within several time windows. +} +\details{ +The closing date for the pertinent data was 2012-12-31, meaning status information was +available only up to that point --- hence the maximum possible \code{ex_date} is \code{2012-12-31}. +} +\seealso{ +Other popEpi data: \code{\link{ICSS}}, + \code{\link{meanpop_fi}}, \code{\link{popmort}}, + \code{\link{sibr}}, \code{\link{stdpop101}}, + \code{\link{stdpop18}} + +Other survival data: \code{\link{sibr}} +} +\author{ +Karri Seppa +} diff --git a/man/sirspline.Rd b/man/sirspline.Rd index d0b70dc..5943b1f 100644 --- a/man/sirspline.Rd +++ b/man/sirspline.Rd @@ -1,141 +1,141 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/sir.R -\name{sirspline} -\alias{sirspline} -\title{Estimate splines for SIR or SMR} -\usage{ -sirspline(coh.data, coh.obs, coh.pyrs, ref.data = NULL, ref.obs = NULL, - ref.pyrs = NULL, ref.rate = NULL, subset = NULL, print = NULL, - adjust = NULL, mstate = NULL, spline, knots = NULL, - reference.points = NULL, dependent.splines = TRUE) -} -\arguments{ -\item{coh.data}{cohort data with observations and at risk time variables} - -\item{coh.obs}{variable name for observed cases} - -\item{coh.pyrs}{variable name for person-years in cohort data} - -\item{ref.data}{aggregated population data} - -\item{ref.obs}{variable name for observed cases} - -\item{ref.pyrs}{variable name for person-years in population data} - -\item{ref.rate}{population rate observed/expected. This overwrites the parameters -\code{ref.pyrs} and \code{ref.obs}.} - -\item{subset}{logical condition to subset \code{coh.data} before any computations} - -\item{print}{variable names for which to estimate SIRs/SMRs and -associated splines separately} - -\item{adjust}{variable names for adjusting the expected cases} - -\item{mstate}{set column names for cause specific observations. Relevant only -when coh.obs length is two or more. See help for \code{sir}.} - -\item{spline}{variable name(s) for the splines} - -\item{knots}{number knots (vector), pre-defined knots (list of vectors) or for optimal number of knots left NULL} - -\item{reference.points}{fixed reference values for rate ratios. If left \code{NULL} -the smallest value is the reference point (where SIR = 1). -Ignored if \code{dependent.splines = FALSE}} - -\item{dependent.splines}{logical; if TRUE, all splines are fitted in same model.} -} -\value{ -A list of data.frames and vectors. -Three spline estimates are named as \code{spline.est.A/B/C} and the corresponding values -in \code{spline.seq.A/B/C} for manual plotting -} -\description{ -Splines for standardised incidence or mortality ratio. A useful -tool to e.g. check whether a constant SIR can be assumed for all calendar periods, -age groups or follow-up intervals. Splines can be fitted for these time dimensions -separately or in the same model. -} -\details{ -See \code{\link{sir}} for help on SIR/SMR estimation in general; usage of splines -is discussed below. - -\strong{The spline variables} - -The model can include one, two or three splines variables. -Variables can be included in the same model selecting \code{dependent.splines = TRUE} -and SIR ratios are calculated (first one is the SIR, others SIR ratios). -Reference points vector can be set via \code{reference.points} -where first element of the vector is the reference point for first ratio. - -Variable(s) to fit splines are given as a vector in argument \code{spline}. -Order will affect the results. - - -\strong{dependent.splines} - -By default dependent.splines is FALSE and all splines are fitted in separate models. -If TRUE, the first variable in \code{spline} is a function of a SIR and other(s) are ratios. - -\strong{knots} - -There are three options to set knots to splines: - -Set the number of knots for each spline variable with a \strong{vector}. -The knots are automatically placed to the quantiles of observed cases in cohort data. -The first and last knots are always the maximum and minimum values, so knot -value needs to be at least two. - -Predefined knot places can be set with a \strong{list} of vectors. -The vector for each spline in the list specifies the knot places. The lowest -and the largest values are the boundary knots and these should be checked beforehand. - -If \code{knots} is left \strong{NULL}, the model searches the optimal number -of knots by model AIC by fitting models iteratively from 2 to 15 knots and -the one with smallest AIC is selected. -If \code{dependent.splines = TRUE}, the number of knots is searched by fitting each spline -variable separately. - - -\strong{print} - -Splines can be stratified by the levels of variable given in \code{print}. If -\code{print} is a vector, only the first variable is accounted for. The knots -are placed globally for all levels of \code{print}. This also ensures that the likelihood -ratio test is valid. -Splines are also fitted independently for each level of \code{print}. -This allows for searching interactions, e.g. by fitting spline for period -(\code{splines='period'}) for each age group (\code{print = 'agegroup'}). - - -\strong{p-values} - -The output p-value is a test of whether the splines are equal (homogenous) -at different levels of \code{print}. -The test is based on the likelihood ratio test, where the full model -includes \code{print} and is -compared to a null model without it. -When \code{(dependent.splines = TRUE)} the p-value returned is a global p-value. -Otherwise the p-value is spline-specific. -} -\examples{ -\dontrun{ -## for examples see: vignette('sir') -} -} -\seealso{ -\code{\link{splitMulti}} -\href{../doc/sir.html}{A SIR calculation vignette} - -Other sir functions: \code{\link{lines.sirspline}}, - \code{\link{plot.sirspline}}, \code{\link{sir_exp}}, - \code{\link{sir_ratio}}, \code{\link{sir}} - -Other main functions: \code{\link{rate}}, - \code{\link{relpois_ag}}, \code{\link{relpois}}, - \code{\link{sir}}, \code{\link{survmean}}, - \code{\link{survtab_ag}}, \code{\link{survtab}} -} -\author{ -Matti Rantanen, Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sir.R +\name{sirspline} +\alias{sirspline} +\title{Estimate splines for SIR or SMR} +\usage{ +sirspline(coh.data, coh.obs, coh.pyrs, ref.data = NULL, ref.obs = NULL, + ref.pyrs = NULL, ref.rate = NULL, subset = NULL, print = NULL, + adjust = NULL, mstate = NULL, spline, knots = NULL, + reference.points = NULL, dependent.splines = TRUE) +} +\arguments{ +\item{coh.data}{cohort data with observations and at risk time variables} + +\item{coh.obs}{variable name for observed cases} + +\item{coh.pyrs}{variable name for person-years in cohort data} + +\item{ref.data}{aggregated population data} + +\item{ref.obs}{variable name for observed cases} + +\item{ref.pyrs}{variable name for person-years in population data} + +\item{ref.rate}{population rate observed/expected. This overwrites the parameters +\code{ref.pyrs} and \code{ref.obs}.} + +\item{subset}{logical condition to subset \code{coh.data} before any computations} + +\item{print}{variable names for which to estimate SIRs/SMRs and +associated splines separately} + +\item{adjust}{variable names for adjusting the expected cases} + +\item{mstate}{set column names for cause specific observations. Relevant only +when coh.obs length is two or more. See help for \code{sir}.} + +\item{spline}{variable name(s) for the splines} + +\item{knots}{number knots (vector), pre-defined knots (list of vectors) or for optimal number of knots left NULL} + +\item{reference.points}{fixed reference values for rate ratios. If left \code{NULL} +the smallest value is the reference point (where SIR = 1). +Ignored if \code{dependent.splines = FALSE}} + +\item{dependent.splines}{logical; if TRUE, all splines are fitted in same model.} +} +\value{ +A list of data.frames and vectors. +Three spline estimates are named as \code{spline.est.A/B/C} and the corresponding values +in \code{spline.seq.A/B/C} for manual plotting +} +\description{ +Splines for standardised incidence or mortality ratio. A useful +tool to e.g. check whether a constant SIR can be assumed for all calendar periods, +age groups or follow-up intervals. Splines can be fitted for these time dimensions +separately or in the same model. +} +\details{ +See \code{\link{sir}} for help on SIR/SMR estimation in general; usage of splines +is discussed below. + +\strong{The spline variables} + +The model can include one, two or three splines variables. +Variables can be included in the same model selecting \code{dependent.splines = TRUE} +and SIR ratios are calculated (first one is the SIR, others SIR ratios). +Reference points vector can be set via \code{reference.points} +where first element of the vector is the reference point for first ratio. + +Variable(s) to fit splines are given as a vector in argument \code{spline}. +Order will affect the results. + + +\strong{dependent.splines} + +By default dependent.splines is FALSE and all splines are fitted in separate models. +If TRUE, the first variable in \code{spline} is a function of a SIR and other(s) are ratios. + +\strong{knots} + +There are three options to set knots to splines: + +Set the number of knots for each spline variable with a \strong{vector}. +The knots are automatically placed to the quantiles of observed cases in cohort data. +The first and last knots are always the maximum and minimum values, so knot +value needs to be at least two. + +Predefined knot places can be set with a \strong{list} of vectors. +The vector for each spline in the list specifies the knot places. The lowest +and the largest values are the boundary knots and these should be checked beforehand. + +If \code{knots} is left \strong{NULL}, the model searches the optimal number +of knots by model AIC by fitting models iteratively from 2 to 15 knots and +the one with smallest AIC is selected. +If \code{dependent.splines = TRUE}, the number of knots is searched by fitting each spline +variable separately. + + +\strong{print} + +Splines can be stratified by the levels of variable given in \code{print}. If +\code{print} is a vector, only the first variable is accounted for. The knots +are placed globally for all levels of \code{print}. This also ensures that the likelihood +ratio test is valid. +Splines are also fitted independently for each level of \code{print}. +This allows for searching interactions, e.g. by fitting spline for period +(\code{splines='period'}) for each age group (\code{print = 'agegroup'}). + + +\strong{p-values} + +The output p-value is a test of whether the splines are equal (homogenous) +at different levels of \code{print}. +The test is based on the likelihood ratio test, where the full model +includes \code{print} and is +compared to a null model without it. +When \code{(dependent.splines = TRUE)} the p-value returned is a global p-value. +Otherwise the p-value is spline-specific. +} +\examples{ +\dontrun{ +## for examples see: vignette('sir') +} +} +\seealso{ +\code{\link{splitMulti}} +\href{../doc/sir.html}{A SIR calculation vignette} + +Other sir functions: \code{\link{lines.sirspline}}, + \code{\link{plot.sirspline}}, \code{\link{sir_exp}}, + \code{\link{sir_ratio}}, \code{\link{sir}} + +Other main functions: \code{\link{rate}}, + \code{\link{relpois_ag}}, \code{\link{relpois}}, + \code{\link{sir}}, \code{\link{survmean}}, + \code{\link{survtab_ag}}, \code{\link{survtab}} +} +\author{ +Matti Rantanen, Joonas Miettinen +} diff --git a/man/splitLexisDT.Rd b/man/splitLexisDT.Rd index b45ed6d..4ef7b53 100644 --- a/man/splitLexisDT.Rd +++ b/man/splitLexisDT.Rd @@ -1,88 +1,88 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/splitLexisDT.R -\name{splitLexisDT} -\alias{splitLexisDT} -\title{Split case-level observations} -\usage{ -splitLexisDT(lex, breaks, timeScale, merge = TRUE, drop = TRUE) -} -\arguments{ -\item{lex}{a Lexis object, split or not} - -\item{breaks}{a vector of \code{[a,b)} breaks to split \code{data} by} - -\item{timeScale}{a character string; name of the time scale to split by} - -\item{merge}{logical; if \code{TRUE}, retains all variables -from the original data - i.e. original variables are -repeated for all the rows by original subject} - -\item{drop}{logical; if \code{TRUE}, drops all resulting rows -after expansion that reside outside the time window -defined by the given breaks} -} -\value{ -A \code{data.table} or \code{data.frame} -(depending on \code{options("popEpi.datatable")}; see \code{?popEpi}) -object expanded to accommodate split observations. -} -\description{ -Split a \code{Lexis} object along one time scale -(as \code{\link[Epi]{splitLexis}}) with speed -} -\details{ -\code{splitLexisDT} is in essence a \pkg{data.table} version of -\code{splitLexis} or \code{survSplit} for splitting along a single -time scale. It requires a Lexis object as input, which may have already -been split along some time scale. - -Unlike \code{splitLexis}, \code{splitLexisDT} drops observed time outside -the roof and floor of \code{breaks} by default - with \code{drop = FALSE} -the functions have identical behaviour. - -The \code{Lexis} time scale variables can be of any arbitrary -format, e.g. \code{Date}, -fractional years (see \code{\link[Epi]{cal.yr}}) and \code{\link{get.yrs}}, -or other. However, using \code{date} variables (from package \pkg{date}) -are not recommended, as \code{date} variables are always stored as integers, -whereas \code{Date} variables (see \code{?as.Date}) are typically stored -in double ("numeric") format. This allows for breaking days into fractions -as well, when using e.g. hypothetical years of 365.25 days. -} -\examples{ -library(Epi) -data("sire", package = "popEpi") -x <- Lexis(data=sire[1000:1100, ], - entry = list(fot=0, per=get.yrs(dg_date), age=dg_age), - exit=list(per=get.yrs(ex_date)), exit.status=status) -BL <- list(fot=seq(0, 5, by = 3/12), per=c(2008, 2013)) - -x2 <- splitMulti(x, breaks = BL, drop = FALSE) - -x3 <- splitLexisDT(x, breaks = BL$fot, timeScale = "fot", drop = FALSE) -x3 <- splitLexisDT(x3, breaks = BL$per, timeScale = "per", drop = FALSE) - -x4 <- splitLexis(x, breaks = BL$fot, time.scale = "fot") -x4 <- splitLexis(x4, breaks = BL$per, time.scale = "per") -## all produce identical results - -## using Date variables -x <- Lexis(data=sire[1000:1100, ], - entry = list(fot=0, per=dg_date, age=dg_date-bi_date), - exit=list(per=ex_date), exit.status=status) -BL <- list(fot = 0:5*365.25, per = as.Date(c("2008-01-01", "2013-01-01"))) - -x2 <- splitMulti(x, breaks = BL, drop = FALSE) - -x3 <- splitLexisDT(x, breaks = BL$fot, timeScale = "fot", drop = FALSE) -x3 <- splitLexisDT(x3, breaks = BL$per, timeScale = "per", drop = FALSE) - -## splitLexis may not work when using Dates -} -\seealso{ -Other splitting functions: \code{\link{lexpand}}, - \code{\link{splitMulti}} -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splitLexisDT.R +\name{splitLexisDT} +\alias{splitLexisDT} +\title{Split case-level observations} +\usage{ +splitLexisDT(lex, breaks, timeScale, merge = TRUE, drop = TRUE) +} +\arguments{ +\item{lex}{a Lexis object, split or not} + +\item{breaks}{a vector of \code{[a,b)} breaks to split \code{data} by} + +\item{timeScale}{a character string; name of the time scale to split by} + +\item{merge}{logical; if \code{TRUE}, retains all variables +from the original data - i.e. original variables are +repeated for all the rows by original subject} + +\item{drop}{logical; if \code{TRUE}, drops all resulting rows +after expansion that reside outside the time window +defined by the given breaks} +} +\value{ +A \code{data.table} or \code{data.frame} +(depending on \code{options("popEpi.datatable")}; see \code{?popEpi}) +object expanded to accommodate split observations. +} +\description{ +Split a \code{Lexis} object along one time scale +(as \code{\link[Epi]{splitLexis}}) with speed +} +\details{ +\code{splitLexisDT} is in essence a \pkg{data.table} version of +\code{splitLexis} or \code{survSplit} for splitting along a single +time scale. It requires a Lexis object as input, which may have already +been split along some time scale. + +Unlike \code{splitLexis}, \code{splitLexisDT} drops observed time outside +the roof and floor of \code{breaks} by default - with \code{drop = FALSE} +the functions have identical behaviour. + +The \code{Lexis} time scale variables can be of any arbitrary +format, e.g. \code{Date}, +fractional years (see \code{\link[Epi]{cal.yr}}) and \code{\link{get.yrs}}, +or other. However, using \code{date} variables (from package \pkg{date}) +are not recommended, as \code{date} variables are always stored as integers, +whereas \code{Date} variables (see \code{?as.Date}) are typically stored +in double ("numeric") format. This allows for breaking days into fractions +as well, when using e.g. hypothetical years of 365.25 days. +} +\examples{ +library(Epi) +data("sire", package = "popEpi") +x <- Lexis(data=sire[1000:1100, ], + entry = list(fot=0, per=get.yrs(dg_date), age=dg_age), + exit=list(per=get.yrs(ex_date)), exit.status=status) +BL <- list(fot=seq(0, 5, by = 3/12), per=c(2008, 2013)) + +x2 <- splitMulti(x, breaks = BL, drop = FALSE) + +x3 <- splitLexisDT(x, breaks = BL$fot, timeScale = "fot", drop = FALSE) +x3 <- splitLexisDT(x3, breaks = BL$per, timeScale = "per", drop = FALSE) + +x4 <- splitLexis(x, breaks = BL$fot, time.scale = "fot") +x4 <- splitLexis(x4, breaks = BL$per, time.scale = "per") +## all produce identical results + +## using Date variables +x <- Lexis(data=sire[1000:1100, ], + entry = list(fot=0, per=dg_date, age=dg_date-bi_date), + exit=list(per=ex_date), exit.status=status) +BL <- list(fot = 0:5*365.25, per = as.Date(c("2008-01-01", "2013-01-01"))) + +x2 <- splitMulti(x, breaks = BL, drop = FALSE) + +x3 <- splitLexisDT(x, breaks = BL$fot, timeScale = "fot", drop = FALSE) +x3 <- splitLexisDT(x3, breaks = BL$per, timeScale = "per", drop = FALSE) + +## splitLexis may not work when using Dates +} +\seealso{ +Other splitting functions: \code{\link{lexpand}}, + \code{\link{splitMulti}} +} +\author{ +Joonas Miettinen +} diff --git a/man/splitMulti.Rd b/man/splitMulti.Rd index 2828913..a020f3b 100644 --- a/man/splitMulti.Rd +++ b/man/splitMulti.Rd @@ -1,132 +1,132 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/splitMulti.R -\name{splitMulti} -\alias{splitMulti} -\title{Split case-level observations} -\usage{ -splitMulti(data, breaks = NULL, ..., drop = TRUE, merge = TRUE, - verbose = FALSE) -} -\arguments{ -\item{data}{a Lexis object with event cases as rows} - -\item{breaks}{a list of named numeric vectors of breaks; see Details and Examples} - -\item{...}{alternate way of supplying breaks as named vectors; -e.g. \code{fot = 0:5} instead of \code{breaks = list(fot = 0:5)}; -if \code{breaks} is not \code{NULL}, \code{breaks} is used and any breaks -passed through \code{...} are NOT used} - -\item{drop}{logical; if \code{TRUE}, drops all resulting rows -after expansion that reside outside the time window -defined by the given breaks} - -\item{merge}{logical; if \code{TRUE}, retains all variables -from the original data - i.e. original variables are -repeated for all the rows by original subject} - -\item{verbose}{logical; if \code{TRUE}, the function is chatty -and returns some messages along the way} -} -\value{ -A \code{data.table} or \code{data.frame} -(depending on \code{options("popEpi.datatable")}; see \code{?popEpi}) -object expanded to accommodate split observations. -} -\description{ -Split a \code{Lexis} object along multiple time scales -with speed and ease -} -\details{ -\code{splitMulti} is in essence a \pkg{data.table} version of - \code{splitLexis} or \code{survSplit} for splitting along multiple - time scales. -It requires a Lexis object as input. - -The \code{breaks} must be a list of named vectors of the appropriate type. -The breaks are fully explicit and -left-inclusive and right exclusive, e.g. \code{fot=c(0,5)} -forces the data to only include time between -\code{[0,5)} for each original row (unless \code{drop = FALSE}). -Use \code{Inf} or \code{-Inf} for open-ended intervals, - e.g. \code{per=c(1990,1995,Inf)} creates the intervals - \code{[1990,1995), [1995, Inf)}. - -Instead of specifying \code{breaks}, one may make use of the \code{...} -argument to pass breaks: e.g. - -\code{splitMulti(x, breaks = list(fot = 0:5))} - -is equivalent to - -\code{splitMulti(x, fot = 0:5)}. - -Multiple breaks can be supplied in the same manner. However, if both -\code{breaks} and \code{...} are used, only the breaks in \code{breaks} -are utilized within the function. - -The \code{Lexis} time scale variables can be of any arbitrary -format, e.g. \code{Date}, -fractional years (see \code{\link[Epi]{cal.yr}}) and \code{\link{get.yrs}}, -or other. However, using \code{date} variables (from package \pkg{date}) -are not recommended, as \code{date} variables are always stored as integers, -whereas \code{Date} variables (see \code{?as.Date}) are typically stored -in double ("numeric") format. This allows for breaking days into fractions -as well, when using e.g. hypothetical years of 365.25 days. -} -\examples{ -#### let's prepare data for computing period method survivals -#### in case there are problems with dates, we first -#### convert to fractional years. -\dontrun{ -library(Epi) -data("sire", package = "popEpi") -x <- Lexis(data=sire, entry = list(fot=0, per=get.yrs(dg_date), age=dg_age), - exit=list(per=get.yrs(ex_date)), exit.status=status) -x2 <- splitMulti(x, breaks = list(fot=seq(0, 5, by = 3/12), per=c(2008, 2013))) -# equivalently: -x2 <- splitMulti(x, fot=seq(0, 5, by = 3/12), per=c(2008, 2013)) - -## using dates; note: breaks must be expressed as dates or days! -x <- Lexis(data=sire, entry = list(fot=0, per=dg_date, age=dg_date-bi_date), - exit=list(per=ex_date), exit.status=status) -BL <- list(fot = seq(0, 5, by = 3/12)*365.242199, - per = as.Date(paste0(c(1980:2014),"-01-01")), - age = c(0,45,85,Inf)*365.242199) -x2 <- splitMulti(x, breaks = BL, verbose=TRUE) - - -## multistate example (healty - sick - dead) -sire2 <- data.frame(sire) - -set.seed(1L) -not_sick <- sample.int(nrow(sire2), 6000L, replace = FALSE) -sire2[not_sick, ]$dg_date <- NA -sire2[!is.na(sire2$dg_date) & sire2$status == 0, ]$status <- -1 - -sire2$status[sire2$status==2] <- 1 -sire2$status <- factor(sire2$status, levels = c(0, -1, 1), - labels = c("healthy", "sick", "dead")) - -xm <- Lexis(data=sire2, entry = list(fot=0, per=get.yrs(bi_date), age=0), - exit=list(per=get.yrs(ex_date)), exit.status=status) -xm2 <- cutLexis(xm, cut = get.yrs(xm$dg_date), - timescale = "per", - new.state = "sick") -xm2[xm2$lex.id == 6L, ] - -xm2 <- splitMulti(xm2, breaks = list(fot = seq(0,150,25))) -xm2[xm2$lex.id == 6L, ] -} - -} -\seealso{ -\code{\link[Epi]{splitLexis}}, \code{\link[Epi]{Lexis}}, -\code{\link[survival]{survSplit}} - -Other splitting functions: \code{\link{lexpand}}, - \code{\link{splitLexisDT}} -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splitMulti.R +\name{splitMulti} +\alias{splitMulti} +\title{Split case-level observations} +\usage{ +splitMulti(data, breaks = NULL, ..., drop = TRUE, merge = TRUE, + verbose = FALSE) +} +\arguments{ +\item{data}{a Lexis object with event cases as rows} + +\item{breaks}{a list of named numeric vectors of breaks; see Details and Examples} + +\item{...}{alternate way of supplying breaks as named vectors; +e.g. \code{fot = 0:5} instead of \code{breaks = list(fot = 0:5)}; +if \code{breaks} is not \code{NULL}, \code{breaks} is used and any breaks +passed through \code{...} are NOT used} + +\item{drop}{logical; if \code{TRUE}, drops all resulting rows +after expansion that reside outside the time window +defined by the given breaks} + +\item{merge}{logical; if \code{TRUE}, retains all variables +from the original data - i.e. original variables are +repeated for all the rows by original subject} + +\item{verbose}{logical; if \code{TRUE}, the function is chatty +and returns some messages along the way} +} +\value{ +A \code{data.table} or \code{data.frame} +(depending on \code{options("popEpi.datatable")}; see \code{?popEpi}) +object expanded to accommodate split observations. +} +\description{ +Split a \code{Lexis} object along multiple time scales +with speed and ease +} +\details{ +\code{splitMulti} is in essence a \pkg{data.table} version of + \code{splitLexis} or \code{survSplit} for splitting along multiple + time scales. +It requires a Lexis object as input. + +The \code{breaks} must be a list of named vectors of the appropriate type. +The breaks are fully explicit and +left-inclusive and right exclusive, e.g. \code{fot=c(0,5)} +forces the data to only include time between +\code{[0,5)} for each original row (unless \code{drop = FALSE}). +Use \code{Inf} or \code{-Inf} for open-ended intervals, + e.g. \code{per=c(1990,1995,Inf)} creates the intervals + \code{[1990,1995), [1995, Inf)}. + +Instead of specifying \code{breaks}, one may make use of the \code{...} +argument to pass breaks: e.g. + +\code{splitMulti(x, breaks = list(fot = 0:5))} + +is equivalent to + +\code{splitMulti(x, fot = 0:5)}. + +Multiple breaks can be supplied in the same manner. However, if both +\code{breaks} and \code{...} are used, only the breaks in \code{breaks} +are utilized within the function. + +The \code{Lexis} time scale variables can be of any arbitrary +format, e.g. \code{Date}, +fractional years (see \code{\link[Epi]{cal.yr}}) and \code{\link{get.yrs}}, +or other. However, using \code{date} variables (from package \pkg{date}) +are not recommended, as \code{date} variables are always stored as integers, +whereas \code{Date} variables (see \code{?as.Date}) are typically stored +in double ("numeric") format. This allows for breaking days into fractions +as well, when using e.g. hypothetical years of 365.25 days. +} +\examples{ +#### let's prepare data for computing period method survivals +#### in case there are problems with dates, we first +#### convert to fractional years. +\dontrun{ +library(Epi) +data("sire", package = "popEpi") +x <- Lexis(data=sire, entry = list(fot=0, per=get.yrs(dg_date), age=dg_age), + exit=list(per=get.yrs(ex_date)), exit.status=status) +x2 <- splitMulti(x, breaks = list(fot=seq(0, 5, by = 3/12), per=c(2008, 2013))) +# equivalently: +x2 <- splitMulti(x, fot=seq(0, 5, by = 3/12), per=c(2008, 2013)) + +## using dates; note: breaks must be expressed as dates or days! +x <- Lexis(data=sire, entry = list(fot=0, per=dg_date, age=dg_date-bi_date), + exit=list(per=ex_date), exit.status=status) +BL <- list(fot = seq(0, 5, by = 3/12)*365.242199, + per = as.Date(paste0(c(1980:2014),"-01-01")), + age = c(0,45,85,Inf)*365.242199) +x2 <- splitMulti(x, breaks = BL, verbose=TRUE) + + +## multistate example (healty - sick - dead) +sire2 <- data.frame(sire) + +set.seed(1L) +not_sick <- sample.int(nrow(sire2), 6000L, replace = FALSE) +sire2[not_sick, ]$dg_date <- NA +sire2[!is.na(sire2$dg_date) & sire2$status == 0, ]$status <- -1 + +sire2$status[sire2$status==2] <- 1 +sire2$status <- factor(sire2$status, levels = c(0, -1, 1), + labels = c("healthy", "sick", "dead")) + +xm <- Lexis(data=sire2, entry = list(fot=0, per=get.yrs(bi_date), age=0), + exit=list(per=get.yrs(ex_date)), exit.status=status) +xm2 <- cutLexis(xm, cut = get.yrs(xm$dg_date), + timescale = "per", + new.state = "sick") +xm2[xm2$lex.id == 6L, ] + +xm2 <- splitMulti(xm2, breaks = list(fot = seq(0,150,25))) +xm2[xm2$lex.id == 6L, ] +} + +} +\seealso{ +\code{\link[Epi]{splitLexis}}, \code{\link[Epi]{Lexis}}, +\code{\link[survival]{survSplit}} + +Other splitting functions: \code{\link{lexpand}}, + \code{\link{splitLexisDT}} +} +\author{ +Joonas Miettinen +} diff --git a/man/stdpop101.Rd b/man/stdpop101.Rd index fed691a..ebb97b7 100644 --- a/man/stdpop101.Rd +++ b/man/stdpop101.Rd @@ -1,27 +1,27 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data_document.R -\name{stdpop101} -\alias{stdpop101} -\title{World standard population by 1 year age groups from 1 to 101. Sums to 100 000.} -\format{data.table with columns -\itemize{ - \item \code{world_std} weight that sums to 100000 (numeric) - \item \code{agegroup} age group from 1 to 101 (numeric) -}} -\source{ -Standard population is from: -\href{http://seer.cancer.gov/stdpopulations/stdpop.singleages.html}{world standard population "101of1"} -} -\description{ -World standard population by 1 year age groups from 1 to 101. Sums to 100 000. -} -\seealso{ -Other popEpi data: \code{\link{ICSS}}, - \code{\link{meanpop_fi}}, \code{\link{popmort}}, - \code{\link{sibr}}, \code{\link{sire}}, - \code{\link{stdpop18}} - -Other weights: \code{\link{ICSS}}, - \code{\link{direct_standardization}}, - \code{\link{stdpop18}} -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_document.R +\name{stdpop101} +\alias{stdpop101} +\title{World standard population by 1 year age groups from 1 to 101. Sums to 100 000.} +\format{data.table with columns +\itemize{ + \item \code{world_std} weight that sums to 100000 (numeric) + \item \code{agegroup} age group from 1 to 101 (numeric) +}} +\source{ +Standard population is from: +\href{http://seer.cancer.gov/stdpopulations/stdpop.singleages.html}{world standard population "101of1"} +} +\description{ +World standard population by 1 year age groups from 1 to 101. Sums to 100 000. +} +\seealso{ +Other popEpi data: \code{\link{ICSS}}, + \code{\link{meanpop_fi}}, \code{\link{popmort}}, + \code{\link{sibr}}, \code{\link{sire}}, + \code{\link{stdpop18}} + +Other weights: \code{\link{ICSS}}, + \code{\link{direct_standardization}}, + \code{\link{stdpop18}} +} diff --git a/man/stdpop18.Rd b/man/stdpop18.Rd index c0b7316..92a82c0 100644 --- a/man/stdpop18.Rd +++ b/man/stdpop18.Rd @@ -1,29 +1,29 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data_document.R -\name{stdpop18} -\alias{stdpop18} -\title{Standard populations from 2000: world, Europe and Nordic.} -\format{data.table with columns -\itemize{ - \item \code{agegroup}, age group in 18 categories (character) - \item \code{world}, World 2000 standard population (numeric) - \item \code{europe}, European standard population (numeric) - \item \code{nordic}, Nordic standard population (numeric) -}} -\source{ -Nordcan, 2000 -} -\description{ -World, European, and Nordic standard populations by 18 age categories. -Sums to 100000. -} -\seealso{ -Other popEpi data: \code{\link{ICSS}}, - \code{\link{meanpop_fi}}, \code{\link{popmort}}, - \code{\link{sibr}}, \code{\link{sire}}, - \code{\link{stdpop101}} - -Other weights: \code{\link{ICSS}}, - \code{\link{direct_standardization}}, - \code{\link{stdpop101}} -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_document.R +\name{stdpop18} +\alias{stdpop18} +\title{Standard populations from 2000: world, Europe and Nordic.} +\format{data.table with columns +\itemize{ + \item \code{agegroup}, age group in 18 categories (character) + \item \code{world}, World 2000 standard population (numeric) + \item \code{europe}, European standard population (numeric) + \item \code{nordic}, Nordic standard population (numeric) +}} +\source{ +Nordcan, 2000 +} +\description{ +World, European, and Nordic standard populations by 18 age categories. +Sums to 100000. +} +\seealso{ +Other popEpi data: \code{\link{ICSS}}, + \code{\link{meanpop_fi}}, \code{\link{popmort}}, + \code{\link{sibr}}, \code{\link{sire}}, + \code{\link{stdpop101}} + +Other weights: \code{\link{ICSS}}, + \code{\link{direct_standardization}}, + \code{\link{stdpop101}} +} diff --git a/man/summary.aggre.Rd b/man/summary.aggre.Rd index 9176f54..42f1f04 100644 --- a/man/summary.aggre.Rd +++ b/man/summary.aggre.Rd @@ -1,32 +1,32 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/S3_definitions.R -\name{summary.aggre} -\alias{summary.aggre} -\title{Summarize an \code{aggre} Object} -\usage{ -\method{summary}{aggre}(object, by = NULL, subset = NULL, ...) -} -\arguments{ -\item{object}{an \code{aggre} object} - -\item{by}{list of columns to summarize by - e.g. \code{list(V1, V2)} -where \code{V1} and \code{V2} are columns in the data.} - -\item{subset}{a logical condition to subset results table by -before summarizing; use this to limit to a certain stratum. E.g. -\code{subset = sex == "male"}} - -\item{...}{unused} -} -\description{ -\code{summary} method function for \code{aggre} objects; see -\code{\link{as.aggre}} and \code{\link{aggre}}. -} -\seealso{ -Other aggregation functions: \code{\link{aggre}}, - \code{\link{as.aggre}}, \code{\link{lexpand}}, - \code{\link{setaggre}} -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/S3_definitions.R +\name{summary.aggre} +\alias{summary.aggre} +\title{Summarize an \code{aggre} Object} +\usage{ +\method{summary}{aggre}(object, by = NULL, subset = NULL, ...) +} +\arguments{ +\item{object}{an \code{aggre} object} + +\item{by}{list of columns to summarize by - e.g. \code{list(V1, V2)} +where \code{V1} and \code{V2} are columns in the data.} + +\item{subset}{a logical condition to subset results table by +before summarizing; use this to limit to a certain stratum. E.g. +\code{subset = sex == "male"}} + +\item{...}{unused} +} +\description{ +\code{summary} method function for \code{aggre} objects; see +\code{\link{as.aggre}} and \code{\link{aggre}}. +} +\seealso{ +Other aggregation functions: \code{\link{aggre}}, + \code{\link{as.aggre}}, \code{\link{lexpand}}, + \code{\link{setaggre}} +} +\author{ +Joonas Miettinen +} diff --git a/man/summary.survtab.Rd b/man/summary.survtab.Rd index 3f8c702..0eea1da 100644 --- a/man/summary.survtab.Rd +++ b/man/summary.survtab.Rd @@ -1,94 +1,94 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/S3_definitions.R -\name{summary.survtab} -\alias{summary.survtab} -\title{Summarize a survtab Object} -\usage{ -\method{summary}{survtab}(object, t = NULL, subset = NULL, q = NULL, ...) -} -\arguments{ -\item{object}{a \code{survtab} object} - -\item{t}{a vector of times at which time points (actually intervals that -contain t) to print summary table of survival function estimates by strata; -values not existing in any interval cause rows containing only \code{NAs} to -be returned.} - -\item{subset}{a logical condition to subset results table by -before printing; use this to limit to a certain stratum. E.g. -\code{subset = sex == "male"}} - -\item{q}{a named \code{list} of quantiles to include in returned data set, -where names must match to estimates in \code{object}; -returns intervals where the quantiles are reached first; -e.g. \code{list(surv.obs = 0.5)} finds the interval where \code{surv.obs} -is 0.45 and 0.55 at the beginning and end of the interval, respectively; -returns rows with \code{NA} values for quantiles not reached in estimates -(e.g. if \code{q = list(surv.obs = 0.5)} but lowest estimate is 0.6); -see Examples.} - -\item{...}{unused; required for congruence with other \code{summary} methods} -} -\description{ -Summary method function for \code{survtab} objects; see -\code{\link{survtab_ag}}. Returns estimates at given time points -or all time points if \code{t} and \code{q} are both \code{NULL}. -} -\details{ -Note that this function returns the intervals and NOT the time points -corresponding to quantiles / estimates corresponding to time points. -If you want precise estimates at time points that are not interval breaks, -add the time points as breaks and re-estimate the survival time function. -In interval-based estimation, the estimates denote e.g. probability of -dying \emph{during} the interval, so time points within the intervals -are not usually considered at all. See e.g. Seppa, Dyba, and Hakulinen -(2015). -} -\examples{ - -library(Epi) -library(survival) - -## NOTE: recommended to use factor status variable -x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), - exit = list(CAL = get.yrs(ex_date)), - data = sire[sire$dg_date < sire$ex_date, ], - exit.status = factor(status, levels = 0:2, - labels = c("alive", "canD", "othD")), - merge = TRUE) -## pretend some are male -set.seed(1L) -x$sex <- rbinom(nrow(x), 1, 0.5) -## observed survival -st <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, - surv.type = "cif.obs", - breaks = list(FUT = seq(0, 5, 1/12))) - -## estimates at full years of follow-up -summary(st, t = 1:5) - -## interval estimate closest to 75th percentile, i.e. -## first interval where surv.obs < 0.75 at end -## (just switch 0.75 to 0.5 for median survival, etc.) -summary(st, q = list(surv.obs = 0.75)) -## multiple quantiles -summary(st, q = list(surv.obs = c(0.75, 0.90), CIF_canD = 0.20)) - -## if you want all estimates in a new data.frame, you can also simply do - -x <- as.data.frame(st) - -} -\references{ -Seppa K., Dyba T. and Hakulinen T.: Cancer Survival, -Reference Module in Biomedical Sciences. Elsevier. 08-Jan-2015 -doi: 10.1016/B978-0-12-801238-3.02745-8. -} -\seealso{ -Other survtab functions: \code{\link{lines.survtab}}, - \code{\link{plot.survtab}}, \code{\link{print.survtab}}, - \code{\link{survtab_ag}}, \code{\link{survtab}} -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/S3_definitions.R +\name{summary.survtab} +\alias{summary.survtab} +\title{Summarize a survtab Object} +\usage{ +\method{summary}{survtab}(object, t = NULL, subset = NULL, q = NULL, ...) +} +\arguments{ +\item{object}{a \code{survtab} object} + +\item{t}{a vector of times at which time points (actually intervals that +contain t) to print summary table of survival function estimates by strata; +values not existing in any interval cause rows containing only \code{NAs} to +be returned.} + +\item{subset}{a logical condition to subset results table by +before printing; use this to limit to a certain stratum. E.g. +\code{subset = sex == "male"}} + +\item{q}{a named \code{list} of quantiles to include in returned data set, +where names must match to estimates in \code{object}; +returns intervals where the quantiles are reached first; +e.g. \code{list(surv.obs = 0.5)} finds the interval where \code{surv.obs} +is 0.45 and 0.55 at the beginning and end of the interval, respectively; +returns rows with \code{NA} values for quantiles not reached in estimates +(e.g. if \code{q = list(surv.obs = 0.5)} but lowest estimate is 0.6); +see Examples.} + +\item{...}{unused; required for congruence with other \code{summary} methods} +} +\description{ +Summary method function for \code{survtab} objects; see +\code{\link{survtab_ag}}. Returns estimates at given time points +or all time points if \code{t} and \code{q} are both \code{NULL}. +} +\details{ +Note that this function returns the intervals and NOT the time points +corresponding to quantiles / estimates corresponding to time points. +If you want precise estimates at time points that are not interval breaks, +add the time points as breaks and re-estimate the survival time function. +In interval-based estimation, the estimates denote e.g. probability of +dying \emph{during} the interval, so time points within the intervals +are not usually considered at all. See e.g. Seppa, Dyba, and Hakulinen +(2015). +} +\examples{ + +library(Epi) +library(survival) + +## NOTE: recommended to use factor status variable +x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), + exit = list(CAL = get.yrs(ex_date)), + data = sire[sire$dg_date < sire$ex_date, ], + exit.status = factor(status, levels = 0:2, + labels = c("alive", "canD", "othD")), + merge = TRUE) +## pretend some are male +set.seed(1L) +x$sex <- rbinom(nrow(x), 1, 0.5) +## observed survival +st <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, + surv.type = "cif.obs", + breaks = list(FUT = seq(0, 5, 1/12))) + +## estimates at full years of follow-up +summary(st, t = 1:5) + +## interval estimate closest to 75th percentile, i.e. +## first interval where surv.obs < 0.75 at end +## (just switch 0.75 to 0.5 for median survival, etc.) +summary(st, q = list(surv.obs = 0.75)) +## multiple quantiles +summary(st, q = list(surv.obs = c(0.75, 0.90), CIF_canD = 0.20)) + +## if you want all estimates in a new data.frame, you can also simply do + +x <- as.data.frame(st) + +} +\references{ +Seppa K., Dyba T. and Hakulinen T.: Cancer Survival, +Reference Module in Biomedical Sciences. Elsevier. 08-Jan-2015 +doi: 10.1016/B978-0-12-801238-3.02745-8. +} +\seealso{ +Other survtab functions: \code{\link{lines.survtab}}, + \code{\link{plot.survtab}}, \code{\link{print.survtab}}, + \code{\link{survtab_ag}}, \code{\link{survtab}} +} +\author{ +Joonas Miettinen +} diff --git a/man/survmean.Rd b/man/survmean.Rd index 2dcab96..5598d51 100644 --- a/man/survmean.Rd +++ b/man/survmean.Rd @@ -1,280 +1,280 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/mean_survival.R -\name{survmean} -\alias{survmean} -\title{Compute Mean Survival Times Using Extrapolation} -\usage{ -survmean(formula, data, adjust = NULL, weights = NULL, breaks = NULL, - pophaz = NULL, e1.breaks = NULL, e1.pophaz = pophaz, r = "auto", - surv.method = "hazard", subset = NULL, verbose = FALSE) -} -\arguments{ -\item{formula}{a \code{formula}, e.g. \code{FUT ~ V1} or -\code{Surv(FUT, lex.Xst) ~ V1}. -Supplied in the same way as to \code{\link{survtab}}, see that help -for more info.} - -\item{data}{a \code{Lexis} data set; see \code{\link[Epi]{Lexis}}.} - -\item{adjust}{variables to adjust estimates by, e.g. \code{adjust = "agegr"}. -\link[=flexible_argument]{Flexible input}.} - -\item{weights}{weights to use to adjust mean survival times. See the -\link[=direct_standardization]{dedicated help page} for more details on -weighting. \code{survmean} -computes curves separately by all variables to adjust by, computes mean -survival times, and computes weighted means of the mean survival times. -See Examples.} - -\item{breaks}{a list of breaks defining the time window to compute -observed survival in, and the intervals used in estimation. E.g. -\code{list(FUT = 0:10)} when \code{FUT} is the follow-up time scale in your -data.} - -\item{pophaz}{a data set of population hazards passed to -\code{\link{survtab}} (see the -\link[=pophaz]{dedicated help page} and the help page of -\code{survtab} for more information). Defines the -population hazard in the time window where observed survival is estimated.} - -\item{e1.breaks}{\code{NULL} or a list of breaks defining the time -window to compute -\strong{expected} survival in, and the intervals used in estimation. E.g. -\code{list(FUT = 0:100)} when \code{FUT} is the follow-up time scale in your -data to extrapolate up to 100 years from where the observed survival -curve ends. \strong{NOTE:} the breaks on the survival time scale -MUST include the breaks supplied to argument \code{breaks}; see Examples. -If \code{NULL}, uses decent defaults (maximum follow-up time of 50 years).} - -\item{e1.pophaz}{Same as \code{pophaz}, except this defines the -population hazard in the time window where \strong{expected} -survival is estimated. By default uses the same data as -argument \code{pophaz}.} - -\item{r}{either a numeric multiplier such as \code{0.995}, \code{"auto"}, or -\code{"autoX"} where \code{X} is an integer; -used to determine the relative survival ratio (RSR) persisting after where -the estimated observed survival curve ends. See Details.} - -\item{surv.method}{passed to \code{survtab}; see that help for more info.} - -\item{subset}{a logical condition; e.g. \code{subset = sex == 1}; -subsets the data before computations} - -\item{verbose}{\code{logical}; if \code{TRUE}, the function is returns -some messages and results along the run, which may be useful in debugging} -} -\value{ -Returns a \code{data.frame} or \code{data.table} (depending on -\code{getOptions("popEpi.datatable")}; see \code{?popEpi}) containing the -following columns: -\itemize{ - \item{est}{: The estimated mean survival time} - \item{exp}{: The computed expected survival time} - \item{obs}{: Counts of subjects in data} - \item{YPLL}{: Years of Potential Life Lost, computed as - (\code{(exp-est)*obs}) - though your time data may be in e.g. days, - this column will have the same name regardless.} -} -The returned data also has columns named according to the variables -supplied to the right-hand-side of the formula. -} -\description{ -Computes mean survival times based on survival estimation up to -a point in follow-up time (e.g. 10 years), -after which survival is extrapolated -using an appropriate hazard data file (\code{pophaz}) to yield the "full" -survival curve. The area under the full survival curve is the mean survival. -} -\details{ -\strong{Basics} - -\code{survmean} computes mean survival times. For median survival times -(i.e. where 50 % of subjects have died or met some other event) -use \code{\link{survtab}}. - -The mean survival time is simply the area under the survival curve. -However, since full follow-up rarely happens, the observed survival curves -are extrapolated using expected survival: E.g. one might compute observed -survival till up to 10 years and extrapolate beyond that -(till e.g. 50 years) to yield an educated guess on the full observed survival -curve. - -The area is computed by trapezoidal integration of the area under the curve. -This function also computes the "full" expected survival curve from -T = 0 till e.g. T = 50 depending on supplied arguments. The -expected mean survival time is the area under the -mean expected survival curve. -This function returns the mean expected survival time to be compared with -the mean survival time and for computing years of potential life lost (YPLL). - -Results can be formed by strata and adjusted for e.g. age by using -the \code{formula} argument as in \code{survtab}. See also Examples. - -\strong{Extrapolation tweaks} - -Argument \code{r} controls the relative survival ratio (RSR) assumed to -persist beyond the time window where observed survival is computed -(defined by argument \code{breaks}; e.g. up to \code{FUT = 10}). -The RSR is simply \code{RSR_i = p_oi / p_ei} for a time interval \code{i}, -i.e. the observed divided by the expected -(conditional, not cumulative) probability of surviving from the beginning of -a time interval till its end. The cumulative product of \code{RSR_i} -over time is the (cumulative) relative survival curve. - - -If \code{r} is numeric, e.g. \code{r = 0.995}, that RSR level is assumed -to persist beyond the observed survival curve. -Numeric \code{r} should be \code{> 0} and expressed at the annual level -when using fractional years as the scale of the time variables. -E.g. if RSR is known to be \code{0.95} at the month level, then the -annualized RSR is \code{0.95^12}. This enables correct usage of the RSR -with survival intervals of varying lengths. When using day-level time -variables (such as \code{Dates}; see \code{as.Date}), numeric \code{r} -should be expressed at the day level, etc. - -If \code{r = "auto"} or \code{r = "auto1"}, this function computes -RSR estimates internally and automatically uses the \code{RSR_i} -in the last survival interval in each stratum (and adjusting group) -and assumes that to persist beyond the observed survival curve. -Automatic determination of \code{r} is a good starting point, -but in situations where the RSR estimate is uncertain it may produce poor -results. Using \code{"autoX"} such as \code{"auto6"} causes \code{survmean} -to use the mean of the estimated RSRs in the last X survival intervals, -which may be more stable. -Automatic determination will not use values \code{>1} but set them to 1. -Visual inspection of the produced curves is always recommended: see -Examples. - -One may also tweak the accuracy and length of extrapolation and -expected survival curve computation by using -\code{e1.breaks}. By default this is whatever was supplied to \code{breaks} -for the survival time scale, to which - -\code{c(seq(1/12, 1, 1/12), seq(1.2, 1.8, 0.2), 2:19, seq(20, 50, 5))} - -is added after the maximum value, e.g. with \code{breaks = list(FUT = 0:10)} -we have - -\code{..., 10+1/12, ..., 11, 11.2, ..., 2, 3, ..., 19, 20, 25, ... 50} - -as the \code{e1.breaks}. Supplying \code{e1.breaks} manually requires -the breaks over time survival time scale supplied to argument \code{breaks} -to be reiterated in \code{e1.breaks}; see Examples. \strong{NOTE}: the -default extrapolation breaks assume the time scales in the data to be -expressed as fractional years, meaning this will work extremely poorly -when using e.g. day-level time scales (such as \code{Date} variables). -Set the extrapolation breaks manually in such cases. -} -\examples{ - -library(survival) -library(Epi) -## take 500 subjects randomly for demonstration -data(sire) -sire <- sire[sire$dg_date < sire$ex_date, ] -set.seed(1L) -sire <- sire[sample(x = nrow(sire), size = 500),] - -## NOTE: recommended to use factor status variable -x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), - exit = list(CAL = get.yrs(ex_date)), - data = sire, - exit.status = factor(status, levels = 0:2, - labels = c("alive", "canD", "othD")), - merge = TRUE) - -## phony variable -set.seed(1L) -x$group <- rbinom(nrow(x), 1, 0.5) -## age group -x$agegr <- cut(x$dg_age, c(0,45,60,Inf), right=FALSE) - -## population hazards data set -pm <- data.frame(popEpi::popmort) -names(pm) <- c("sex", "CAL", "AGE", "haz") - -## breaks to define observed survival estimation -BL <- list(FUT = seq(0, 10, 1/12)) - -## crude mean survival -sm1 <- survmean(Surv(FUT, lex.Xst != "alive") ~ 1, - pophaz = pm, data = x, weights = NULL, - breaks = BL) - -sm1 <- survmean(FUT ~ 1, - pophaz = pm, data = x, weights = NULL, - breaks = BL) -\dontrun{ -## mean survival by group -sm2 <- survmean(FUT ~ group, - pophaz = pm, data = x, weights = NULL, - breaks = BL) - -## ... and adjusted for age using internal weights (counts of subjects) -## note: need also longer extrapolation here so that all curves -## converge to zero in the end. -eBL <- list(FUT = c(BL$FUT, 11:75)) -sm3 <- survmean(FUT ~ group + adjust(agegr), - pophaz = pm, data = x, weights = "internal", - breaks = BL, e1.breaks = eBL) -} -## visual inspection of how realistic extrapolation is for each stratum; -## solid lines are observed + extrapolated survivals; -## dashed lines are expected survivals -plot(sm1) -\dontrun{ -## plotting object with both stratification and standardization -## plots curves for each strata-std.group combination -plot(sm3) - -## for finer control of plotting these curves, you may extract -## from the survmean object using e.g. -attributes(sm3)$survmean.meta$curves - - -#### using Dates - -x <- Lexis(entry = list(FUT = 0L, AGE = dg_date-bi_date, CAL = dg_date), - exit = list(CAL = ex_date), - data = sire[sire$dg_date < sire$ex_date, ], - exit.status = factor(status, levels = 0:2, - labels = c("alive", "canD", "othD")), - merge = TRUE) -## phony group variable -set.seed(1L) -x$group <- rbinom(nrow(x), 1, 0.5) - - -## NOTE: population hazard should be reported at the same scale -## as time variables in your Lexis data. -data(popmort, package = "popEpi") -pm <- data.frame(popmort) -names(pm) <- c("sex", "CAL", "AGE", "haz") -## from year to day level -pm$haz <- pm$haz/365.25 -pm$CAL <- as.Date(paste0(pm$CAL, "-01-01")) -pm$AGE <- pm$AGE*365.25 - -BL <- list(FUT = seq(0, 8, 1/12)*365.25) -eBL <- list(FUT = c(BL$FUT, c(8.25,8.5,9:60)*365.25)) -smd <- survmean(FUT ~ group, data = x, - pophaz = pm, verbose = TRUE, r = "auto5", - breaks = BL, e1.breaks = eBL) -plot(smd) -} - - -} -\seealso{ -Other survmean functions: \code{\link{lines.survmean}}, - \code{\link{plot.survmean}} - -Other main functions: \code{\link{rate}}, - \code{\link{relpois_ag}}, \code{\link{relpois}}, - \code{\link{sirspline}}, \code{\link{sir}}, - \code{\link{survtab_ag}}, \code{\link{survtab}} -} -\author{ -Joonas Miettinen -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mean_survival.R +\name{survmean} +\alias{survmean} +\title{Compute Mean Survival Times Using Extrapolation} +\usage{ +survmean(formula, data, adjust = NULL, weights = NULL, breaks = NULL, + pophaz = NULL, e1.breaks = NULL, e1.pophaz = pophaz, r = "auto", + surv.method = "hazard", subset = NULL, verbose = FALSE) +} +\arguments{ +\item{formula}{a \code{formula}, e.g. \code{FUT ~ V1} or +\code{Surv(FUT, lex.Xst) ~ V1}. +Supplied in the same way as to \code{\link{survtab}}, see that help +for more info.} + +\item{data}{a \code{Lexis} data set; see \code{\link[Epi]{Lexis}}.} + +\item{adjust}{variables to adjust estimates by, e.g. \code{adjust = "agegr"}. +\link[=flexible_argument]{Flexible input}.} + +\item{weights}{weights to use to adjust mean survival times. See the +\link[=direct_standardization]{dedicated help page} for more details on +weighting. \code{survmean} +computes curves separately by all variables to adjust by, computes mean +survival times, and computes weighted means of the mean survival times. +See Examples.} + +\item{breaks}{a list of breaks defining the time window to compute +observed survival in, and the intervals used in estimation. E.g. +\code{list(FUT = 0:10)} when \code{FUT} is the follow-up time scale in your +data.} + +\item{pophaz}{a data set of population hazards passed to +\code{\link{survtab}} (see the +\link[=pophaz]{dedicated help page} and the help page of +\code{survtab} for more information). Defines the +population hazard in the time window where observed survival is estimated.} + +\item{e1.breaks}{\code{NULL} or a list of breaks defining the time +window to compute +\strong{expected} survival in, and the intervals used in estimation. E.g. +\code{list(FUT = 0:100)} when \code{FUT} is the follow-up time scale in your +data to extrapolate up to 100 years from where the observed survival +curve ends. \strong{NOTE:} the breaks on the survival time scale +MUST include the breaks supplied to argument \code{breaks}; see Examples. +If \code{NULL}, uses decent defaults (maximum follow-up time of 50 years).} + +\item{e1.pophaz}{Same as \code{pophaz}, except this defines the +population hazard in the time window where \strong{expected} +survival is estimated. By default uses the same data as +argument \code{pophaz}.} + +\item{r}{either a numeric multiplier such as \code{0.995}, \code{"auto"}, or +\code{"autoX"} where \code{X} is an integer; +used to determine the relative survival ratio (RSR) persisting after where +the estimated observed survival curve ends. See Details.} + +\item{surv.method}{passed to \code{survtab}; see that help for more info.} + +\item{subset}{a logical condition; e.g. \code{subset = sex == 1}; +subsets the data before computations} + +\item{verbose}{\code{logical}; if \code{TRUE}, the function is returns +some messages and results along the run, which may be useful in debugging} +} +\value{ +Returns a \code{data.frame} or \code{data.table} (depending on +\code{getOptions("popEpi.datatable")}; see \code{?popEpi}) containing the +following columns: +\itemize{ + \item{est}{: The estimated mean survival time} + \item{exp}{: The computed expected survival time} + \item{obs}{: Counts of subjects in data} + \item{YPLL}{: Years of Potential Life Lost, computed as + (\code{(exp-est)*obs}) - though your time data may be in e.g. days, + this column will have the same name regardless.} +} +The returned data also has columns named according to the variables +supplied to the right-hand-side of the formula. +} +\description{ +Computes mean survival times based on survival estimation up to +a point in follow-up time (e.g. 10 years), +after which survival is extrapolated +using an appropriate hazard data file (\code{pophaz}) to yield the "full" +survival curve. The area under the full survival curve is the mean survival. +} +\details{ +\strong{Basics} + +\code{survmean} computes mean survival times. For median survival times +(i.e. where 50 % of subjects have died or met some other event) +use \code{\link{survtab}}. + +The mean survival time is simply the area under the survival curve. +However, since full follow-up rarely happens, the observed survival curves +are extrapolated using expected survival: E.g. one might compute observed +survival till up to 10 years and extrapolate beyond that +(till e.g. 50 years) to yield an educated guess on the full observed survival +curve. + +The area is computed by trapezoidal integration of the area under the curve. +This function also computes the "full" expected survival curve from +T = 0 till e.g. T = 50 depending on supplied arguments. The +expected mean survival time is the area under the +mean expected survival curve. +This function returns the mean expected survival time to be compared with +the mean survival time and for computing years of potential life lost (YPLL). + +Results can be formed by strata and adjusted for e.g. age by using +the \code{formula} argument as in \code{survtab}. See also Examples. + +\strong{Extrapolation tweaks} + +Argument \code{r} controls the relative survival ratio (RSR) assumed to +persist beyond the time window where observed survival is computed +(defined by argument \code{breaks}; e.g. up to \code{FUT = 10}). +The RSR is simply \code{RSR_i = p_oi / p_ei} for a time interval \code{i}, +i.e. the observed divided by the expected +(conditional, not cumulative) probability of surviving from the beginning of +a time interval till its end. The cumulative product of \code{RSR_i} +over time is the (cumulative) relative survival curve. + + +If \code{r} is numeric, e.g. \code{r = 0.995}, that RSR level is assumed +to persist beyond the observed survival curve. +Numeric \code{r} should be \code{> 0} and expressed at the annual level +when using fractional years as the scale of the time variables. +E.g. if RSR is known to be \code{0.95} at the month level, then the +annualized RSR is \code{0.95^12}. This enables correct usage of the RSR +with survival intervals of varying lengths. When using day-level time +variables (such as \code{Dates}; see \code{as.Date}), numeric \code{r} +should be expressed at the day level, etc. + +If \code{r = "auto"} or \code{r = "auto1"}, this function computes +RSR estimates internally and automatically uses the \code{RSR_i} +in the last survival interval in each stratum (and adjusting group) +and assumes that to persist beyond the observed survival curve. +Automatic determination of \code{r} is a good starting point, +but in situations where the RSR estimate is uncertain it may produce poor +results. Using \code{"autoX"} such as \code{"auto6"} causes \code{survmean} +to use the mean of the estimated RSRs in the last X survival intervals, +which may be more stable. +Automatic determination will not use values \code{>1} but set them to 1. +Visual inspection of the produced curves is always recommended: see +Examples. + +One may also tweak the accuracy and length of extrapolation and +expected survival curve computation by using +\code{e1.breaks}. By default this is whatever was supplied to \code{breaks} +for the survival time scale, to which + +\code{c(seq(1/12, 1, 1/12), seq(1.2, 1.8, 0.2), 2:19, seq(20, 50, 5))} + +is added after the maximum value, e.g. with \code{breaks = list(FUT = 0:10)} +we have + +\code{..., 10+1/12, ..., 11, 11.2, ..., 2, 3, ..., 19, 20, 25, ... 50} + +as the \code{e1.breaks}. Supplying \code{e1.breaks} manually requires +the breaks over time survival time scale supplied to argument \code{breaks} +to be reiterated in \code{e1.breaks}; see Examples. \strong{NOTE}: the +default extrapolation breaks assume the time scales in the data to be +expressed as fractional years, meaning this will work extremely poorly +when using e.g. day-level time scales (such as \code{Date} variables). +Set the extrapolation breaks manually in such cases. +} +\examples{ + +library(survival) +library(Epi) +## take 500 subjects randomly for demonstration +data(sire) +sire <- sire[sire$dg_date < sire$ex_date, ] +set.seed(1L) +sire <- sire[sample(x = nrow(sire), size = 500),] + +## NOTE: recommended to use factor status variable +x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), + exit = list(CAL = get.yrs(ex_date)), + data = sire, + exit.status = factor(status, levels = 0:2, + labels = c("alive", "canD", "othD")), + merge = TRUE) + +## phony variable +set.seed(1L) +x$group <- rbinom(nrow(x), 1, 0.5) +## age group +x$agegr <- cut(x$dg_age, c(0,45,60,Inf), right=FALSE) + +## population hazards data set +pm <- data.frame(popEpi::popmort) +names(pm) <- c("sex", "CAL", "AGE", "haz") + +## breaks to define observed survival estimation +BL <- list(FUT = seq(0, 10, 1/12)) + +## crude mean survival +sm1 <- survmean(Surv(FUT, lex.Xst != "alive") ~ 1, + pophaz = pm, data = x, weights = NULL, + breaks = BL) + +sm1 <- survmean(FUT ~ 1, + pophaz = pm, data = x, weights = NULL, + breaks = BL) +\dontrun{ +## mean survival by group +sm2 <- survmean(FUT ~ group, + pophaz = pm, data = x, weights = NULL, + breaks = BL) + +## ... and adjusted for age using internal weights (counts of subjects) +## note: need also longer extrapolation here so that all curves +## converge to zero in the end. +eBL <- list(FUT = c(BL$FUT, 11:75)) +sm3 <- survmean(FUT ~ group + adjust(agegr), + pophaz = pm, data = x, weights = "internal", + breaks = BL, e1.breaks = eBL) +} +## visual inspection of how realistic extrapolation is for each stratum; +## solid lines are observed + extrapolated survivals; +## dashed lines are expected survivals +plot(sm1) +\dontrun{ +## plotting object with both stratification and standardization +## plots curves for each strata-std.group combination +plot(sm3) + +## for finer control of plotting these curves, you may extract +## from the survmean object using e.g. +attributes(sm3)$survmean.meta$curves + + +#### using Dates + +x <- Lexis(entry = list(FUT = 0L, AGE = dg_date-bi_date, CAL = dg_date), + exit = list(CAL = ex_date), + data = sire[sire$dg_date < sire$ex_date, ], + exit.status = factor(status, levels = 0:2, + labels = c("alive", "canD", "othD")), + merge = TRUE) +## phony group variable +set.seed(1L) +x$group <- rbinom(nrow(x), 1, 0.5) + + +## NOTE: population hazard should be reported at the same scale +## as time variables in your Lexis data. +data(popmort, package = "popEpi") +pm <- data.frame(popmort) +names(pm) <- c("sex", "CAL", "AGE", "haz") +## from year to day level +pm$haz <- pm$haz/365.25 +pm$CAL <- as.Date(paste0(pm$CAL, "-01-01")) +pm$AGE <- pm$AGE*365.25 + +BL <- list(FUT = seq(0, 8, 1/12)*365.25) +eBL <- list(FUT = c(BL$FUT, c(8.25,8.5,9:60)*365.25)) +smd <- survmean(FUT ~ group, data = x, + pophaz = pm, verbose = TRUE, r = "auto5", + breaks = BL, e1.breaks = eBL) +plot(smd) +} + + +} +\seealso{ +Other survmean functions: \code{\link{lines.survmean}}, + \code{\link{plot.survmean}} + +Other main functions: \code{\link{rate}}, + \code{\link{relpois_ag}}, \code{\link{relpois}}, + \code{\link{sirspline}}, \code{\link{sir}}, + \code{\link{survtab_ag}}, \code{\link{survtab}} +} +\author{ +Joonas Miettinen +} diff --git a/man/survtab.Rd b/man/survtab.Rd index 6e8861d..cbe3b06 100644 --- a/man/survtab.Rd +++ b/man/survtab.Rd @@ -1,308 +1,308 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/survival_lexis.R -\name{survtab} -\alias{survtab} -\title{Estimate Survival Time Functions} -\usage{ -survtab(formula, data, adjust = NULL, breaks = NULL, pophaz = NULL, - weights = NULL, surv.type = "surv.rel", surv.method = "hazard", - relsurv.method = "e2", subset = NULL, conf.level = 0.95, - conf.type = "log-log", verbose = FALSE) -} -\arguments{ -\item{formula}{a \code{formula}; e.g. \code{fot ~ sex}, -where \code{fot} is the time scale over which you wish to estimate a -survival time function; this -assumes that \code{lex.Xst} in your data is the status variable in the -intended format (almost always right). -To be explicit, use \code{\link[survival]{Surv}}: e.g. -\code{Surv(fot, lex.Xst) ~ sex}. -Variables on the right-hand side of the formula -separated by \code{+} are considered stratifying variables, for which -estimates are computed separately. May contain usage of \code{adjust()} ---- see Details and Examples.} - -\item{data}{a \code{Lexis} object with at least the survival time scale} - -\item{adjust}{can be used as an alternative to passing variables to -argument \code{formula} within a call to \code{adjust()}; e.g. -\code{adjust = "agegr"}. \link[=flexible_argument]{Flexible input}.} - -\item{breaks}{a named list of breaks, e.g. -\code{list(FUT = 0:5)}. If data is not split in advance, \code{breaks} -must at the very least contain a vector of breaks to split the survival time -scale (mentioned in argument \code{formula}). If data has already been split -(using e.g. \code{\link{splitMulti}}) along at least the used survival time -scale, this may be \code{NULL}. It is generally recommended (and sufficient; -see Seppa, Dyban and Hakulinen (2015)) to use monthly -intervals where applicable.} - -\item{pophaz}{a \code{data.frame} containing -expected hazards for the event of interest to occur. See the -\link[=pophaz]{dedicated help page}. Required when -\code{surv.type = "surv.rel"} or \code{"cif.rel"}. \code{pophaz} must -contain one column named \code{"haz"}, and any number of other columns -identifying levels of variables to do a merge with split data within -\code{survtab}. Some columns may be time scales, which will -allow for the expected hazard to vary by e.g. calendar time and age.} - -\item{weights}{typically a list of weights or a \code{character} string -specifying an age group standardization scheme; see -the \link[=direct_standardization]{dedicated help page} -and examples. NOTE: \code{weights = "internal"} is based on the counts -of persons in follow-up at the start of follow-up (typically T = 0)} - -\item{surv.type}{one of \code{'surv.obs'}, -\code{'surv.cause'}, \code{'surv.rel'}, -\code{'cif.obs'} or \code{'cif.rel'}; -defines what kind of survival time function(s) is/are estimated; see Details} - -\item{surv.method}{either \code{'lifetable'} or \code{'hazard'}; determines -the method of calculating survival time functions, where the former computes -ratios such as \code{p = d/(n - n.cens)} -and the latter utilizes subject-times -(typically person-years) for hazard estimates such as \code{d/pyrs} -which are used to compute survival time function estimates. -The former method requires argument \code{n.cens} and the latter -argument \code{pyrs} to be supplied.} - -\item{relsurv.method}{either \code{'e2'} or \code{'pp'}; -defines whether to compute relative survival using the -EdererII method or using Pohar-Perme weighting; -ignored if \code{surv.type != "surv.rel"}} - -\item{subset}{a logical condition; e.g. \code{subset = sex == 1}; -subsets the data before computations} - -\item{conf.level}{confidence level used in confidence intervals; -e.g. \code{0.95} for 95 percent confidence intervals} - -\item{conf.type}{character string; must be one of \code{"plain"}, -\code{"log-log"} and \code{"log"}; -defines the transformation used on the survival time -function to yield confidence -intervals via the delta method} - -\item{verbose}{logical; if \code{TRUE}, the function is chatty and -returns some messages and timings along the process} -} -\value{ -Returns a table of life time function values and other -information with survival intervals as rows. -Returns some of the following estimates of survival time functions: - -\itemize{ - \item \code{surv.obs} - observed (raw, overall) survival - \item \code{surv.obs.K} - observed cause-specific survival for cause K - \item \code{CIF_k} - cumulative incidence function for cause \code{k} - \item \code{CIF.rel} - cumulative incidence function using excess cases - \item \code{r.e2} - relative survival, EdererII - \item \code{r.pp} - relative survival, Pohar-Perme weighted -} -The suffix \code{.as} implies adjusted estimates, and \code{.lo} and -\code{.hi} imply lower and upper confidence limits, respectively. -The prefix \code{SE.} stands for standard error. -} -\description{ -This function estimates survival time functions: survival, -relative/net survival, and crude/absolute risk functions (CIF). -} -\section{Basics}{ - - -This function computes interval-based estimates of survival time functions, -where the intervals are set by the user. For product-limit-based -estimation see packages \pkg{survival} and \pkg{relsurv}. - -if \code{surv.type = 'surv.obs'}, only 'raw' observed survival -is estimated over the chosen time intervals. With -\code{surv.type = 'surv.rel'}, also relative survival estimates -are supplied in addition to observed survival figures. - -\code{surv.type = 'cif.obs'} requests cumulative incidence functions (CIF) -to be estimated. -CIFs are estimated for each competing risk based -on a survival-interval-specific proportional hazards -assumption as described by Chiang (1968). -With \code{surv.type = 'cif.rel'}, a CIF is estimated with using -excess cases as the ''cause-specific'' cases. Finally, with -\code{surv.type = 'surv.cause'}, cause-specific survivals are -estimated separately for each separate type of event. - -In hazard-based estimation (\code{surv.method = "hazard"}) survival -time functions are transformations of the estimated corresponding hazard -in the intervals. The hazard itself is estimated using counts of events -(or excess events) and total subject-time in the interval. Life table -\code{surv.method = "lifetable"} estimates are constructed as transformations -of probabilities computed using counts of events and counts of subjects -at risk. - - -The vignette \href{../doc/survtab_examples.html}{survtab_examples} -has some practical examples. -} - -\section{Relative survival}{ - - -When \code{surv.type = 'surv.rel'}, the user can choose -\code{relsurv.method = 'pp'}, whereupon Pohar-Perme weighting is used. -By default \code{relsurv.method = 'e2'}, i.e. the Ederer II method -is used to estimate relative survival. -} - -\section{Adjusted estimates}{ - - -Adjusted estimates in this context mean computing estimates separately -by the levels of adjusting variables and returning weighted averages -of the estimates. For example, computing estimates separately by -age groups and returning a weighted average estimate (age-adjusted estimate). - -Adjusting requires specification of both the adjusting variables and -the weights for all the levels of the adjusting variables. The former can be -accomplished by using \code{adjust()} with the argument \code{formula}, -or by supplying variables directly to argument \code{adjust}. E.g. the -following are all equivalent: - -\code{formula = fot ~ sex + adjust(agegr) + adjust(area)} - -\code{formula = fot ~ sex + adjust(agegr, area)} - -\code{formula = fot ~ sex, adjust = c("agegr", "area")} - -\code{formula = fot ~ sex, adjust = list(agegr, area)} - -The adjusting variables must match with the variable names in the -argument \code{weights}; -see the \link[=direct_standardization]{dedicated help page}. -Typically weights are supplied as a \code{list} or -a \code{data.frame}. The former can be done by e.g. - -\code{weights = list(agegr = VEC1, area = VEC2)}, - -where \code{VEC1} and \code{VEC2} are vectors of weights (which do not -have to add up to one). See -\href{../doc/survtab_examples.html}{survtab_examples} -for an example of using a \code{data.frame} to pass weights. -} - -\section{Period analysis and other data selection schemes}{ - - -To calculate e.g. period analysis (delayed entry) estimates, -limit the data when/before supplying to this function.See -\href{../doc/survtab_examples.html}{survtab_examples}. -} - -\examples{ -\dontrun{ -data("sire", package = "popEpi") -library(Epi) -library(survival) - -## NOTE: recommended to use factor status variable -x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), - exit = list(CAL = get.yrs(ex_date)), - data = sire[sire$dg_date < sire$ex_date, ], - exit.status = factor(status, levels = 0:2, - labels = c("alive", "canD", "othD")), - merge = TRUE) - -## phony group variable -set.seed(1L) -x$group <- rbinom(nrow(x), 1, 0.5) - -## observed survival. explicit supplying of status: -st <- survtab(Surv(time = FUT, event = lex.Xst) ~ group, data = x, - surv.type = "surv.obs", - breaks = list(FUT = seq(0, 5, 1/12))) -## this assumes the status is lex.Xst (right 99.9 \% of the time) -st <- survtab(FUT ~ group, data = x, - surv.type = "surv.obs", - breaks = list(FUT = seq(0, 5, 1/12))) - -## relative survival (ederer II) -data("popmort", package = "popEpi") -pm <- data.frame(popmort) -names(pm) <- c("sex", "CAL", "AGE", "haz") -st <- survtab(FUT ~ group, data = x, - surv.type = "surv.rel", - pophaz = pm, - breaks = list(FUT = seq(0, 5, 1/12))) - -## ICSS weights usage -data("ICSS", package = "popEpi") -cut <- c(0, 30, 50, 70, Inf) -agegr <- cut(ICSS$age, cut, right = FALSE) -w <- aggregate(ICSS1~agegr, data = ICSS, FUN = sum) -x$agegr <- cut(x$dg_age, cut, right = FALSE) -st <- survtab(FUT ~ group + adjust(agegr), data = x, - surv.type = "surv.rel", - pophaz = pm, weights = w$ICSS1, - breaks = list(FUT = seq(0, 5, 1/12))) - -#### using dates with survtab -x <- Lexis(entry = list(FUT = 0L, AGE = dg_date-bi_date, CAL = dg_date), - exit = list(CAL = ex_date), - data = sire[sire$dg_date < sire$ex_date, ], - exit.status = factor(status, levels = 0:2, - labels = c("alive", "canD", "othD")), - merge = TRUE) -## phony group variable -set.seed(1L) -x$group <- rbinom(nrow(x), 1, 0.5) - -st <- survtab(Surv(time = FUT, event = lex.Xst) ~ group, data = x, - surv.type = "surv.obs", - breaks = list(FUT = seq(0, 5, 1/12)*365.25)) - -## NOTE: population hazard should be reported at the same scale -## as time variables in your Lexis data. -data(popmort, package = "popEpi") -pm <- data.frame(popmort) -names(pm) <- c("sex", "CAL", "AGE", "haz") -## from year to day level -pm$haz <- pm$haz/365.25 -pm$CAL <- as.Date(paste0(pm$CAL, "-01-01")) -pm$AGE <- pm$AGE*365.25 - -st <- survtab(Surv(time = FUT, event = lex.Xst) ~ group, data = x, - surv.type = "surv.rel", relsurv.method = "e2", - pophaz = pm, - breaks = list(FUT = seq(0, 5, 1/12)*365.25)) -} -} -\references{ -Perme, Maja Pohar, Janez Stare, and Jacques Estève. -"On estimation in relative survival." Biometrics 68.1 (2012): 113-120. - -Hakulinen, Timo, Karri Seppa, and Paul C. Lambert. -"Choosing the relative survival method for cancer survival estimation." -European Journal of Cancer 47.14 (2011): 2202-2210. - -Seppa, Karri, Timo Hakulinen, and Arun Pokhrel. -"Choosing the net survival method for cancer survival estimation." -European Journal of Cancer (2013). - -CHIANG, Chin Long. Introduction to stochastic processes in biostatistics. -1968. - -Seppa K., Dyba T. and Hakulinen T.: Cancer Survival, -Reference Module in Biomedical Sciences. Elsevier. 08-Jan-2015 -doi: 10.1016/B978-0-12-801238-3.02745-8. -} -\seealso{ -\code{\link{splitMulti}}, \code{\link{lexpand}}, -\code{\link{ICSS}}, \code{\link{sire}} -\href{../doc/survtab_examples.html}{The survtab_examples vignette} - -Other main functions: \code{\link{rate}}, - \code{\link{relpois_ag}}, \code{\link{relpois}}, - \code{\link{sirspline}}, \code{\link{sir}}, - \code{\link{survmean}}, \code{\link{survtab_ag}} - -Other survtab functions: \code{\link{lines.survtab}}, - \code{\link{plot.survtab}}, \code{\link{print.survtab}}, - \code{\link{summary.survtab}}, \code{\link{survtab_ag}} -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/survival_lexis.R +\name{survtab} +\alias{survtab} +\title{Estimate Survival Time Functions} +\usage{ +survtab(formula, data, adjust = NULL, breaks = NULL, pophaz = NULL, + weights = NULL, surv.type = "surv.rel", surv.method = "hazard", + relsurv.method = "e2", subset = NULL, conf.level = 0.95, + conf.type = "log-log", verbose = FALSE) +} +\arguments{ +\item{formula}{a \code{formula}; e.g. \code{fot ~ sex}, +where \code{fot} is the time scale over which you wish to estimate a +survival time function; this +assumes that \code{lex.Xst} in your data is the status variable in the +intended format (almost always right). +To be explicit, use \code{\link[survival]{Surv}}: e.g. +\code{Surv(fot, lex.Xst) ~ sex}. +Variables on the right-hand side of the formula +separated by \code{+} are considered stratifying variables, for which +estimates are computed separately. May contain usage of \code{adjust()} +--- see Details and Examples.} + +\item{data}{a \code{Lexis} object with at least the survival time scale} + +\item{adjust}{can be used as an alternative to passing variables to +argument \code{formula} within a call to \code{adjust()}; e.g. +\code{adjust = "agegr"}. \link[=flexible_argument]{Flexible input}.} + +\item{breaks}{a named list of breaks, e.g. +\code{list(FUT = 0:5)}. If data is not split in advance, \code{breaks} +must at the very least contain a vector of breaks to split the survival time +scale (mentioned in argument \code{formula}). If data has already been split +(using e.g. \code{\link{splitMulti}}) along at least the used survival time +scale, this may be \code{NULL}. It is generally recommended (and sufficient; +see Seppa, Dyban and Hakulinen (2015)) to use monthly +intervals where applicable.} + +\item{pophaz}{a \code{data.frame} containing +expected hazards for the event of interest to occur. See the +\link[=pophaz]{dedicated help page}. Required when +\code{surv.type = "surv.rel"} or \code{"cif.rel"}. \code{pophaz} must +contain one column named \code{"haz"}, and any number of other columns +identifying levels of variables to do a merge with split data within +\code{survtab}. Some columns may be time scales, which will +allow for the expected hazard to vary by e.g. calendar time and age.} + +\item{weights}{typically a list of weights or a \code{character} string +specifying an age group standardization scheme; see +the \link[=direct_standardization]{dedicated help page} +and examples. NOTE: \code{weights = "internal"} is based on the counts +of persons in follow-up at the start of follow-up (typically T = 0)} + +\item{surv.type}{one of \code{'surv.obs'}, +\code{'surv.cause'}, \code{'surv.rel'}, +\code{'cif.obs'} or \code{'cif.rel'}; +defines what kind of survival time function(s) is/are estimated; see Details} + +\item{surv.method}{either \code{'lifetable'} or \code{'hazard'}; determines +the method of calculating survival time functions, where the former computes +ratios such as \code{p = d/(n - n.cens)} +and the latter utilizes subject-times +(typically person-years) for hazard estimates such as \code{d/pyrs} +which are used to compute survival time function estimates. +The former method requires argument \code{n.cens} and the latter +argument \code{pyrs} to be supplied.} + +\item{relsurv.method}{either \code{'e2'} or \code{'pp'}; +defines whether to compute relative survival using the +EdererII method or using Pohar-Perme weighting; +ignored if \code{surv.type != "surv.rel"}} + +\item{subset}{a logical condition; e.g. \code{subset = sex == 1}; +subsets the data before computations} + +\item{conf.level}{confidence level used in confidence intervals; +e.g. \code{0.95} for 95 percent confidence intervals} + +\item{conf.type}{character string; must be one of \code{"plain"}, +\code{"log-log"} and \code{"log"}; +defines the transformation used on the survival time +function to yield confidence +intervals via the delta method} + +\item{verbose}{logical; if \code{TRUE}, the function is chatty and +returns some messages and timings along the process} +} +\value{ +Returns a table of life time function values and other +information with survival intervals as rows. +Returns some of the following estimates of survival time functions: + +\itemize{ + \item \code{surv.obs} - observed (raw, overall) survival + \item \code{surv.obs.K} - observed cause-specific survival for cause K + \item \code{CIF_k} - cumulative incidence function for cause \code{k} + \item \code{CIF.rel} - cumulative incidence function using excess cases + \item \code{r.e2} - relative survival, EdererII + \item \code{r.pp} - relative survival, Pohar-Perme weighted +} +The suffix \code{.as} implies adjusted estimates, and \code{.lo} and +\code{.hi} imply lower and upper confidence limits, respectively. +The prefix \code{SE.} stands for standard error. +} +\description{ +This function estimates survival time functions: survival, +relative/net survival, and crude/absolute risk functions (CIF). +} +\section{Basics}{ + + +This function computes interval-based estimates of survival time functions, +where the intervals are set by the user. For product-limit-based +estimation see packages \pkg{survival} and \pkg{relsurv}. + +if \code{surv.type = 'surv.obs'}, only 'raw' observed survival +is estimated over the chosen time intervals. With +\code{surv.type = 'surv.rel'}, also relative survival estimates +are supplied in addition to observed survival figures. + +\code{surv.type = 'cif.obs'} requests cumulative incidence functions (CIF) +to be estimated. +CIFs are estimated for each competing risk based +on a survival-interval-specific proportional hazards +assumption as described by Chiang (1968). +With \code{surv.type = 'cif.rel'}, a CIF is estimated with using +excess cases as the ''cause-specific'' cases. Finally, with +\code{surv.type = 'surv.cause'}, cause-specific survivals are +estimated separately for each separate type of event. + +In hazard-based estimation (\code{surv.method = "hazard"}) survival +time functions are transformations of the estimated corresponding hazard +in the intervals. The hazard itself is estimated using counts of events +(or excess events) and total subject-time in the interval. Life table +\code{surv.method = "lifetable"} estimates are constructed as transformations +of probabilities computed using counts of events and counts of subjects +at risk. + + +The vignette \href{../doc/survtab_examples.html}{survtab_examples} +has some practical examples. +} + +\section{Relative survival}{ + + +When \code{surv.type = 'surv.rel'}, the user can choose +\code{relsurv.method = 'pp'}, whereupon Pohar-Perme weighting is used. +By default \code{relsurv.method = 'e2'}, i.e. the Ederer II method +is used to estimate relative survival. +} + +\section{Adjusted estimates}{ + + +Adjusted estimates in this context mean computing estimates separately +by the levels of adjusting variables and returning weighted averages +of the estimates. For example, computing estimates separately by +age groups and returning a weighted average estimate (age-adjusted estimate). + +Adjusting requires specification of both the adjusting variables and +the weights for all the levels of the adjusting variables. The former can be +accomplished by using \code{adjust()} with the argument \code{formula}, +or by supplying variables directly to argument \code{adjust}. E.g. the +following are all equivalent: + +\code{formula = fot ~ sex + adjust(agegr) + adjust(area)} + +\code{formula = fot ~ sex + adjust(agegr, area)} + +\code{formula = fot ~ sex, adjust = c("agegr", "area")} + +\code{formula = fot ~ sex, adjust = list(agegr, area)} + +The adjusting variables must match with the variable names in the +argument \code{weights}; +see the \link[=direct_standardization]{dedicated help page}. +Typically weights are supplied as a \code{list} or +a \code{data.frame}. The former can be done by e.g. + +\code{weights = list(agegr = VEC1, area = VEC2)}, + +where \code{VEC1} and \code{VEC2} are vectors of weights (which do not +have to add up to one). See +\href{../doc/survtab_examples.html}{survtab_examples} +for an example of using a \code{data.frame} to pass weights. +} + +\section{Period analysis and other data selection schemes}{ + + +To calculate e.g. period analysis (delayed entry) estimates, +limit the data when/before supplying to this function.See +\href{../doc/survtab_examples.html}{survtab_examples}. +} + +\examples{ +\dontrun{ +data("sire", package = "popEpi") +library(Epi) +library(survival) + +## NOTE: recommended to use factor status variable +x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), + exit = list(CAL = get.yrs(ex_date)), + data = sire[sire$dg_date < sire$ex_date, ], + exit.status = factor(status, levels = 0:2, + labels = c("alive", "canD", "othD")), + merge = TRUE) + +## phony group variable +set.seed(1L) +x$group <- rbinom(nrow(x), 1, 0.5) + +## observed survival. explicit supplying of status: +st <- survtab(Surv(time = FUT, event = lex.Xst) ~ group, data = x, + surv.type = "surv.obs", + breaks = list(FUT = seq(0, 5, 1/12))) +## this assumes the status is lex.Xst (right 99.9 \% of the time) +st <- survtab(FUT ~ group, data = x, + surv.type = "surv.obs", + breaks = list(FUT = seq(0, 5, 1/12))) + +## relative survival (ederer II) +data("popmort", package = "popEpi") +pm <- data.frame(popmort) +names(pm) <- c("sex", "CAL", "AGE", "haz") +st <- survtab(FUT ~ group, data = x, + surv.type = "surv.rel", + pophaz = pm, + breaks = list(FUT = seq(0, 5, 1/12))) + +## ICSS weights usage +data("ICSS", package = "popEpi") +cut <- c(0, 30, 50, 70, Inf) +agegr <- cut(ICSS$age, cut, right = FALSE) +w <- aggregate(ICSS1~agegr, data = ICSS, FUN = sum) +x$agegr <- cut(x$dg_age, cut, right = FALSE) +st <- survtab(FUT ~ group + adjust(agegr), data = x, + surv.type = "surv.rel", + pophaz = pm, weights = w$ICSS1, + breaks = list(FUT = seq(0, 5, 1/12))) + +#### using dates with survtab +x <- Lexis(entry = list(FUT = 0L, AGE = dg_date-bi_date, CAL = dg_date), + exit = list(CAL = ex_date), + data = sire[sire$dg_date < sire$ex_date, ], + exit.status = factor(status, levels = 0:2, + labels = c("alive", "canD", "othD")), + merge = TRUE) +## phony group variable +set.seed(1L) +x$group <- rbinom(nrow(x), 1, 0.5) + +st <- survtab(Surv(time = FUT, event = lex.Xst) ~ group, data = x, + surv.type = "surv.obs", + breaks = list(FUT = seq(0, 5, 1/12)*365.25)) + +## NOTE: population hazard should be reported at the same scale +## as time variables in your Lexis data. +data(popmort, package = "popEpi") +pm <- data.frame(popmort) +names(pm) <- c("sex", "CAL", "AGE", "haz") +## from year to day level +pm$haz <- pm$haz/365.25 +pm$CAL <- as.Date(paste0(pm$CAL, "-01-01")) +pm$AGE <- pm$AGE*365.25 + +st <- survtab(Surv(time = FUT, event = lex.Xst) ~ group, data = x, + surv.type = "surv.rel", relsurv.method = "e2", + pophaz = pm, + breaks = list(FUT = seq(0, 5, 1/12)*365.25)) +} +} +\references{ +Perme, Maja Pohar, Janez Stare, and Jacques Estève. +"On estimation in relative survival." Biometrics 68.1 (2012): 113-120. + +Hakulinen, Timo, Karri Seppa, and Paul C. Lambert. +"Choosing the relative survival method for cancer survival estimation." +European Journal of Cancer 47.14 (2011): 2202-2210. + +Seppa, Karri, Timo Hakulinen, and Arun Pokhrel. +"Choosing the net survival method for cancer survival estimation." +European Journal of Cancer (2013). + +CHIANG, Chin Long. Introduction to stochastic processes in biostatistics. +1968. + +Seppa K., Dyba T. and Hakulinen T.: Cancer Survival, +Reference Module in Biomedical Sciences. Elsevier. 08-Jan-2015 +doi: 10.1016/B978-0-12-801238-3.02745-8. +} +\seealso{ +\code{\link{splitMulti}}, \code{\link{lexpand}}, +\code{\link{ICSS}}, \code{\link{sire}} +\href{../doc/survtab_examples.html}{The survtab_examples vignette} + +Other main functions: \code{\link{rate}}, + \code{\link{relpois_ag}}, \code{\link{relpois}}, + \code{\link{sirspline}}, \code{\link{sir}}, + \code{\link{survmean}}, \code{\link{survtab_ag}} + +Other survtab functions: \code{\link{lines.survtab}}, + \code{\link{plot.survtab}}, \code{\link{print.survtab}}, + \code{\link{summary.survtab}}, \code{\link{survtab_ag}} +} diff --git a/man/survtab_ag.Rd b/man/survtab_ag.Rd index 5e38104..8627f17 100644 --- a/man/survtab_ag.Rd +++ b/man/survtab_ag.Rd @@ -1,391 +1,391 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/survival_aggregated.R -\name{survtab_ag} -\alias{survtab_ag} -\title{Estimate Survival Time Functions} -\usage{ -survtab_ag(formula = NULL, data, adjust = NULL, weights = NULL, - surv.breaks = NULL, n = "at.risk", d = "from0to1", - n.cens = "from0to0", pyrs = "pyrs", d.exp = "d.exp", n.pp = NULL, - d.pp = "d.pp", d.pp.2 = "d.pp.2", n.cens.pp = "n.cens.pp", - pyrs.pp = "pyrs.pp", d.exp.pp = "d.exp.pp", surv.type = "surv.rel", - surv.method = "hazard", relsurv.method = "e2", subset = NULL, - conf.level = 0.95, conf.type = "log-log", verbose = FALSE) -} -\arguments{ -\item{formula}{a \code{formula}; the response -must be the time scale to compute survival time function estimates -over, e.g. \code{fot ~ sex}. Variables on the right-hand side of the formula -separated by \code{+} are considered stratifying variables, for which -estimates are computed separately. May contain usage of \code{adjust()} ---- see Details and Examples.} - -\item{data}{since popEpi 0.4.0, a \code{data.frame} -containing variables used in \code{formula} and other arguments. -\code{aggre} objects are recommended as they contain information on any -time scales and are therefore safer; for creating \code{aggre} objects see -\code{\link{as.aggre}} when your data is already aggregated and \code{aggre} -for aggregating split \code{Lexis} objects.} - -\item{adjust}{can be used as an alternative to passing variables to -argument \code{formula} within a call to \code{adjust()}; e.g. -\code{adjust = "agegr"}. \link[=flexible_argument]{Flexible input}.} - -\item{weights}{typically a list of weights or a \code{character} string -specifying an age group standardization scheme; see -the \link[=direct_standardization]{dedicated help page} -and examples. NOTE: \code{weights = "internal"} is based on the counts -of persons in follow-up at the start of follow-up (typically T = 0)} - -\item{surv.breaks}{a vector of breaks on the -survival time scale. Optional if \code{data} is an \code{aggre} object -and mandatory otherwise. Must define each intended interval; -e.g. \code{surv.breaks = 0:5} when data has intervals defined by -breaks \code{seq(0, 5, 1/12)} will aggregate to wider intervals first. -It is generally recommended (and sufficient; -see Seppa, Dyban and Hakulinen (2015)) to use monthly -intervals where applicable.} - -\item{n}{variable containing counts of subjects at-risk at the start of a -time interval; e.g. \code{n = "at.risk"}. -Required when \code{surv.method = "lifetable"}. -\link[=flexible_argument]{Flexible input}.} - -\item{d}{variable(s) containing counts of subjects experiencing an event. -With only one type of event, e.g. \code{d = "deaths"}. With multiple types of -events (for CIF or cause-specific survival estimation), supply e.g. -\code{d = c("canD", "othD")}. If the survival time function to be estimated -does not use multiple types of events, supplying more than one variable -to \code{d} simply causes the variables to be added together. -Always required. \link[=flexible_argument]{Flexible input}.} - -\item{n.cens}{variable containing counts of subjects censored during a -survival time interval; E.g. \code{n.cens = "alive"}. -Required when \code{surv.method = "lifetable"}. -\link[=flexible_argument]{Flexible input}.} - -\item{pyrs}{variable containing total subject-time accumulated within a -survival time interval; E.g. \code{pyrs = "pyrs"}. -Required when \code{surv.method = "hazard"}. Flexible input.} - -\item{d.exp}{variable denoting total "expected numbers of events" -(typically computed \code{pyrs * pop.haz}, where -\code{pop.haz} is the expected hazard level) -accumulated within a survival time interval; E.g. \code{pyrs = "pyrs"}. -Required when computing EdererII relative survivals or -CIFs based on excess counts of events. Flexible input.} - -\item{n.pp}{variable containing total Pohar-Perme weighted counts of -subjects at risk in an interval, -supplied as argument \code{n} is supplied. -Computed originally on the subject -level as analogous to \code{pp * as.integer(status == "at-risk")}. -Required when \code{relsurv.method = "pp"}. Flexible input.} - -\item{d.pp}{variable(s) containing Pohar-Perme weighted counts of events, -supplied as argument \code{d} is supplied. Computed originally on the subject -level as analogous to \code{pp * as.integer(status == some_event)}. -Required when \code{relsurv.method = "pp"}. Flexible input.} - -\item{d.pp.2}{variable(s) containing total Pohar-Perme -"double-weighted" counts of events, -supplied as argument \code{d} is supplied. Computed originally on the subject -level as analogous to \code{pp * pp * as.integer(status == some_event)}. -Required when \code{relsurv.method = "pp"}. Flexible input.} - -\item{n.cens.pp}{variable containing total Pohar-Perme weighted counts -censorings, -supplied as argument \code{n.cens} is supplied. -Computed originally on the subject -level as analogous to \code{pp * as.integer(status == "censored")}. -Required when \code{relsurv.method = "pp"}. Flexible input.} - -\item{pyrs.pp}{variable containing total Pohar-Perme weighted subject-times, -supplied as argument \code{pyrs} is supplied. -Computed originally on the subject -level as analogous to \code{pp * pyrs}. -Required when \code{relsurv.method = "pp"}. Flexible input.} - -\item{d.exp.pp}{variable containing total Pohar-Perme weighted counts -of excess events, -supplied as argument \code{pyrs} is supplied. -Computed originally on the subject -level as analogous to \code{pp * d.exp}. -Required when \code{relsurv.method = "pp"}. Flexible input.} - -\item{surv.type}{one of \code{'surv.obs'}, -\code{'surv.cause'}, \code{'surv.rel'}, -\code{'cif.obs'} or \code{'cif.rel'}; -defines what kind of survival time function(s) is/are estimated; see Details} - -\item{surv.method}{either \code{'lifetable'} or \code{'hazard'}; determines -the method of calculating survival time functions, where the former computes -ratios such as \code{p = d/(n - n.cens)} -and the latter utilizes subject-times -(typically person-years) for hazard estimates such as \code{d/pyrs} -which are used to compute survival time function estimates. -The former method requires argument \code{n.cens} and the latter -argument \code{pyrs} to be supplied.} - -\item{relsurv.method}{either \code{'e2'} or \code{'pp'}; -defines whether to compute relative survival using the -EdererII method or using Pohar-Perme weighting; -ignored if \code{surv.type != "surv.rel"}} - -\item{subset}{a logical condition; e.g. \code{subset = sex == 1}; -subsets the data before computations} - -\item{conf.level}{confidence level used in confidence intervals; -e.g. \code{0.95} for 95 percent confidence intervals} - -\item{conf.type}{character string; must be one of \code{"plain"}, -\code{"log-log"} and \code{"log"}; -defines the transformation used on the survival time -function to yield confidence -intervals via the delta method} - -\item{verbose}{logical; if \code{TRUE}, the function is chatty and -returns some messages and timings along the process} -} -\value{ -Returns a table of life time function values and other -information with survival intervals as rows. -Returns some of the following estimates of survival time functions: - -\itemize{ - \item \code{surv.obs} - observed (raw, overall) survival - \item \code{surv.obs.K} - observed cause-specific survival for cause K - \item \code{CIF_k} - cumulative incidence function for cause \code{k} - \item \code{CIF.rel} - cumulative incidence function using excess cases - \item \code{r.e2} - relative survival, EdererII - \item \code{r.pp} - relative survival, Pohar-Perme weighted -} -The suffix \code{.as} implies adjusted estimates, and \code{.lo} and -\code{.hi} imply lower and upper confidence limits, respectively. -The prefix \code{SE.} stands for standard error. -} -\description{ -This function estimates survival time functions: survival, -relative/net survival, and crude/absolute risk functions (CIF). -} -\section{Basics}{ - - -This function computes interval-based estimates of survival time functions, -where the intervals are set by the user. For product-limit-based -estimation see packages \pkg{survival} and \pkg{relsurv}. - -if \code{surv.type = 'surv.obs'}, only 'raw' observed survival -is estimated over the chosen time intervals. With -\code{surv.type = 'surv.rel'}, also relative survival estimates -are supplied in addition to observed survival figures. - -\code{surv.type = 'cif.obs'} requests cumulative incidence functions (CIF) -to be estimated. -CIFs are estimated for each competing risk based -on a survival-interval-specific proportional hazards -assumption as described by Chiang (1968). -With \code{surv.type = 'cif.rel'}, a CIF is estimated with using -excess cases as the ''cause-specific'' cases. Finally, with -\code{surv.type = 'surv.cause'}, cause-specific survivals are -estimated separately for each separate type of event. - -In hazard-based estimation (\code{surv.method = "hazard"}) survival -time functions are transformations of the estimated corresponding hazard -in the intervals. The hazard itself is estimated using counts of events -(or excess events) and total subject-time in the interval. Life table -\code{surv.method = "lifetable"} estimates are constructed as transformations -of probabilities computed using counts of events and counts of subjects -at risk. - - -The vignette \href{../doc/survtab_examples.html}{survtab_examples} -has some practical examples. -} - -\section{Relative survival}{ - - -When \code{surv.type = 'surv.rel'}, the user can choose -\code{relsurv.method = 'pp'}, whereupon Pohar-Perme weighting is used. -By default \code{relsurv.method = 'e2'}, i.e. the Ederer II method -is used to estimate relative survival. -} - -\section{Adjusted estimates}{ - - -Adjusted estimates in this context mean computing estimates separately -by the levels of adjusting variables and returning weighted averages -of the estimates. For example, computing estimates separately by -age groups and returning a weighted average estimate (age-adjusted estimate). - -Adjusting requires specification of both the adjusting variables and -the weights for all the levels of the adjusting variables. The former can be -accomplished by using \code{adjust()} with the argument \code{formula}, -or by supplying variables directly to argument \code{adjust}. E.g. the -following are all equivalent: - -\code{formula = fot ~ sex + adjust(agegr) + adjust(area)} - -\code{formula = fot ~ sex + adjust(agegr, area)} - -\code{formula = fot ~ sex, adjust = c("agegr", "area")} - -\code{formula = fot ~ sex, adjust = list(agegr, area)} - -The adjusting variables must match with the variable names in the -argument \code{weights}; -see the \link[=direct_standardization]{dedicated help page}. -Typically weights are supplied as a \code{list} or -a \code{data.frame}. The former can be done by e.g. - -\code{weights = list(agegr = VEC1, area = VEC2)}, - -where \code{VEC1} and \code{VEC2} are vectors of weights (which do not -have to add up to one). See -\href{../doc/survtab_examples.html}{survtab_examples} -for an example of using a \code{data.frame} to pass weights. -} - -\section{Period analysis and other data selection schemes}{ - - -To calculate e.g. period analysis (delayed entry) estimates, -limit the data when/before supplying to this function.See -\href{../doc/survtab_examples.html}{survtab_examples}. -} - -\section{Data requirements}{ - - -\code{survtab_ag} computes estimates of survival time functions using -pre-aggregated data. For using subject-level data directly, use -\code{\link{survtab}}. For aggregating data, see \code{\link{lexpand}} -and \code{\link{aggre}}. - -By default, and if data is an \code{aggre} object (not mandatory), -\code{survtab_ag} makes use of the exact same breaks that were used in -splitting the original data (with e.g. \code{lexpand}), so it is not -necessary to specify any \code{surv.breaks}. If specified, the -\code{surv.breaks} must be a subset of the pertinent -pre-existing breaks. When data is not an \code{aggre} object, breaks -must always be specified. Interval lengths (\code{delta} in output) are -also calculated based on whichever breaks are used, -so the upper limit of the breaks should -therefore be meaningful and never e.g. \code{Inf}. -} - -\examples{ -## see more examples with explanations in vignette("survtab_examples") - -#### survtab_ag usage - -data("sire", package = "popEpi") -## prepare data for e.g. 5-year "period analysis" for 2008-2012 -## note: sire is a simulated cohort integrated into popEpi. -BL <- list(fot=seq(0, 5, by = 1/12), - per = c("2008-01-01", "2013-01-01")) -x <- lexpand(sire, birth = bi_date, entry = dg_date, exit = ex_date, - status = status \%in\% 1:2, - breaks = BL, - pophaz = popmort, - aggre = list(fot)) - -## calculate relative EdererII period method -## NOTE: x is an aggre object here, so surv.breaks are deduced -## automatically -st <- survtab_ag(fot ~ 1, data = x) - -summary(st, t = 1:5) ## annual estimates -summary(st, q = list(r.e2 = 0.75)) ## 1st interval where r.e2 < 0.75 at end -\dontrun{ -plot(st) - - -## non-aggre data: first call to survtab_ag would fail -df <- data.frame(x) -# st <- survtab_ag(fot ~ 1, data = x) -st <- survtab_ag(fot ~ 1, data = x, surv.breaks = BL$fot) - -## calculate age-standardised 5-year relative survival ratio using -## Ederer II method and period approach - -sire$agegr <- cut(sire$dg_age,c(0,45,55,65,75,Inf),right=F) -BL <- list(fot=seq(0, 5, by = 1/12), - per = c("2008-01-01", "2013-01-01")) -x <- lexpand(sire, birth = bi_date, entry = dg_date, exit = ex_date, - status = status \%in\% 1:2, - breaks = BL, - pophaz = popmort, - aggre = list(agegr, fot)) - -## age standardisation using internal weights (age distribution of -## patients diagnosed within the period window) -## (NOTE: what is done here is equivalent to using weights = "internal") -w <- aggregate(at.risk ~ agegr, data = x[x$fot == 0], FUN = sum) -names(w) <- c("agegr", "weights") - -st <- survtab_ag(fot ~ adjust(agegr), data = x, weights = w) -plot(st, y = "r.e2.as", col = c("blue")) - -## age standardisation using ICSS1 weights -data(ICSS) -cut <- c(0, 45, 55, 65, 75, Inf) -agegr <- cut(ICSS$age, cut, right = FALSE) -w <- aggregate(ICSS1~agegr, data = ICSS, FUN = sum) -names(w) <- c("agegr", "weights") - -st <- survtab_ag(fot ~ adjust(agegr), data = x, weights = w) -lines(st, y = "r.e2.as", col = c("red")) - - -## cause-specific survival -sire$stat <- factor(sire$status, 0:2, c("alive", "canD", "othD")) -x <- lexpand(sire, birth = bi_date, entry = dg_date, exit = ex_date, - status = stat, - breaks = BL, - pophaz = popmort, - aggre = list(agegr, fot)) -st <- survtab_ag(fot ~ adjust(agegr), data = x, weights = w, - d = c("fromalivetocanD", "fromalivetoothD"), - surv.type = "surv.cause") -plot(st, y = "surv.obs.fromalivetocanD.as") -lines(st, y = "surv.obs.fromalivetoothD.as", col = "red") - - -} -} -\references{ -Perme, Maja Pohar, Janez Stare, and Jacques Estève. -"On estimation in relative survival." Biometrics 68.1 (2012): 113-120. - -Hakulinen, Timo, Karri Seppa, and Paul C. Lambert. -"Choosing the relative survival method for cancer survival estimation." -European Journal of Cancer 47.14 (2011): 2202-2210. - -Seppa, Karri, Timo Hakulinen, and Arun Pokhrel. -"Choosing the net survival method for cancer survival estimation." -European Journal of Cancer (2013). - -CHIANG, Chin Long. Introduction to stochastic processes in biostatistics. -1968. - -Seppa K., Dyba T. and Hakulinen T.: Cancer Survival, -Reference Module in Biomedical Sciences. Elsevier. 08-Jan-2015 -doi: 10.1016/B978-0-12-801238-3.02745-8. -} -\seealso{ -\code{\link{splitMulti}}, \code{\link{lexpand}}, -\code{\link{ICSS}}, \code{\link{sire}} -\href{../doc/survtab_examples.html}{The survtab_examples vignette} - -Other main functions: \code{\link{rate}}, - \code{\link{relpois_ag}}, \code{\link{relpois}}, - \code{\link{sirspline}}, \code{\link{sir}}, - \code{\link{survmean}}, \code{\link{survtab}} - -Other survtab functions: \code{\link{lines.survtab}}, - \code{\link{plot.survtab}}, \code{\link{print.survtab}}, - \code{\link{summary.survtab}}, \code{\link{survtab}} -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/survival_aggregated.R +\name{survtab_ag} +\alias{survtab_ag} +\title{Estimate Survival Time Functions} +\usage{ +survtab_ag(formula = NULL, data, adjust = NULL, weights = NULL, + surv.breaks = NULL, n = "at.risk", d = "from0to1", + n.cens = "from0to0", pyrs = "pyrs", d.exp = "d.exp", n.pp = NULL, + d.pp = "d.pp", d.pp.2 = "d.pp.2", n.cens.pp = "n.cens.pp", + pyrs.pp = "pyrs.pp", d.exp.pp = "d.exp.pp", surv.type = "surv.rel", + surv.method = "hazard", relsurv.method = "e2", subset = NULL, + conf.level = 0.95, conf.type = "log-log", verbose = FALSE) +} +\arguments{ +\item{formula}{a \code{formula}; the response +must be the time scale to compute survival time function estimates +over, e.g. \code{fot ~ sex}. Variables on the right-hand side of the formula +separated by \code{+} are considered stratifying variables, for which +estimates are computed separately. May contain usage of \code{adjust()} +--- see Details and Examples.} + +\item{data}{since popEpi 0.4.0, a \code{data.frame} +containing variables used in \code{formula} and other arguments. +\code{aggre} objects are recommended as they contain information on any +time scales and are therefore safer; for creating \code{aggre} objects see +\code{\link{as.aggre}} when your data is already aggregated and \code{aggre} +for aggregating split \code{Lexis} objects.} + +\item{adjust}{can be used as an alternative to passing variables to +argument \code{formula} within a call to \code{adjust()}; e.g. +\code{adjust = "agegr"}. \link[=flexible_argument]{Flexible input}.} + +\item{weights}{typically a list of weights or a \code{character} string +specifying an age group standardization scheme; see +the \link[=direct_standardization]{dedicated help page} +and examples. NOTE: \code{weights = "internal"} is based on the counts +of persons in follow-up at the start of follow-up (typically T = 0)} + +\item{surv.breaks}{a vector of breaks on the +survival time scale. Optional if \code{data} is an \code{aggre} object +and mandatory otherwise. Must define each intended interval; +e.g. \code{surv.breaks = 0:5} when data has intervals defined by +breaks \code{seq(0, 5, 1/12)} will aggregate to wider intervals first. +It is generally recommended (and sufficient; +see Seppa, Dyban and Hakulinen (2015)) to use monthly +intervals where applicable.} + +\item{n}{variable containing counts of subjects at-risk at the start of a +time interval; e.g. \code{n = "at.risk"}. +Required when \code{surv.method = "lifetable"}. +\link[=flexible_argument]{Flexible input}.} + +\item{d}{variable(s) containing counts of subjects experiencing an event. +With only one type of event, e.g. \code{d = "deaths"}. With multiple types of +events (for CIF or cause-specific survival estimation), supply e.g. +\code{d = c("canD", "othD")}. If the survival time function to be estimated +does not use multiple types of events, supplying more than one variable +to \code{d} simply causes the variables to be added together. +Always required. \link[=flexible_argument]{Flexible input}.} + +\item{n.cens}{variable containing counts of subjects censored during a +survival time interval; E.g. \code{n.cens = "alive"}. +Required when \code{surv.method = "lifetable"}. +\link[=flexible_argument]{Flexible input}.} + +\item{pyrs}{variable containing total subject-time accumulated within a +survival time interval; E.g. \code{pyrs = "pyrs"}. +Required when \code{surv.method = "hazard"}. Flexible input.} + +\item{d.exp}{variable denoting total "expected numbers of events" +(typically computed \code{pyrs * pop.haz}, where +\code{pop.haz} is the expected hazard level) +accumulated within a survival time interval; E.g. \code{pyrs = "pyrs"}. +Required when computing EdererII relative survivals or +CIFs based on excess counts of events. Flexible input.} + +\item{n.pp}{variable containing total Pohar-Perme weighted counts of +subjects at risk in an interval, +supplied as argument \code{n} is supplied. +Computed originally on the subject +level as analogous to \code{pp * as.integer(status == "at-risk")}. +Required when \code{relsurv.method = "pp"}. Flexible input.} + +\item{d.pp}{variable(s) containing Pohar-Perme weighted counts of events, +supplied as argument \code{d} is supplied. Computed originally on the subject +level as analogous to \code{pp * as.integer(status == some_event)}. +Required when \code{relsurv.method = "pp"}. Flexible input.} + +\item{d.pp.2}{variable(s) containing total Pohar-Perme +"double-weighted" counts of events, +supplied as argument \code{d} is supplied. Computed originally on the subject +level as analogous to \code{pp * pp * as.integer(status == some_event)}. +Required when \code{relsurv.method = "pp"}. Flexible input.} + +\item{n.cens.pp}{variable containing total Pohar-Perme weighted counts +censorings, +supplied as argument \code{n.cens} is supplied. +Computed originally on the subject +level as analogous to \code{pp * as.integer(status == "censored")}. +Required when \code{relsurv.method = "pp"}. Flexible input.} + +\item{pyrs.pp}{variable containing total Pohar-Perme weighted subject-times, +supplied as argument \code{pyrs} is supplied. +Computed originally on the subject +level as analogous to \code{pp * pyrs}. +Required when \code{relsurv.method = "pp"}. Flexible input.} + +\item{d.exp.pp}{variable containing total Pohar-Perme weighted counts +of excess events, +supplied as argument \code{pyrs} is supplied. +Computed originally on the subject +level as analogous to \code{pp * d.exp}. +Required when \code{relsurv.method = "pp"}. Flexible input.} + +\item{surv.type}{one of \code{'surv.obs'}, +\code{'surv.cause'}, \code{'surv.rel'}, +\code{'cif.obs'} or \code{'cif.rel'}; +defines what kind of survival time function(s) is/are estimated; see Details} + +\item{surv.method}{either \code{'lifetable'} or \code{'hazard'}; determines +the method of calculating survival time functions, where the former computes +ratios such as \code{p = d/(n - n.cens)} +and the latter utilizes subject-times +(typically person-years) for hazard estimates such as \code{d/pyrs} +which are used to compute survival time function estimates. +The former method requires argument \code{n.cens} and the latter +argument \code{pyrs} to be supplied.} + +\item{relsurv.method}{either \code{'e2'} or \code{'pp'}; +defines whether to compute relative survival using the +EdererII method or using Pohar-Perme weighting; +ignored if \code{surv.type != "surv.rel"}} + +\item{subset}{a logical condition; e.g. \code{subset = sex == 1}; +subsets the data before computations} + +\item{conf.level}{confidence level used in confidence intervals; +e.g. \code{0.95} for 95 percent confidence intervals} + +\item{conf.type}{character string; must be one of \code{"plain"}, +\code{"log-log"} and \code{"log"}; +defines the transformation used on the survival time +function to yield confidence +intervals via the delta method} + +\item{verbose}{logical; if \code{TRUE}, the function is chatty and +returns some messages and timings along the process} +} +\value{ +Returns a table of life time function values and other +information with survival intervals as rows. +Returns some of the following estimates of survival time functions: + +\itemize{ + \item \code{surv.obs} - observed (raw, overall) survival + \item \code{surv.obs.K} - observed cause-specific survival for cause K + \item \code{CIF_k} - cumulative incidence function for cause \code{k} + \item \code{CIF.rel} - cumulative incidence function using excess cases + \item \code{r.e2} - relative survival, EdererII + \item \code{r.pp} - relative survival, Pohar-Perme weighted +} +The suffix \code{.as} implies adjusted estimates, and \code{.lo} and +\code{.hi} imply lower and upper confidence limits, respectively. +The prefix \code{SE.} stands for standard error. +} +\description{ +This function estimates survival time functions: survival, +relative/net survival, and crude/absolute risk functions (CIF). +} +\section{Basics}{ + + +This function computes interval-based estimates of survival time functions, +where the intervals are set by the user. For product-limit-based +estimation see packages \pkg{survival} and \pkg{relsurv}. + +if \code{surv.type = 'surv.obs'}, only 'raw' observed survival +is estimated over the chosen time intervals. With +\code{surv.type = 'surv.rel'}, also relative survival estimates +are supplied in addition to observed survival figures. + +\code{surv.type = 'cif.obs'} requests cumulative incidence functions (CIF) +to be estimated. +CIFs are estimated for each competing risk based +on a survival-interval-specific proportional hazards +assumption as described by Chiang (1968). +With \code{surv.type = 'cif.rel'}, a CIF is estimated with using +excess cases as the ''cause-specific'' cases. Finally, with +\code{surv.type = 'surv.cause'}, cause-specific survivals are +estimated separately for each separate type of event. + +In hazard-based estimation (\code{surv.method = "hazard"}) survival +time functions are transformations of the estimated corresponding hazard +in the intervals. The hazard itself is estimated using counts of events +(or excess events) and total subject-time in the interval. Life table +\code{surv.method = "lifetable"} estimates are constructed as transformations +of probabilities computed using counts of events and counts of subjects +at risk. + + +The vignette \href{../doc/survtab_examples.html}{survtab_examples} +has some practical examples. +} + +\section{Relative survival}{ + + +When \code{surv.type = 'surv.rel'}, the user can choose +\code{relsurv.method = 'pp'}, whereupon Pohar-Perme weighting is used. +By default \code{relsurv.method = 'e2'}, i.e. the Ederer II method +is used to estimate relative survival. +} + +\section{Adjusted estimates}{ + + +Adjusted estimates in this context mean computing estimates separately +by the levels of adjusting variables and returning weighted averages +of the estimates. For example, computing estimates separately by +age groups and returning a weighted average estimate (age-adjusted estimate). + +Adjusting requires specification of both the adjusting variables and +the weights for all the levels of the adjusting variables. The former can be +accomplished by using \code{adjust()} with the argument \code{formula}, +or by supplying variables directly to argument \code{adjust}. E.g. the +following are all equivalent: + +\code{formula = fot ~ sex + adjust(agegr) + adjust(area)} + +\code{formula = fot ~ sex + adjust(agegr, area)} + +\code{formula = fot ~ sex, adjust = c("agegr", "area")} + +\code{formula = fot ~ sex, adjust = list(agegr, area)} + +The adjusting variables must match with the variable names in the +argument \code{weights}; +see the \link[=direct_standardization]{dedicated help page}. +Typically weights are supplied as a \code{list} or +a \code{data.frame}. The former can be done by e.g. + +\code{weights = list(agegr = VEC1, area = VEC2)}, + +where \code{VEC1} and \code{VEC2} are vectors of weights (which do not +have to add up to one). See +\href{../doc/survtab_examples.html}{survtab_examples} +for an example of using a \code{data.frame} to pass weights. +} + +\section{Period analysis and other data selection schemes}{ + + +To calculate e.g. period analysis (delayed entry) estimates, +limit the data when/before supplying to this function.See +\href{../doc/survtab_examples.html}{survtab_examples}. +} + +\section{Data requirements}{ + + +\code{survtab_ag} computes estimates of survival time functions using +pre-aggregated data. For using subject-level data directly, use +\code{\link{survtab}}. For aggregating data, see \code{\link{lexpand}} +and \code{\link{aggre}}. + +By default, and if data is an \code{aggre} object (not mandatory), +\code{survtab_ag} makes use of the exact same breaks that were used in +splitting the original data (with e.g. \code{lexpand}), so it is not +necessary to specify any \code{surv.breaks}. If specified, the +\code{surv.breaks} must be a subset of the pertinent +pre-existing breaks. When data is not an \code{aggre} object, breaks +must always be specified. Interval lengths (\code{delta} in output) are +also calculated based on whichever breaks are used, +so the upper limit of the breaks should +therefore be meaningful and never e.g. \code{Inf}. +} + +\examples{ +## see more examples with explanations in vignette("survtab_examples") + +#### survtab_ag usage + +data("sire", package = "popEpi") +## prepare data for e.g. 5-year "period analysis" for 2008-2012 +## note: sire is a simulated cohort integrated into popEpi. +BL <- list(fot=seq(0, 5, by = 1/12), + per = c("2008-01-01", "2013-01-01")) +x <- lexpand(sire, birth = bi_date, entry = dg_date, exit = ex_date, + status = status \%in\% 1:2, + breaks = BL, + pophaz = popmort, + aggre = list(fot)) + +## calculate relative EdererII period method +## NOTE: x is an aggre object here, so surv.breaks are deduced +## automatically +st <- survtab_ag(fot ~ 1, data = x) + +summary(st, t = 1:5) ## annual estimates +summary(st, q = list(r.e2 = 0.75)) ## 1st interval where r.e2 < 0.75 at end +\dontrun{ +plot(st) + + +## non-aggre data: first call to survtab_ag would fail +df <- data.frame(x) +# st <- survtab_ag(fot ~ 1, data = x) +st <- survtab_ag(fot ~ 1, data = x, surv.breaks = BL$fot) + +## calculate age-standardised 5-year relative survival ratio using +## Ederer II method and period approach + +sire$agegr <- cut(sire$dg_age,c(0,45,55,65,75,Inf),right=F) +BL <- list(fot=seq(0, 5, by = 1/12), + per = c("2008-01-01", "2013-01-01")) +x <- lexpand(sire, birth = bi_date, entry = dg_date, exit = ex_date, + status = status \%in\% 1:2, + breaks = BL, + pophaz = popmort, + aggre = list(agegr, fot)) + +## age standardisation using internal weights (age distribution of +## patients diagnosed within the period window) +## (NOTE: what is done here is equivalent to using weights = "internal") +w <- aggregate(at.risk ~ agegr, data = x[x$fot == 0], FUN = sum) +names(w) <- c("agegr", "weights") + +st <- survtab_ag(fot ~ adjust(agegr), data = x, weights = w) +plot(st, y = "r.e2.as", col = c("blue")) + +## age standardisation using ICSS1 weights +data(ICSS) +cut <- c(0, 45, 55, 65, 75, Inf) +agegr <- cut(ICSS$age, cut, right = FALSE) +w <- aggregate(ICSS1~agegr, data = ICSS, FUN = sum) +names(w) <- c("agegr", "weights") + +st <- survtab_ag(fot ~ adjust(agegr), data = x, weights = w) +lines(st, y = "r.e2.as", col = c("red")) + + +## cause-specific survival +sire$stat <- factor(sire$status, 0:2, c("alive", "canD", "othD")) +x <- lexpand(sire, birth = bi_date, entry = dg_date, exit = ex_date, + status = stat, + breaks = BL, + pophaz = popmort, + aggre = list(agegr, fot)) +st <- survtab_ag(fot ~ adjust(agegr), data = x, weights = w, + d = c("fromalivetocanD", "fromalivetoothD"), + surv.type = "surv.cause") +plot(st, y = "surv.obs.fromalivetocanD.as") +lines(st, y = "surv.obs.fromalivetoothD.as", col = "red") + + +} +} +\references{ +Perme, Maja Pohar, Janez Stare, and Jacques Estève. +"On estimation in relative survival." Biometrics 68.1 (2012): 113-120. + +Hakulinen, Timo, Karri Seppa, and Paul C. Lambert. +"Choosing the relative survival method for cancer survival estimation." +European Journal of Cancer 47.14 (2011): 2202-2210. + +Seppa, Karri, Timo Hakulinen, and Arun Pokhrel. +"Choosing the net survival method for cancer survival estimation." +European Journal of Cancer (2013). + +CHIANG, Chin Long. Introduction to stochastic processes in biostatistics. +1968. + +Seppa K., Dyba T. and Hakulinen T.: Cancer Survival, +Reference Module in Biomedical Sciences. Elsevier. 08-Jan-2015 +doi: 10.1016/B978-0-12-801238-3.02745-8. +} +\seealso{ +\code{\link{splitMulti}}, \code{\link{lexpand}}, +\code{\link{ICSS}}, \code{\link{sire}} +\href{../doc/survtab_examples.html}{The survtab_examples vignette} + +Other main functions: \code{\link{rate}}, + \code{\link{relpois_ag}}, \code{\link{relpois}}, + \code{\link{sirspline}}, \code{\link{sir}}, + \code{\link{survmean}}, \code{\link{survtab}} + +Other survtab functions: \code{\link{lines.survtab}}, + \code{\link{plot.survtab}}, \code{\link{print.survtab}}, + \code{\link{summary.survtab}}, \code{\link{survtab}} +} diff --git a/man/try2int.Rd b/man/try2int.Rd index 64720e1..546d1e9 100644 --- a/man/try2int.Rd +++ b/man/try2int.Rd @@ -1,27 +1,27 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utility_functions.R -\name{try2int} -\alias{try2int} -\title{Attempt coercion to integer} -\source{ -\href{http://stackoverflow.com/questions/3476782/how-to-check-if-the-number-is-integer}{Stackoverflow thread} -} -\usage{ -try2int(obj, tol = .Machine$double.eps^0.5) -} -\arguments{ -\item{obj}{a numeric vector} - -\item{tol}{tolerance; if each numeric value in \code{obj} deviate from -the corresponding integers at most the value of \code{tol}, they are considered -to be integers; e.g. by default \code{1 + .Machine$double.eps} is considered -to be an integer but \code{1 + .Machine$double.eps^0.49} is not.} -} -\description{ -Attempts to convert a numeric object to integer, -but won't if loss of information is imminent (if values after decimal -are not zero for even one value in \code{obj}) -} -\author{ -James Arnold -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utility_functions.R +\name{try2int} +\alias{try2int} +\title{Attempt coercion to integer} +\source{ +\href{http://stackoverflow.com/questions/3476782/how-to-check-if-the-number-is-integer}{Stackoverflow thread} +} +\usage{ +try2int(obj, tol = .Machine$double.eps^0.5) +} +\arguments{ +\item{obj}{a numeric vector} + +\item{tol}{tolerance; if each numeric value in \code{obj} deviate from +the corresponding integers at most the value of \code{tol}, they are considered +to be integers; e.g. by default \code{1 + .Machine$double.eps} is considered +to be an integer but \code{1 + .Machine$double.eps^0.49} is not.} +} +\description{ +Attempts to convert a numeric object to integer, +but won't if loss of information is imminent (if values after decimal +are not zero for even one value in \code{obj}) +} +\author{ +James Arnold +} diff --git a/tests/testthat.R b/tests/testthat.R index f4c002a..40e5bd8 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,4 +1,13 @@ -library(testthat) -library(popEpi) -test_check("popEpi") - +library("testthat") +library("popEpi") +library("data.table") + +using_r_devel <- grepl(pattern = "devel", x = R.version$status) +if (using_r_devel) { + ## memory leak problem in data.table 1.11.2 in R-devel (3.6.0 atm) + requireNamespace("data.table") + data.table::setDTthreads(threads = 1L) +} + +test_check("popEpi") + diff --git a/tests/testthat/test_aggre.R b/tests/testthat/test_aggre.R index a3d3963..640c911 100644 --- a/tests/testthat/test_aggre.R +++ b/tests/testthat/test_aggre.R @@ -1,227 +1,227 @@ -context("aggre") - -test_that("aggre leaves original data untouched", { - - x <- sire[1:100,] - BL <- list(fot= seq(0,20,1/12), age= c(0:100, Inf), per= c(1960:2014)) - x <- lexpand(x, birth = bi_date, entry = dg_date, exit = ex_date, - status = status %in% 1:2, breaks=BL) - - ## scramble order - set.seed(1L) - x <- x[sample(x = .N, size = .N, replace = FALSE)] - setkeyv(x, NULL) - - setDT(x) - forceLexisDT(x, breaks = BL, allScales = c("fot", "per", "age"), key = FALSE) - - xor <- copy(x) - - ag1 <- aggre(x, by = list(gender = factor(sex, 1, "f"), sex, surv.int = fot, per, agegr = age)) - - expect_identical(x, xor) -}) - -test_that("aggre works with by = NULL", { - - sr <- popEpi::sire[dg_date < ex_date,][1:1000,] - - BL <- list(fot= seq(0,20,1), age= c(0:100, Inf), per= c(1960:2014)) - x <- lexpand(sr, birth = bi_date, entry = dg_date, exit = ex_date, - status = status %in% 1:2, breaks=BL) - - ag1 <- aggre(x, by = NULL) - expect_equal(as.numeric(ag1), c(9539.1903286174274, 1000, 373, 627)) - -}) - -test_that("aggre and lexpand produce the same results", { - sr <- popEpi::sire[dg_date < ex_date,][1:1000,] - - BL <- list(fot= seq(0,20,1/12), age= c(0:100, Inf), per= c(1960:2014)) - x <- lexpand(sr, birth = bi_date, entry = dg_date, exit = ex_date, - status = status %in% 1:2, breaks=BL) - if (!is.data.table(x)) setDF2DT(x) - - e <- quote(list(gender = factor(sex, 1, "f"), sex, surv.int = fot, per, agegr = age)) - v <- c("gender", "sex", "sex", "surv.int", "per", "agegr") - - forceLexisDT(x, breaks = BL, allScales = c("fot", "per", "age")) - x2 <- aggre(x, by = e, verbose = FALSE) - x3 <- aggre(x, by = e, type = "full", verbose = FALSE) - x4 <- lexpand(sr, birth = bi_date, entry = dg_date, exit = ex_date, - status = status %in% 1:2, aggre.type = "non-empty", - breaks=BL, aggre = list(gender = factor(sex, 1, "f"), sex, surv.int = fot, per, agegr = age)) - x5 <- lexpand(sr, birth = bi_date, entry = dg_date, exit = ex_date, - status = status %in% 1:2, aggre.type = "cartesian", - breaks=BL, aggre = list(gender = factor(sex, 1, "f"), sex, surv.int = fot, per, agegr = age)) - - x[, fot := popEpi:::cutLow(fot, BL$fot)] - x[, age := popEpi:::cutLow(age, BL$age)] - x[, per := popEpi:::cutLow(per, BL$per)] - - x <- x[, list(pyrs = sum(lex.dur), obs = sum(lex.Xst)), keyby = e] - x <- x[pyrs > 0 & !is.na(pyrs)] - - if (!is.data.table(x2)) setDF2DT(x2) - if (!is.data.table(x3)) setDF2DT(x3) - if (!is.data.table(x4)) setDF2DT(x4) - if (!is.data.table(x5)) setDF2DT(x5) - - setkeyv(x, v) - setkeyv(x2, v) - setkeyv(x3, v) - setkeyv(x4, v) - setkeyv(x5, v) - - expect_equal(x2$pyrs, x$pyrs, tolerance = 1e-05) - expect_equal(x2$from0to1, x$obs, tolerance = 1e-05) - - expect_equal(sum(x2$pyrs), sum(x3$pyrs), tolerance = 1e-05) - expect_equal(sum(x2$from0to1), sum(x3$from0to1), tolerance = 1e-05) - - expect_equal(sum(x2$pyrs), sum(x4$pyrs), tolerance = 1e-05) - expect_equal(sum(x2$from0to1), sum(x4$from0to1), tolerance = 1e-05) - - expect_equal(x3$pyrs, x5$pyrs, tolerance = 1e-05) - expect_equal(x3$from0to0, x5$from0to0, tolerance = 1e-05) - expect_equal(sum(x3$from0to1), sum(x5$from0to1), tolerance = 1e-05) - - expect_equal(x2$pyrs, x4$pyrs, tolerance = 1e-05) - expect_equal(x2$from0to0, x4$from0to0, tolerance = 1e-05) - expect_equal(sum(x2$from0to1), sum(x4$from0to1), tolerance = 1e-05) -}) - - -test_that("aggre()'s by argument works flexibly", { - library(Epi) - BL <- list(fot = 0:5, per = c(1995,2015)) - for (cond in c(FALSE, TRUE)) { - x <- Lexis(data = sire[dg_date < ex_date,][1:500, ], entry = list(fot = 0, age = dg_age, per = get.yrs(dg_date)), - exit = list(per = get.yrs(ex_date)), exit.status = status, - entry.status = 0) - x <- splitMulti(x, breaks = BL) - setDF(x) - setattr(x, "class", c("Lexis", "data.frame")) - x$agegr <- cut(x$dg_age, 2) - if (cond) { - forceLexisDT(x, breaks = BL, allScales = c("fot", "per", "age")) - alloc.col(x) - } - - a <- aggre(x, by = list(agegr = cut(dg_age, 2), sex, fot, per = per), type = "unique") - b <- aggre(x, by = c("agegr", "sex", "fot", "per"), type = "unique") - - expect_equal(a, b) - - a <- aggre(x, by = cut(dg_age, 2), type = "unique") - setnames(a, "cut", "agegr") - attr(a, "aggre.meta")$by <- "agegr" - b <- aggre(x, by = c("agegr"), type = "unique") - c <- aggre(x, by = list(agegr = cut(dg_age, 2)), type = "unique") - d<- aggre(x, by = agegr, type = "unique") - - expect_equal(a, b) - expect_equal(b, c) - expect_equal(c, d) - } - - -}) - -test_that("subset argument works properly", { - - - x <- sire[dg_date < ex_date, ][1:1000,] - BL <- list(fot= seq(0,20,1/12), age= c(0:100, Inf), per= c(1960:2014)) - x <- lexpand(x, birth = bi_date, entry = dg_date, exit = ex_date, - status = status %in% 1:2, breaks=BL) - # setDT2DF(x) - x2 <- x[x$dg_age <= 55L, ] - - setDT(x) - setDT(x2) - forceLexisDT(x, breaks = BL, allScales = c("fot", "per", "age"), key = FALSE) - forceLexisDT(x2, breaks = BL, allScales = c("fot", "per", "age"), key = FALSE) - - ag <- quote(list(gender = factor(sex, 1, "f"), sex, surv.int = fot, per, agegr = age)) - ag1 <- aggre(x, by = ag, subset = dg_age <= 55L) - ag2 <- aggre(x2, by = ag) - - ag3 <- aggre(x, by = ag, type = "full", subset = dg_age <= 55L) - ag4 <- aggre(x2, by = ag, type = "full") - - expect_identical(ag1, ag2) - expect_identical(ag3, ag4) - -}) - - -test_that("at.risk column works as intended", { - ## normal case - no late entry. Just lots of breaks. - skip_on_cran() - x <- sire[dg_date < ex_date, ][1:1000,] - BL <- list(fot= seq(0,20,1/12), age= c(0:100, Inf), per= c(1960:2014)) - - x <- Lexis(data = x, - entry = list(fot = 0, age = dg_age, per = get.yrs(dg_date)), - exit = list(per = get.yrs(ex_date)), exit.status = status, - entry.status = 0) - - x <- splitMulti(x, breaks = BL, drop = TRUE) - - ag <- aggre(x, by = list(sex, fot)) - setkey(ag, sex, fot) - - ## total events and changes in at.risk should be congruent here - ag[, ndiff := at.risk - c(at.risk[-1], NA), by = list(sex)] - ag[!is.na(ndiff), events := from0to0 + from0to1 + from0to2] - - expect_equal(ag$ndiff, ag$events) - - ## compare at.risk with manually computed at.risk and events - x[, evented := detectEvents(x, breaks = attr(x, "breaks"), by = "lex.id") != 0L] - x[, normalEntry := fot %in% BL$fot] - x[, cutFot := cutLow(fot, BL$fot)] - byDT <- CJ(sex = 1, - cutFot = BL$fot[-length(BL$fot)]) - n.start <- x[byDT, .(sum(normalEntry & !duplicated(lex.id)), - sum(evented)), by = .EACHI, - on = names(byDT)] - n.start[is.na(ag$ndiff), V2 := NA] - expect_equal(ag$at.risk, n.start$V1) - expect_equal(ag$ndiff, n.start$V2) -}) - - - -test_that("at.risk column works as intended, Vol. 2", { - skip_on_cran() - ## period analysis case - some observations are late entry. - data(sire) - - BL <- list(fot=seq(0, 5, by = 1/12), - per = c(2008,2013)) - - x <- Lexis(data = sire[dg_date < ex_date,], - entry = list(fot = 0, age = dg_age, per = get.yrs(dg_date)), - exit = list(per = get.yrs(ex_date)), exit.status = status, - entry.status = 0) - - x <- splitMulti(x, breaks = BL, drop = TRUE) - - a <- aggre(x, by = list(sex, per, fot)) - setkey(a, sex, per, fot) - a[, ndiff := at.risk - c(at.risk[-1], NA), by = list(sex, per)] - a[!is.na(ndiff), events := from0to0 + from0to1 + from0to2] - - x[, normalEntry := fot %in% BL$fot] - x[, cutPer := cutLow(per, BL$per)] - x[, cutFot := cutLow(fot, BL$fot)] - byDT <- CJ(sex = 1, cutPer = BL$per[-length(BL$per)], - cutFot = BL$fot[-length(BL$fot)]) - n.start <- x[byDT, sum(normalEntry & !duplicated(lex.id)), by = .EACHI, - on = names(byDT)] - - expect_equal(a$at.risk, n.start$V1) -}) +context("aggre") + +test_that("aggre leaves original data untouched", { + + x <- sire[1:100,] + BL <- list(fot= seq(0,20,1/12), age= c(0:100, Inf), per= c(1960:2014)) + x <- lexpand(x, birth = bi_date, entry = dg_date, exit = ex_date, + status = status %in% 1:2, breaks=BL) + + ## scramble order + set.seed(1L) + x <- x[sample(x = .N, size = .N, replace = FALSE)] + setkeyv(x, NULL) + + setDT(x) + forceLexisDT(x, breaks = BL, allScales = c("fot", "per", "age"), key = FALSE) + + xor <- copy(x) + + ag1 <- aggre(x, by = list(gender = factor(sex, 1, "f"), sex, surv.int = fot, per, agegr = age)) + + expect_identical(x, xor) +}) + +test_that("aggre works with by = NULL", { + + sr <- popEpi::sire[dg_date < ex_date,][1:1000,] + + BL <- list(fot= seq(0,20,1), age= c(0:100, Inf), per= c(1960:2014)) + x <- lexpand(sr, birth = bi_date, entry = dg_date, exit = ex_date, + status = status %in% 1:2, breaks=BL) + + ag1 <- aggre(x, by = NULL) + expect_equal(as.numeric(ag1), c(9539.1903286174274, 1000, 373, 627)) + +}) + +test_that("aggre and lexpand produce the same results", { + sr <- popEpi::sire[dg_date < ex_date,][1:1000,] + + BL <- list(fot= seq(0,20,1/12), age= c(0:100, Inf), per= c(1960:2014)) + x <- lexpand(sr, birth = bi_date, entry = dg_date, exit = ex_date, + status = status %in% 1:2, breaks=BL) + if (!is.data.table(x)) setDF2DT(x) + + e <- quote(list(gender = factor(sex, 1, "f"), sex, surv.int = fot, per, agegr = age)) + v <- c("gender", "sex", "sex", "surv.int", "per", "agegr") + + forceLexisDT(x, breaks = BL, allScales = c("fot", "per", "age")) + x2 <- aggre(x, by = e, verbose = FALSE) + x3 <- aggre(x, by = e, type = "full", verbose = FALSE) + x4 <- lexpand(sr, birth = bi_date, entry = dg_date, exit = ex_date, + status = status %in% 1:2, aggre.type = "non-empty", + breaks=BL, aggre = list(gender = factor(sex, 1, "f"), sex, surv.int = fot, per, agegr = age)) + x5 <- lexpand(sr, birth = bi_date, entry = dg_date, exit = ex_date, + status = status %in% 1:2, aggre.type = "cartesian", + breaks=BL, aggre = list(gender = factor(sex, 1, "f"), sex, surv.int = fot, per, agegr = age)) + + x[, fot := popEpi:::cutLow(fot, BL$fot)] + x[, age := popEpi:::cutLow(age, BL$age)] + x[, per := popEpi:::cutLow(per, BL$per)] + + x <- x[, list(pyrs = sum(lex.dur), obs = sum(lex.Xst)), keyby = e] + x <- x[pyrs > 0 & !is.na(pyrs)] + + if (!is.data.table(x2)) setDF2DT(x2) + if (!is.data.table(x3)) setDF2DT(x3) + if (!is.data.table(x4)) setDF2DT(x4) + if (!is.data.table(x5)) setDF2DT(x5) + + setkeyv(x, v) + setkeyv(x2, v) + setkeyv(x3, v) + setkeyv(x4, v) + setkeyv(x5, v) + + expect_equal(x2$pyrs, x$pyrs, tolerance = 1e-05) + expect_equal(x2$from0to1, x$obs, tolerance = 1e-05) + + expect_equal(sum(x2$pyrs), sum(x3$pyrs), tolerance = 1e-05) + expect_equal(sum(x2$from0to1), sum(x3$from0to1), tolerance = 1e-05) + + expect_equal(sum(x2$pyrs), sum(x4$pyrs), tolerance = 1e-05) + expect_equal(sum(x2$from0to1), sum(x4$from0to1), tolerance = 1e-05) + + expect_equal(x3$pyrs, x5$pyrs, tolerance = 1e-05) + expect_equal(x3$from0to0, x5$from0to0, tolerance = 1e-05) + expect_equal(sum(x3$from0to1), sum(x5$from0to1), tolerance = 1e-05) + + expect_equal(x2$pyrs, x4$pyrs, tolerance = 1e-05) + expect_equal(x2$from0to0, x4$from0to0, tolerance = 1e-05) + expect_equal(sum(x2$from0to1), sum(x4$from0to1), tolerance = 1e-05) +}) + + +test_that("aggre()'s by argument works flexibly", { + library(Epi) + BL <- list(fot = 0:5, per = c(1995,2015)) + for (cond in c(FALSE, TRUE)) { + x <- Lexis(data = sire[dg_date < ex_date,][1:500, ], entry = list(fot = 0, age = dg_age, per = get.yrs(dg_date)), + exit = list(per = get.yrs(ex_date)), exit.status = status, + entry.status = 0) + x <- splitMulti(x, breaks = BL) + setDF(x) + setattr(x, "class", c("Lexis", "data.frame")) + x$agegr <- cut(x$dg_age, 2) + if (cond) { + forceLexisDT(x, breaks = BL, allScales = c("fot", "per", "age")) + alloc.col(x) + } + + a <- aggre(x, by = list(agegr = cut(dg_age, 2), sex, fot, per = per), type = "unique") + b <- aggre(x, by = c("agegr", "sex", "fot", "per"), type = "unique") + + expect_equal(a, b) + + a <- aggre(x, by = cut(dg_age, 2), type = "unique") + setnames(a, "cut", "agegr") + attr(a, "aggre.meta")$by <- "agegr" + b <- aggre(x, by = c("agegr"), type = "unique") + c <- aggre(x, by = list(agegr = cut(dg_age, 2)), type = "unique") + d<- aggre(x, by = agegr, type = "unique") + + expect_equal(a, b) + expect_equal(b, c) + expect_equal(c, d) + } + + +}) + +test_that("subset argument works properly", { + + + x <- sire[dg_date < ex_date, ][1:1000,] + BL <- list(fot= seq(0,20,1/12), age= c(0:100, Inf), per= c(1960:2014)) + x <- lexpand(x, birth = bi_date, entry = dg_date, exit = ex_date, + status = status %in% 1:2, breaks=BL) + # setDT2DF(x) + x2 <- x[x$dg_age <= 55L, ] + + setDT(x) + setDT(x2) + forceLexisDT(x, breaks = BL, allScales = c("fot", "per", "age"), key = FALSE) + forceLexisDT(x2, breaks = BL, allScales = c("fot", "per", "age"), key = FALSE) + + ag <- quote(list(gender = factor(sex, 1, "f"), sex, surv.int = fot, per, agegr = age)) + ag1 <- aggre(x, by = ag, subset = dg_age <= 55L) + ag2 <- aggre(x2, by = ag) + + ag3 <- aggre(x, by = ag, type = "full", subset = dg_age <= 55L) + ag4 <- aggre(x2, by = ag, type = "full") + + expect_identical(ag1, ag2) + expect_identical(ag3, ag4) + +}) + + +test_that("at.risk column works as intended", { + ## normal case - no late entry. Just lots of breaks. + skip_on_cran() + x <- sire[dg_date < ex_date, ][1:1000,] + BL <- list(fot= seq(0,20,1/12), age= c(0:100, Inf), per= c(1960:2014)) + + x <- Lexis(data = x, + entry = list(fot = 0, age = dg_age, per = get.yrs(dg_date)), + exit = list(per = get.yrs(ex_date)), exit.status = status, + entry.status = 0) + + x <- splitMulti(x, breaks = BL, drop = TRUE) + + ag <- aggre(x, by = list(sex, fot)) + setkey(ag, sex, fot) + + ## total events and changes in at.risk should be congruent here + ag[, ndiff := at.risk - c(at.risk[-1], NA), by = list(sex)] + ag[!is.na(ndiff), events := from0to0 + from0to1 + from0to2] + + expect_equal(ag$ndiff, ag$events) + + ## compare at.risk with manually computed at.risk and events + x[, evented := detectEvents(x, breaks = attr(x, "breaks"), by = "lex.id") != 0L] + x[, normalEntry := fot %in% BL$fot] + x[, cutFot := cutLow(fot, BL$fot)] + byDT <- CJ(sex = 1, + cutFot = BL$fot[-length(BL$fot)]) + n.start <- x[byDT, .(sum(normalEntry & !duplicated(lex.id)), + sum(evented)), by = .EACHI, + on = names(byDT)] + n.start[is.na(ag$ndiff), V2 := NA] + expect_equal(ag$at.risk, n.start$V1) + expect_equal(ag$ndiff, n.start$V2) +}) + + + +test_that("at.risk column works as intended, Vol. 2", { + skip_on_cran() + ## period analysis case - some observations are late entry. + data(sire) + + BL <- list(fot=seq(0, 5, by = 1/12), + per = c(2008,2013)) + + x <- Lexis(data = sire[dg_date < ex_date,], + entry = list(fot = 0, age = dg_age, per = get.yrs(dg_date)), + exit = list(per = get.yrs(ex_date)), exit.status = status, + entry.status = 0) + + x <- splitMulti(x, breaks = BL, drop = TRUE) + + a <- aggre(x, by = list(sex, per, fot)) + setkey(a, sex, per, fot) + a[, ndiff := at.risk - c(at.risk[-1], NA), by = list(sex, per)] + a[!is.na(ndiff), events := from0to0 + from0to1 + from0to2] + + x[, normalEntry := fot %in% BL$fot] + x[, cutPer := cutLow(per, BL$per)] + x[, cutFot := cutLow(fot, BL$fot)] + byDT <- CJ(sex = 1, cutPer = BL$per[-length(BL$per)], + cutFot = BL$fot[-length(BL$fot)]) + n.start <- x[byDT, sum(normalEntry & !duplicated(lex.id)), by = .EACHI, + on = names(byDT)] + + expect_equal(a$at.risk, n.start$V1) +}) diff --git a/tests/testthat/test_epi.R b/tests/testthat/test_epi.R index bf34b9c..b9cad52 100644 --- a/tests/testthat/test_epi.R +++ b/tests/testthat/test_epi.R @@ -1,34 +1,34 @@ -context("Epi subsetting methods OK") - - - - - -test_that("[.Epi exists and works", { - - xcoh <- structure( list( id = c("A", "B", "C"), - birth = c("14/07/1952", "01/04/1954", "10/06/1987"), - entry = c("04/08/1965", "08/09/1972", "23/12/1991"), - exit = c("27/06/1997", "23/05/1995", "24/07/1998"), - fail = c(1, 0, 1) ), - .Names = c("id", "birth", "entry", "exit", "fail"), - row.names = c("1", "2", "3"), - class = "data.frame" ) - - - xcoh <- cal.yr( xcoh, format="%d/%m/%Y", wh=2:4 ) - - Lcoh <- Lexis( entry = list( per=entry ), - exit = list( per=exit, age=exit-birth ), - exit.status = fail, - data = xcoh ) - - e1 <- subset(Lcoh, fail == 1) - e2 <- Lcoh[Lcoh$fail == 1, ] - - expect_identical(e1, e2) -}) - - - - +context("Epi subsetting methods OK") + + + + + +test_that("[.Epi exists and works", { + + xcoh <- structure( list( id = c("A", "B", "C"), + birth = c("14/07/1952", "01/04/1954", "10/06/1987"), + entry = c("04/08/1965", "08/09/1972", "23/12/1991"), + exit = c("27/06/1997", "23/05/1995", "24/07/1998"), + fail = c(1, 0, 1) ), + .Names = c("id", "birth", "entry", "exit", "fail"), + row.names = c("1", "2", "3"), + class = "data.frame" ) + + + xcoh <- cal.yr( xcoh, format="%d/%m/%Y", wh=2:4 ) + + Lcoh <- Lexis( entry = list( per=entry ), + exit = list( per=exit, age=exit-birth ), + exit.status = fail, + data = xcoh ) + + e1 <- subset(Lcoh, fail == 1) + e2 <- Lcoh[Lcoh$fail == 1, ] + + expect_identical(e1, e2) +}) + + + + diff --git a/tests/testthat/test_expo.R b/tests/testthat/test_expo.R index d8b7984..d883d3e 100644 --- a/tests/testthat/test_expo.R +++ b/tests/testthat/test_expo.R @@ -1,47 +1,47 @@ -context("Testing aggregation by categories of exposure") - -test_that("prepExpo works in the simple case", { - skip_on_cran() - library(Epi) - - df <- data.frame(id = "A", birth = c(1952.4534), - entry = c(1965.4746, 1972.42845, 1991.78643), - exit = c(1968.56346, 1979.32478, 1997.32432), fail = 0) - - # Define as Lexis object with timescales calendar time and age - x <- Lexis( entry = list(work = 0, per=entry ), - exit = list( per=exit, age=exit-birth ), - exit.status = fail, id = id, - data = df ) - - x2 <- prepExpo(x, freezeScales = "work", - cutScale = "per", - entry = 1964, - exit = 2012, by = "lex.id") - cd <- cumsum(x$lex.dur) - exp_work <- c(0, 0, cd[1], cd[1], cd[2], cd[2], cd[3]) - expect_equal(x2$work, exp_work) - expect_equal(x2$per, 1964+c(0,cumsum(x2$lex.dur)[-nrow(x2)])) - - BL <- list(work = 0:50, age = c(0,18,Inf), per = 1963:2014) - x2 <- prepExpo(x, freezeScales = "work", - cutScale = "per", - entry = 1964, - # verbose = TRUE, - exit = 2012, by = "lex.id", - breaks = BL) - ag <- aggre(x2, by = list(lex.id, per, age)) - - xx <- Lexis(entry = list(per = 1964, age = 1964-birth), exit = list(per=2012), data = df[1,]) - ag2 <- splitMulti(xx, breaks = BL[c("per","age")]) - ag2 <- aggre(ag2, by = list(lex.id, per, age)) - - setkeyv(ag, c("lex.id","per")) - setkeyv(ag2, c("lex.id","per")) - - expect_equal(ag$pyrs, ag2$pyrs) - - - - -}) +context("Testing aggregation by categories of exposure") + +test_that("prepExpo works in the simple case", { + skip_on_cran() + library(Epi) + + df <- data.frame(id = "A", birth = c(1952.4534), + entry = c(1965.4746, 1972.42845, 1991.78643), + exit = c(1968.56346, 1979.32478, 1997.32432), fail = 0) + + # Define as Lexis object with timescales calendar time and age + x <- Lexis( entry = list(work = 0, per=entry ), + exit = list( per=exit, age=exit-birth ), + exit.status = fail, id = id, + data = df ) + + x2 <- prepExpo(x, freezeScales = "work", + cutScale = "per", + entry = 1964, + exit = 2012, by = "lex.id") + cd <- cumsum(x$lex.dur) + exp_work <- c(0, 0, cd[1], cd[1], cd[2], cd[2], cd[3]) + expect_equal(x2$work, exp_work) + expect_equal(x2$per, 1964+c(0,cumsum(x2$lex.dur)[-nrow(x2)])) + + BL <- list(work = 0:50, age = c(0,18,Inf), per = 1963:2014) + x2 <- prepExpo(x, freezeScales = "work", + cutScale = "per", + entry = 1964, + # verbose = TRUE, + exit = 2012, by = "lex.id", + breaks = BL) + ag <- aggre(x2, by = list(lex.id, per, age)) + + xx <- Lexis(entry = list(per = 1964, age = 1964-birth), exit = list(per=2012), data = df[1,]) + ag2 <- splitMulti(xx, breaks = BL[c("per","age")]) + ag2 <- aggre(ag2, by = list(lex.id, per, age)) + + setkeyv(ag, c("lex.id","per")) + setkeyv(ag2, c("lex.id","per")) + + expect_equal(ag$pyrs, ag2$pyrs) + + + + +}) diff --git a/tests/testthat/test_lexpand.R b/tests/testthat/test_lexpand.R index 2d0fee6..3efcf94 100644 --- a/tests/testthat/test_lexpand.R +++ b/tests/testthat/test_lexpand.R @@ -1,372 +1,372 @@ -context("lexpand sanity checks") - - -test_that("lexpand arguments can be passed as symbol, expression, character name of variable, and symbol of a character variable", { - skip_on_cran() - sr <- copy(sire)[dg_date < ex_date, ][1:100,] - sr[, id := as.character(1:.N)] - - x <- lexpand(sr, fot = c(0, Inf), - birth = "bi_date", entry = dg_date, exit = "ex_date", - status = status %in% 1:2, id = "id") - - x2 <- lexpand(sr, fot = c(0, Inf), - birth = bi_date, entry = "dg_date", exit = ex_date, - status = status %in% 1:2, id = id) - - - x3 <- lexpand(sr, fot = c(0, Inf), - birth = bi_date, entry = dg_date, exit = ex_date, - status = status %in% 1:2, id = id) - - expect_identical(x, x2) - expect_identical(x, x3) -}) - - - -test_that("original total pyrs equals pyrs after splitting w/ large number of breaks", { - skip_on_cran() - x <- copy(sire)[dg_date < ex_date, ] - x[, fot := get.yrs(ex_date, year.length = "actual") - get.yrs(dg_date, year.length = "actual")] - totpyrs <- x[, sum(fot)] - - x <- lexpand(sire, birth = bi_date, entry = dg_date, exit = ex_date, - status = status %in% 1:2, - breaks=list(fot= seq(0,20,1/12), age= c(0:100, Inf), per= c(1960:2014))) - setDT(x) - totpyrs_split <- x[, sum(lex.dur)] - - expect_equal(totpyrs, totpyrs_split, tolerance = 1e-05) -}) - - - -test_that("pp not added to data if pp = FALSE but pop.haz is", { - x <- lexpand(sire[dg_date < ex_date, ][0:100], - birth = bi_date, entry = dg_date, exit = ex_date, - status = status %in% 1:2, - breaks=list(fot=0:5), - pophaz=data.table(popEpi::popmort), - pp = FALSE) - expect_equal(intersect(names(x), c("pp", "pop.haz")), "pop.haz") - expect_true(!any(is.na(x$pop.haz))) -}) - - - -test_that("lexpand produces the same results with internal/external dropping", { - skip_usually() - x <- lexpand(sire[dg_date < ex_date, ], - birth = bi_date, entry = dg_date, exit = ex_date, - status = status %in% 1:2, - breaks=list(fot=0:5), pophaz=data.table(popEpi::popmort), - pp = TRUE, drop = TRUE) - x2 <-lexpand(sire[dg_date < ex_date, ], - birth = bi_date, entry = dg_date, exit = ex_date, - status = status %in% 1:2, - breaks=list(fot=0:5), pophaz=data.table(popEpi::popmort), - pp = TRUE, drop = FALSE) - x2 <-popEpi:::intelliDrop(x2, breaks = list(fot=0:5), dropNegDur = TRUE) - setDT(x) - setDT(x2) - popEpi:::doTestBarrage(dt1 = x, dt2 = x2, allScales = c("fot", "per", "age")) -}) - - -test_that("lexpanding with aggre.type = 'unique' works", { - skip_usually() - - BL <- list(fot = 0:5, age = seq(0,100, 5)) - ag1 <- lexpand(sire[dg_date < ex_date, ], - breaks = BL, status = status, - birth = bi_date, entry = dg_date, exit = ex_date) - setDT(ag1) - ag1 <- ag1[, list(pyrs = sum(lex.dur), from0to1 = sum(lex.Xst == 1L)), - keyby = list(fot = popEpi:::cutLow(fot, BL$fot), - age = popEpi:::cutLow(age, BL$age))] - ag2 <- lexpand(sire[dg_date < ex_date, ], - breaks = BL, status = status, - birth = bi_date, entry = dg_date, exit = ex_date, - aggre = list(fot, age), aggre.type = "unique") - setDT(ag2) - expect_equal(ag1$pyrs, ag2$pyrs) - expect_equal(ag1$from0to1, ag2$from0to1) - -}) - -test_that("lexpanding with aggre.type = 'cartesian' works; no time scales used", { - skip_usually() - - BL <- list(fot = c(0,Inf)) - ag1 <- lexpand(sire[dg_date < ex_date, ], - breaks = BL, status = status, entry.status = 0L, - birth = bi_date, entry = dg_date, exit = ex_date) - setDT(ag1) - forceLexisDT(ag1, breaks = BL, allScales = c("fot", "per", "age")) - - e <- quote(list(sex = factor(sex, 0:1, c("m", "f")), - period = cut(get.yrs(dg_date), get.yrs(as.Date(paste0(seq(1970, 2015, 5), "-01-01")))))) - ag1[, c("sex", "period") := eval(e)] - ceejay <- do.call(CJ, lapply(ag1[, list(sex, period)], function(x) {if (is.factor(x)) levels(x) else unique(x)})) - setkey(ceejay, sex, period); setkey(ag1, sex, period) - ag1 <- ag1[ceejay, list(pyrs = sum(lex.dur), - from0to1 = sum(lex.Xst == 1L)), by = .EACHI] - ag1[is.na(pyrs), pyrs := 0] - ag1[is.na(from0to1), from0to1 := 0] - - ag2 <- lexpand(sire[dg_date < ex_date, ], - breaks = BL, - status = status, entry.status = 0L, - birth = bi_date, entry = dg_date, exit = ex_date, - aggre = list(sex = factor(sex, 0:1, c("m", "f")), - period = cut(get.yrs(dg_date), get.yrs(as.Date(paste0(seq(1970, 2015, 5), "-01-01"))))), - aggre.type = "cartesian") - - setDT(ag2) - setkeyv(ag1, c("sex", "period")) - setkeyv(ag2, c("sex", "period")) - expect_equal(sum(ag1$pyrs), sum(ag2$pyrs)) - expect_equal(sum(ag1$from0to1), sum(ag2$from0to1)) - expect_equal(ag1$pyrs, ag2$pyrs) - expect_equal(ag1$from0to1, ag2$from0to1) - -}) - -test_that("lexpanding with aggre.type = 'cartesian' works; only time scales used", { - skip_usually() - - BL <- list(fot = 0:5, age = seq(0,100, 5)) - ag1 <- lexpand(sire[dg_date < ex_date, ], - breaks = BL, status = status, entry.status = 0L, - birth = bi_date, entry = dg_date, exit = ex_date) - setDT(ag1) - forceLexisDT(ag1, breaks = BL, allScales = c("fot", "per", "age")) - - ag3 <- aggre(ag1, by = list(fot, age), type = "cartesian") - setDT(ag3) - - ag4 <- aggre(ag1, by = list(fot, age), type = "unique") - setDT(ag4) - - ag1[, `:=`(fot = try2int(popEpi:::cutLow(fot, c(BL$fot, Inf))), - age = try2int(popEpi:::cutLow(age, c(BL$age, Inf))))] - ceejay <- do.call(CJ, lapply(BL, function(x) x[-length(x)])) - setkey(ceejay, fot, age); setkey(ag1, fot, age) - ag1 <- ag1[ceejay, list(pyrs = sum(lex.dur), - from0to1 = sum(lex.Xst == 1L)), by = .EACHI] - ag1[is.na(pyrs), pyrs := 0] - ag1[is.na(from0to1), from0to1 := 0] - - ag2 <- lexpand(sire[dg_date < ex_date, ], - breaks = list(fot = 0:5, age = seq(0,100, 5)), - status = status, entry.status = 0L, - birth = bi_date, entry = dg_date, exit = ex_date, - aggre = list(fot, age), aggre.type = "cartesian") - - setDT(ag2) - setkeyv(ag1, c("fot", "age")) - setkeyv(ag2, c("fot", "age")) - setkeyv(ag3, c("fot", "age")) - expect_equal(sum(ag1$pyrs), sum(ag3$pyrs)) - expect_equal(sum(ag1$from0to1), sum(ag3$from0to1)) - expect_equal(ag1$pyrs, ag3$pyrs) - expect_equal(ag1$from0to1, ag3$from0to1) - - expect_equal(sum(ag1$pyrs), sum(ag2$pyrs)) - expect_equal(sum(ag1$from0to1), sum(ag2$from0to1)) - expect_equal(ag1$pyrs, ag2$pyrs) - expect_equal(ag1$from0to1, ag2$from0to1) - -}) - - -test_that("lexpanding and aggregating to years works", { - ag1 <- lexpand(sire[dg_date < ex_date, ], - breaks = list(per=2000:2014), status = status, - birth = bi_date, entry = dg_date, exit = ex_date) - setDT(ag1) - ag1[, `:=`(per = as.integer(popEpi:::cutLow(per, 2000:2014)))] - ag1 <- ag1[, list(pyrs = sum(lex.dur), from0to1 = sum(lex.Xst == 1L)), keyby = per] - - ag2 <- lexpand(sire[dg_date < ex_date, ], - breaks = list(per = 2000:2014), status = status, - birth = bi_date, entry = dg_date, exit = ex_date, - aggre = list(per), aggre.type = "unique") - setDT(ag2) - ag3 <- lexpand(sire[dg_date < ex_date, ], - breaks = list(per = 2000:2014, age = c(seq(0,100,5),Inf), fot = c(0:10, Inf)), - status = status, - birth = bi_date, entry = dg_date, exit = ex_date, - aggre = list(y = per), aggre.type = "unique") - setDT(ag3) - expect_equal(ag1$pyrs, ag2$pyrs) - expect_equal(ag1$from0to1, ag2$from0to1) - expect_equal(ag1$pyrs, ag3$pyrs) - expect_equal(ag1$from0to1, ag3$from0to1) - -}) - -# Aggre check (to totpyrs) ----------------------------------------------------- - -test_that("lexpand aggre produces correct results", { - skip_on_cran() - x <- copy(sire)[dg_date < ex_date, ] - x[, fot := get.yrs(ex_date, year.length = "actual") - get.yrs(dg_date, year.length = "actual")] - totpyrs <- x[, sum(fot)] - counts <- x[, .N, by = .(status)] - - x <- lexpand(sire[dg_date < ex_date, ], - birth = bi_date, entry = dg_date, exit = ex_date, - breaks=list(fot=c(0,5,10,50,Inf), age=c(seq(0,85,5),Inf), per = 1993:2013), - status=status, aggre = list(fot, age, per)) - setDT(x) - row_length <- x[,list( length(unique(age)), length(unique(per)), length(unique(fot)))] - - expect_equal( x[,sum(pyrs)], totpyrs, tolerance = 0.001) - expect_equal( x[,sum(from0to0)], counts[1,N]) - expect_equal( x[,sum(from0to1)], counts[2,N]) - expect_equal( x[,sum(from0to2)], counts[3,N]) - #expect_equal( prod(row_length), x[,.N]) -}) - -test_that('lexpand aggre: multistate column names correct', { - - x <- lexpand(sire[dg_date < ex_date, ][0:100], - birth = bi_date, entry = dg_date, exit = ex_date, - breaks=list(fot=c(0,5,10,50,Inf), age=c(seq(0,85,5),Inf), - per = 1993:2013), - status=status, aggre = list(fot, age, per)) - - expect_equal(intersect(names(x), c('from0to0','from0to1','from0to2')), - c('from0to0','from0to1','from0to2')) -}) - - -# overlapping time lines -------------------------------------------------- - -test_that('lexpansion w/ overlapping = TRUE/FALSE produces double/undoubled pyrs', { - skip_usually() - - sire2 <- copy(sire)[dg_date < ex_date, ][1:100] - sire2[, dg_yrs := get.yrs(dg_date, "actual")] - sire2[, ex_yrs := get.yrs(ex_date, "actual")] - sire2[, bi_yrs := get.yrs(bi_date, "actual")] - sire2[, id := 1:.N] - sire2 <- sire2[rep(1:.N, each=2)] - - sire2[seq(2,.N, by=2), dg_yrs := (ex_yrs + dg_yrs)/2L] - sire2[, dg_age := dg_yrs-bi_yrs] - - x <- lexpand(sire2, birth = "bi_yrs", entry = "bi_yrs", event="dg_yrs", - exit = "ex_yrs", status="status", entry.status = 0L, id = "id", - overlapping = TRUE) - setDT(x) - expect_equal(x[, sum(lex.dur), keyby=lex.id]$V1, sire2[, sum(ex_yrs-bi_yrs), keyby=id]$V1) - - x <- lexpand(sire2, birth = "bi_yrs", entry = "bi_yrs", event="dg_yrs", - exit = "ex_yrs", status="status", entry.status = 0L, id = "id", - overlapping = FALSE) - setDT(x) - expect_equal(x[, sum(lex.dur), keyby=lex.id]$V1, - sire2[!duplicated(id), sum(ex_yrs-bi_yrs), keyby=id]$V1) -}) - - - -test_that("different specifications of time vars work with event defined and overlapping=FALSE", { - - dt <- data.table(bi_date = as.Date('1949-01-01'), - dg_date = as.Date(paste0(1999:2000, "-01-01")), - start = as.Date("1997-01-01"), - end = as.Date('2002-01-01'), - status = c(1,2), id=1) - - ## birth -> entry -> event -> exit - x1 <- lexpand(data = dt, subset = NULL, - birth = bi_date, entry = start, exit = end, event = dg_date, - id = id, overlapping = FALSE, entry.status = 0, status = status, - merge = FALSE) - expect_equal(x1$lex.dur, c(2,1,2)) - expect_equal(x1$age, c(48,50,51)) - expect_equal(x1$lex.Cst, 0:2) - expect_equal(x1$lex.Xst, c(1,2,2)) - - ## birth -> entry = event -> exit - expect_error( - lexpand(data = dt, subset = NULL, - birth = bi_date, entry = dg_date, exit = end, event = dg_date, - id = id, overlapping = FALSE, entry.status = 0, status = status, - merge = FALSE), - regexp = paste0("some rows have simultaneous 'entry' and 'event', ", - "which is not supported with overlapping = FALSE; ", - "perhaps separate them by one day?") - ) - - ## birth = entry -> event -> exit - x3 <- lexpand(data = dt, subset = NULL, - birth = bi_date, entry = bi_date, exit = end, event = dg_date, - id = id, overlapping = FALSE, entry.status = 0, status = status, - merge = FALSE) - expect_equal(x3$lex.dur, c(50,1,2)) - expect_equal(x3$age, c(0,50,51)) - expect_equal(x3$lex.Cst, 0:2) - expect_equal(x3$lex.Xst, c(1,2,2)) - - ## birth -> entry -> event = exit - expect_error( - lexpand(data = dt, subset = NULL, - birth = bi_date, entry = dg_date, exit = end, event = end, - id = id, overlapping = FALSE, entry.status = 0, status = status, - merge = FALSE), - regexp = paste0("subject\\(s\\) defined by lex.id had several rows ", - "where 'event' time had the same value, which is not ", - "supported with overlapping = FALSE; perhaps separate ", - "them by one day?") - ) - - ## birth = entry -> event -> exit - x6 <- lexpand(data = dt, subset = NULL, - birth = bi_date, entry = bi_date, exit = end, event = dg_date, - id = id, overlapping = FALSE, entry.status = 0, status = status, - merge = FALSE) - expect_equal(x6$lex.dur, c(50,1,2)) - expect_equal(x6$age, c(0,50,51)) - expect_equal(x6$lex.Cst, 0:2) - expect_equal(x6$lex.Xst, c(1,2,2)) - -}) - - -test_that("lexpand drops persons outside breaks window correctly", { - skip_usually() - - dt <- data.table(bi_date = as.Date('1949-01-01'), - dg_date = as.Date(paste0(2000, "-01-01")), - start = as.Date("1997-01-01"), - end = as.Date('2002-01-01'), - status = c(2), id=1) - - ## by age - x1 <- lexpand(data = dt, subset = NULL, - birth = bi_date, entry = start, exit = end, event = dg_date, - id = id, overlapping = FALSE, entry.status = 0, status = status, - merge = FALSE, breaks = list(age = 50:55)) - expect_equal(x1$age, c(50, 51, 52)) - - ## by period - x1 <- lexpand(data = dt, subset = NULL, - birth = bi_date, entry = start, exit = end, event = dg_date, - id = id, overlapping = FALSE, entry.status = 0, status = status, - merge = FALSE, breaks = list(per = 2000:2005)) - expect_equal(x1$per, c(2000, 2001)) - - - ## by fot - x1 <- lexpand(data = dt, subset = NULL, - birth = bi_date, entry = start, exit = end, event = dg_date, - id = id, overlapping = FALSE, entry.status = 0, status = status, - merge = FALSE, breaks = list(fot = 2:5)) - expect_equal(x1$fot, c(2, 3, 4)) -}) - +context("lexpand sanity checks") + + +test_that("lexpand arguments can be passed as symbol, expression, character name of variable, and symbol of a character variable", { + skip_on_cran() + sr <- copy(sire)[dg_date < ex_date, ][1:100,] + sr[, id := as.character(1:.N)] + + x <- lexpand(sr, fot = c(0, Inf), + birth = "bi_date", entry = dg_date, exit = "ex_date", + status = status %in% 1:2, id = "id") + + x2 <- lexpand(sr, fot = c(0, Inf), + birth = bi_date, entry = "dg_date", exit = ex_date, + status = status %in% 1:2, id = id) + + + x3 <- lexpand(sr, fot = c(0, Inf), + birth = bi_date, entry = dg_date, exit = ex_date, + status = status %in% 1:2, id = id) + + expect_identical(x, x2) + expect_identical(x, x3) +}) + + + +test_that("original total pyrs equals pyrs after splitting w/ large number of breaks", { + skip_on_cran() + x <- copy(sire)[dg_date < ex_date, ] + x[, fot := get.yrs(ex_date, year.length = "actual") - get.yrs(dg_date, year.length = "actual")] + totpyrs <- x[, sum(fot)] + + x <- lexpand(sire, birth = bi_date, entry = dg_date, exit = ex_date, + status = status %in% 1:2, + breaks=list(fot= seq(0,20,1/12), age= c(0:100, Inf), per= c(1960:2014))) + setDT(x) + totpyrs_split <- x[, sum(lex.dur)] + + expect_equal(totpyrs, totpyrs_split, tolerance = 1e-05) +}) + + + +test_that("pp not added to data if pp = FALSE but pop.haz is", { + x <- lexpand(sire[dg_date < ex_date, ][0:100], + birth = bi_date, entry = dg_date, exit = ex_date, + status = status %in% 1:2, + breaks=list(fot=0:5), + pophaz=data.table(popEpi::popmort), + pp = FALSE) + expect_equal(intersect(names(x), c("pp", "pop.haz")), "pop.haz") + expect_true(!any(is.na(x$pop.haz))) +}) + + + +test_that("lexpand produces the same results with internal/external dropping", { + skip_usually() + x <- lexpand(sire[dg_date < ex_date, ], + birth = bi_date, entry = dg_date, exit = ex_date, + status = status %in% 1:2, + breaks=list(fot=0:5), pophaz=data.table(popEpi::popmort), + pp = TRUE, drop = TRUE) + x2 <-lexpand(sire[dg_date < ex_date, ], + birth = bi_date, entry = dg_date, exit = ex_date, + status = status %in% 1:2, + breaks=list(fot=0:5), pophaz=data.table(popEpi::popmort), + pp = TRUE, drop = FALSE) + x2 <-popEpi:::intelliDrop(x2, breaks = list(fot=0:5), dropNegDur = TRUE) + setDT(x) + setDT(x2) + popEpi:::doTestBarrage(dt1 = x, dt2 = x2, allScales = c("fot", "per", "age")) +}) + + +test_that("lexpanding with aggre.type = 'unique' works", { + skip_usually() + + BL <- list(fot = 0:5, age = seq(0,100, 5)) + ag1 <- lexpand(sire[dg_date < ex_date, ], + breaks = BL, status = status, + birth = bi_date, entry = dg_date, exit = ex_date) + setDT(ag1) + ag1 <- ag1[, list(pyrs = sum(lex.dur), from0to1 = sum(lex.Xst == 1L)), + keyby = list(fot = popEpi:::cutLow(fot, BL$fot), + age = popEpi:::cutLow(age, BL$age))] + ag2 <- lexpand(sire[dg_date < ex_date, ], + breaks = BL, status = status, + birth = bi_date, entry = dg_date, exit = ex_date, + aggre = list(fot, age), aggre.type = "unique") + setDT(ag2) + expect_equal(ag1$pyrs, ag2$pyrs) + expect_equal(ag1$from0to1, ag2$from0to1) + +}) + +test_that("lexpanding with aggre.type = 'cartesian' works; no time scales used", { + skip_usually() + + BL <- list(fot = c(0,Inf)) + ag1 <- lexpand(sire[dg_date < ex_date, ], + breaks = BL, status = status, entry.status = 0L, + birth = bi_date, entry = dg_date, exit = ex_date) + setDT(ag1) + forceLexisDT(ag1, breaks = BL, allScales = c("fot", "per", "age")) + + e <- quote(list(sex = factor(sex, 0:1, c("m", "f")), + period = cut(get.yrs(dg_date), get.yrs(as.Date(paste0(seq(1970, 2015, 5), "-01-01")))))) + ag1[, c("sex", "period") := eval(e)] + ceejay <- do.call(CJ, lapply(ag1[, list(sex, period)], function(x) {if (is.factor(x)) levels(x) else unique(x)})) + setkey(ceejay, sex, period); setkey(ag1, sex, period) + ag1 <- ag1[ceejay, list(pyrs = sum(lex.dur), + from0to1 = sum(lex.Xst == 1L)), by = .EACHI] + ag1[is.na(pyrs), pyrs := 0] + ag1[is.na(from0to1), from0to1 := 0] + + ag2 <- lexpand(sire[dg_date < ex_date, ], + breaks = BL, + status = status, entry.status = 0L, + birth = bi_date, entry = dg_date, exit = ex_date, + aggre = list(sex = factor(sex, 0:1, c("m", "f")), + period = cut(get.yrs(dg_date), get.yrs(as.Date(paste0(seq(1970, 2015, 5), "-01-01"))))), + aggre.type = "cartesian") + + setDT(ag2) + setkeyv(ag1, c("sex", "period")) + setkeyv(ag2, c("sex", "period")) + expect_equal(sum(ag1$pyrs), sum(ag2$pyrs)) + expect_equal(sum(ag1$from0to1), sum(ag2$from0to1)) + expect_equal(ag1$pyrs, ag2$pyrs) + expect_equal(ag1$from0to1, ag2$from0to1) + +}) + +test_that("lexpanding with aggre.type = 'cartesian' works; only time scales used", { + skip_usually() + + BL <- list(fot = 0:5, age = seq(0,100, 5)) + ag1 <- lexpand(sire[dg_date < ex_date, ], + breaks = BL, status = status, entry.status = 0L, + birth = bi_date, entry = dg_date, exit = ex_date) + setDT(ag1) + forceLexisDT(ag1, breaks = BL, allScales = c("fot", "per", "age")) + + ag3 <- aggre(ag1, by = list(fot, age), type = "cartesian") + setDT(ag3) + + ag4 <- aggre(ag1, by = list(fot, age), type = "unique") + setDT(ag4) + + ag1[, `:=`(fot = try2int(popEpi:::cutLow(fot, c(BL$fot, Inf))), + age = try2int(popEpi:::cutLow(age, c(BL$age, Inf))))] + ceejay <- do.call(CJ, lapply(BL, function(x) x[-length(x)])) + setkey(ceejay, fot, age); setkey(ag1, fot, age) + ag1 <- ag1[ceejay, list(pyrs = sum(lex.dur), + from0to1 = sum(lex.Xst == 1L)), by = .EACHI] + ag1[is.na(pyrs), pyrs := 0] + ag1[is.na(from0to1), from0to1 := 0] + + ag2 <- lexpand(sire[dg_date < ex_date, ], + breaks = list(fot = 0:5, age = seq(0,100, 5)), + status = status, entry.status = 0L, + birth = bi_date, entry = dg_date, exit = ex_date, + aggre = list(fot, age), aggre.type = "cartesian") + + setDT(ag2) + setkeyv(ag1, c("fot", "age")) + setkeyv(ag2, c("fot", "age")) + setkeyv(ag3, c("fot", "age")) + expect_equal(sum(ag1$pyrs), sum(ag3$pyrs)) + expect_equal(sum(ag1$from0to1), sum(ag3$from0to1)) + expect_equal(ag1$pyrs, ag3$pyrs) + expect_equal(ag1$from0to1, ag3$from0to1) + + expect_equal(sum(ag1$pyrs), sum(ag2$pyrs)) + expect_equal(sum(ag1$from0to1), sum(ag2$from0to1)) + expect_equal(ag1$pyrs, ag2$pyrs) + expect_equal(ag1$from0to1, ag2$from0to1) + +}) + + +test_that("lexpanding and aggregating to years works", { + ag1 <- lexpand(sire[dg_date < ex_date, ], + breaks = list(per=2000:2014), status = status, + birth = bi_date, entry = dg_date, exit = ex_date) + setDT(ag1) + ag1[, `:=`(per = as.integer(popEpi:::cutLow(per, 2000:2014)))] + ag1 <- ag1[, list(pyrs = sum(lex.dur), from0to1 = sum(lex.Xst == 1L)), keyby = per] + + ag2 <- lexpand(sire[dg_date < ex_date, ], + breaks = list(per = 2000:2014), status = status, + birth = bi_date, entry = dg_date, exit = ex_date, + aggre = list(per), aggre.type = "unique") + setDT(ag2) + ag3 <- lexpand(sire[dg_date < ex_date, ], + breaks = list(per = 2000:2014, age = c(seq(0,100,5),Inf), fot = c(0:10, Inf)), + status = status, + birth = bi_date, entry = dg_date, exit = ex_date, + aggre = list(y = per), aggre.type = "unique") + setDT(ag3) + expect_equal(ag1$pyrs, ag2$pyrs) + expect_equal(ag1$from0to1, ag2$from0to1) + expect_equal(ag1$pyrs, ag3$pyrs) + expect_equal(ag1$from0to1, ag3$from0to1) + +}) + +# Aggre check (to totpyrs) ----------------------------------------------------- + +test_that("lexpand aggre produces correct results", { + skip_on_cran() + x <- copy(sire)[dg_date < ex_date, ] + x[, fot := get.yrs(ex_date, year.length = "actual") - get.yrs(dg_date, year.length = "actual")] + totpyrs <- x[, sum(fot)] + counts <- x[, .N, by = .(status)] + + x <- lexpand(sire[dg_date < ex_date, ], + birth = bi_date, entry = dg_date, exit = ex_date, + breaks=list(fot=c(0,5,10,50,Inf), age=c(seq(0,85,5),Inf), per = 1993:2013), + status=status, aggre = list(fot, age, per)) + setDT(x) + row_length <- x[,list( length(unique(age)), length(unique(per)), length(unique(fot)))] + + expect_equal( x[,sum(pyrs)], totpyrs, tolerance = 0.001) + expect_equal( x[,sum(from0to0)], counts[1,N]) + expect_equal( x[,sum(from0to1)], counts[2,N]) + expect_equal( x[,sum(from0to2)], counts[3,N]) + #expect_equal( prod(row_length), x[,.N]) +}) + +test_that('lexpand aggre: multistate column names correct', { + + x <- lexpand(sire[dg_date < ex_date, ][0:100], + birth = bi_date, entry = dg_date, exit = ex_date, + breaks=list(fot=c(0,5,10,50,Inf), age=c(seq(0,85,5),Inf), + per = 1993:2013), + status=status, aggre = list(fot, age, per)) + + expect_equal(intersect(names(x), c('from0to0','from0to1','from0to2')), + c('from0to0','from0to1','from0to2')) +}) + + +# overlapping time lines -------------------------------------------------- + +test_that('lexpansion w/ overlapping = TRUE/FALSE produces double/undoubled pyrs', { + skip_usually() + + sire2 <- copy(sire)[dg_date < ex_date, ][1:100] + sire2[, dg_yrs := get.yrs(dg_date, "actual")] + sire2[, ex_yrs := get.yrs(ex_date, "actual")] + sire2[, bi_yrs := get.yrs(bi_date, "actual")] + sire2[, id := 1:.N] + sire2 <- sire2[rep(1:.N, each=2)] + + sire2[seq(2,.N, by=2), dg_yrs := (ex_yrs + dg_yrs)/2L] + sire2[, dg_age := dg_yrs-bi_yrs] + + x <- lexpand(sire2, birth = "bi_yrs", entry = "bi_yrs", event="dg_yrs", + exit = "ex_yrs", status="status", entry.status = 0L, id = "id", + overlapping = TRUE) + setDT(x) + expect_equal(x[, sum(lex.dur), keyby=lex.id]$V1, sire2[, sum(ex_yrs-bi_yrs), keyby=id]$V1) + + x <- lexpand(sire2, birth = "bi_yrs", entry = "bi_yrs", event="dg_yrs", + exit = "ex_yrs", status="status", entry.status = 0L, id = "id", + overlapping = FALSE) + setDT(x) + expect_equal(x[, sum(lex.dur), keyby=lex.id]$V1, + sire2[!duplicated(id), sum(ex_yrs-bi_yrs), keyby=id]$V1) +}) + + + +test_that("different specifications of time vars work with event defined and overlapping=FALSE", { + + dt <- data.table(bi_date = as.Date('1949-01-01'), + dg_date = as.Date(paste0(1999:2000, "-01-01")), + start = as.Date("1997-01-01"), + end = as.Date('2002-01-01'), + status = c(1,2), id=1) + + ## birth -> entry -> event -> exit + x1 <- lexpand(data = dt, subset = NULL, + birth = bi_date, entry = start, exit = end, event = dg_date, + id = id, overlapping = FALSE, entry.status = 0, status = status, + merge = FALSE) + expect_equal(x1$lex.dur, c(2,1,2)) + expect_equal(x1$age, c(48,50,51)) + expect_equal(x1$lex.Cst, 0:2) + expect_equal(x1$lex.Xst, c(1,2,2)) + + ## birth -> entry = event -> exit + expect_error( + lexpand(data = dt, subset = NULL, + birth = bi_date, entry = dg_date, exit = end, event = dg_date, + id = id, overlapping = FALSE, entry.status = 0, status = status, + merge = FALSE), + regexp = paste0("some rows have simultaneous 'entry' and 'event', ", + "which is not supported with overlapping = FALSE; ", + "perhaps separate them by one day?") + ) + + ## birth = entry -> event -> exit + x3 <- lexpand(data = dt, subset = NULL, + birth = bi_date, entry = bi_date, exit = end, event = dg_date, + id = id, overlapping = FALSE, entry.status = 0, status = status, + merge = FALSE) + expect_equal(x3$lex.dur, c(50,1,2)) + expect_equal(x3$age, c(0,50,51)) + expect_equal(x3$lex.Cst, 0:2) + expect_equal(x3$lex.Xst, c(1,2,2)) + + ## birth -> entry -> event = exit + expect_error( + lexpand(data = dt, subset = NULL, + birth = bi_date, entry = dg_date, exit = end, event = end, + id = id, overlapping = FALSE, entry.status = 0, status = status, + merge = FALSE), + regexp = paste0("subject\\(s\\) defined by lex.id had several rows ", + "where 'event' time had the same value, which is not ", + "supported with overlapping = FALSE; perhaps separate ", + "them by one day?") + ) + + ## birth = entry -> event -> exit + x6 <- lexpand(data = dt, subset = NULL, + birth = bi_date, entry = bi_date, exit = end, event = dg_date, + id = id, overlapping = FALSE, entry.status = 0, status = status, + merge = FALSE) + expect_equal(x6$lex.dur, c(50,1,2)) + expect_equal(x6$age, c(0,50,51)) + expect_equal(x6$lex.Cst, 0:2) + expect_equal(x6$lex.Xst, c(1,2,2)) + +}) + + +test_that("lexpand drops persons outside breaks window correctly", { + skip_usually() + + dt <- data.table(bi_date = as.Date('1949-01-01'), + dg_date = as.Date(paste0(2000, "-01-01")), + start = as.Date("1997-01-01"), + end = as.Date('2002-01-01'), + status = c(2), id=1) + + ## by age + x1 <- lexpand(data = dt, subset = NULL, + birth = bi_date, entry = start, exit = end, event = dg_date, + id = id, overlapping = FALSE, entry.status = 0, status = status, + merge = FALSE, breaks = list(age = 50:55)) + expect_equal(x1$age, c(50, 51, 52)) + + ## by period + x1 <- lexpand(data = dt, subset = NULL, + birth = bi_date, entry = start, exit = end, event = dg_date, + id = id, overlapping = FALSE, entry.status = 0, status = status, + merge = FALSE, breaks = list(per = 2000:2005)) + expect_equal(x1$per, c(2000, 2001)) + + + ## by fot + x1 <- lexpand(data = dt, subset = NULL, + birth = bi_date, entry = start, exit = end, event = dg_date, + id = id, overlapping = FALSE, entry.status = 0, status = status, + merge = FALSE, breaks = list(fot = 2:5)) + expect_equal(x1$fot, c(2, 3, 4)) +}) + diff --git a/tests/testthat/test_prevtab.R b/tests/testthat/test_prevtab.R index 2c3a018..6cd1262 100644 --- a/tests/testthat/test_prevtab.R +++ b/tests/testthat/test_prevtab.R @@ -1,21 +1,21 @@ -# context("prevtab") -# -# -# -# test_that("prevtab produces intended results", { -# -# -# mp <- data.table(popEpi::meanpop_fi) -# setnames(mp, c("year", "agegroup"), c("per", "age")) -# x <- lexpand(popEpi::sire, -# birth = bi_date, entry = dg_date, exit = ex_date, -# status = status %in% 1:2) -# pt1 <- prevtab(per ~ sex + fot, data = x, meanpop = mp, -# breaks = list(per = 2009:2014, fot = c(0,1,5,10,Inf)), -# adjust = "agegroup", weights = "internal") -# pt2 <- prevtab(per ~ sex + work, data = x, meanpop = mp, -# breaks = list(per = 2009:2014, work = c(0,1,5,10,Inf)), -# adjust = "agegroup", weights = "internal") -# -# +# context("prevtab") +# +# +# +# test_that("prevtab produces intended results", { +# +# +# mp <- data.table(popEpi::meanpop_fi) +# setnames(mp, c("year", "agegroup"), c("per", "age")) +# x <- lexpand(popEpi::sire, +# birth = bi_date, entry = dg_date, exit = ex_date, +# status = status %in% 1:2) +# pt1 <- prevtab(per ~ sex + fot, data = x, meanpop = mp, +# breaks = list(per = 2009:2014, fot = c(0,1,5,10,Inf)), +# adjust = "agegroup", weights = "internal") +# pt2 <- prevtab(per ~ sex + work, data = x, meanpop = mp, +# breaks = list(per = 2009:2014, work = c(0,1,5,10,Inf)), +# adjust = "agegroup", weights = "internal") +# +# # }) \ No newline at end of file diff --git a/tests/testthat/test_rate.R b/tests/testthat/test_rate.R index 94bef28..d5ad942 100644 --- a/tests/testthat/test_rate.R +++ b/tests/testthat/test_rate.R @@ -1,312 +1,312 @@ -context('rate') - -# simultate test data -set.seed(5) -p18 <- data.table( OBS=round(runif(36)*10), PYRS=round(runif(36)*10000), AGEGROUP=1:18, COV = rep(c(1,2), each = 18)) -set.seed(5) -p20 <- data.table( OBS=round(runif(20)*10), PYRS=round(runif(20)*10000), AGEGROUP=1:20, COV = rep(c(1,2), each = 20)) -set.seed(5) -p101 <- data.table( OBS=round(runif(101)*10), PYRS=round(runif(101)*10000), AGEGROUP=1:101, COV = rep(c(1,2), each = 101)) -p18b <- data.table(p18) -setnames(p18b, c('OBS','PYRS','AGEGROUP'), c('obs','pyrs','agegroup')) -wv <- c(.1,.1,.1,.2,.2,.2,.2,.3,.3,.4,.5,.5,.5,.4,.4,.3,.2,.1) - -# test_that("ratesplines", { -# library(data.table) -# library(splines) -# data <- ratesplines(data = p18, obs = 'OBS', pyrs = 'PYRS', print = 'COV', spline = 'AGEGROUP') -# -# -# -# -# -# -# }) - -test_that("rate works with different weights", { - w1 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = c(1:18)) - w2 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = 'world_2000_20of5') - w3 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = 'cohort') - w4 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = NULL, weights = NULL) - w5 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = NULL, adjust = NULL, weights = NULL) - - expect_equal(sum(w1$PYRS), p18[,sum(PYRS)]) - expect_equal(sum(w2$OBS), p20[,sum(OBS)]) - expect_equal(sum(w3$OBS), p20[,sum(OBS)]) - expect_equal(w4$rate, p20[,list(sum(OBS)/sum(PYRS)), by ='COV'][, V1]) - expect_equal(w5$rate, p20[,list(sum(OBS)/sum(PYRS))][, V1]) - expect_is(w1, 'rate') - expect_is(w2, 'rate') - expect_is(w2, 'data.frame') - if(getOption("popEpi.datatable")) { - expect_is(w2, 'data.table') - } -}) - -test_that("rate CIs and SEs are correct", { - - ci <-data.table(agegroup = c(1,2,3,4), - obs=c(4,13,8,7), - pyrs=c(96,237,105,32), - rate=c(4.2,5.5,7.6,21.9), - std.pop=c(2773,2556,1113,184)) - - a1 <- ci[,sum(obs/pyrs*std.pop)/sum(std.pop)] # oikea estimaatti - a2 <- ci[,sqrt(sum(std.pop^2*((obs/pyrs)*(1-obs/pyrs))/pyrs))/sum(std.pop)] # myös oikea tulos - - ci0 <- rate(data = ci, obs = 'obs', pyrs = 'pyrs', print = NULL, adjust = 'agegroup', weights = list(agegroup = c(2773,2556,1113,184))) - expect_equal(ci0[,SE.rate.adj], a2, tolerance=0.0005) # test - expect_equal(ci0[,rate.adj],a1) - - # another... - ci <-data.table(agegroup = c(1,2,3,4), - obs=c(4,13,8,7), - pyrs=c(960,2370,1050,320), - rate=c(4.2,5.5,7.6,21.9), - std.pop=c(2773,2556,1113,184)) - - a1 <- ci[,sum(obs/pyrs*std.pop)/sum(std.pop)] # oikea estimaatti - a2 <- ci[,sqrt(sum(std.pop^2*((obs/pyrs)*(1-obs/pyrs))/pyrs))/sum(std.pop)] # myös oikea tulos - - ci0 <- rate(data = ci, obs = 'obs', pyrs = 'pyrs', print = NULL, adjust = 'agegroup', weights = list(agegroup = c(2773,2556,1113,184))) - expect_equal(c(ci0$rate.adj.lo, ci0$rate.adj.hi),c(a1 - a2*1.96, a1 + a2*1.96), tolerance = 0.0006) - expect_gt(ci0[,SE.rate.adj], a2) # WHYY?! - expect_equal(ci0[,rate.adj],a1) # ok -}) - -test_that("makeWeightsDT works in rate", { - set.seed(5) - p18 <- data.table( OBS=round(runif(36)*10), PYRS=round(runif(36)*10000), AGEGROUP=1:18, COV = rep(c(1,2), each = 18)) - op <- c('OBS', 'PYRS') - mw1 <- makeWeightsDT(p18, adjust = substitute(factor(AGEGROUP, 1:18, 1:18)), weights = wv, print = NULL, values = list(op)) - mw2 <- makeWeightsDT(p18, adjust = NULL, weights = NULL, print = substitute(COV), values = list(op)) - - attlist <- attr( mw1, 'makeWeightsDT') - - expect_equal(c(attlist$adVars, attlist$vaVars, 'weights'), c('factor','OBS','PYRS','weights') ) - - expect_equal(mw1[,as.character(factor)], as.character(1:18)) - expect_equal(mw2[,OBS], p18[,sum(OBS), by=COV][,V1]) -}) - - -test_that("names dont cause problems", { - w1 <- rate(data = p18b, obs = 'obs', pyrs = 'pyrs', print = 'COV', adjust = 'agegroup', weights = 'nordic') - w2 <- rate(data = p18b, obs = 'obs', pyrs = 'pyrs', print = 'COV', adjust = 'agegroup', weights = 'cohort') - w3 <- rate(data = p18b, obs = obs, pyrs = pyrs, print = COV, adjust = agegroup, weights = 'cohort') - w5 <- rate(data = p18b, obs = 'obs', pyrs = 'pyrs', print = 'COV', adjust = 'agegroup', weights = 'world_1966_18of5') - w6 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = 'world_2000_20of5') - w7a <- rate(data = p20, obs = OBS, pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = 'world_2000_20of5') - w7b <- rate(data = p20, obs = OBS, pyrs = PYRS, print = 'COV', adjust = 'AGEGROUP', weights = 'world_2000_20of5') - - wr <- p18b[,list(obs=sum(obs),pyrs =sum(pyrs)), by ='COV'] - - expect_equal(w2$obs, w1$obs) - expect_equal(w2$pyrs, w1$pyrs) - expect_equal(wr$obs, w1$obs) - expect_equal(wr$pyrs, w1$pyrs) - expect_equal(w2, w3, check.attributes = FALSE) -}) - - -test_that("rate works with different weights an subset", { - s0 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = NULL, adjust = 'AGEGROUP', weights = c(1:18)) - s0 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = NULL, adjust = 'AGEGROUP', weights = c(1:18), subset = COV==1) - s1 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = c(1:18), subset = COV==1) - s2 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = 'world_2000_20of5', subset = COV == 2) - s3 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = 'cohort', subset = COV==1) - s4 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = NULL, weights = NULL, subset = AGEGROUP != 1) - s5 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = NULL, adjust = NULL, weights = NULL, subset = COV == 1) - - - expect_equal(sum(s1$PYRS), p18[COV==1,sum(PYRS)]) - expect_equal(sum(s2$OBS), p20[COV==2,sum(OBS)]) - expect_equal(sum(s3$OBS), p20[COV==1,sum(OBS)]) - expect_equal(s4$rate, p20[AGEGROUP!= 1,list(sum(OBS)/sum(PYRS)), by ='COV'][, V1]) - expect_equal(s5$rate, p20[COV==1,list(sum(OBS)/sum(PYRS))][, V1]) - expect_is(s3, 'rate') -}) - -test_that("rate works with different weights and syntaxies", { - - - wv <- c(.1,.1,.1,.2,.2,.2,.2,.3,.3,.4,.5,.5,.5,.4,.4,.3,.2,.1) - - s0 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = NULL, adjust = NULL) - s1 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', adjust = factor(AGEGROUP, 1:18, 1:18), weights = c(.1,.1,.1,.2,.2,.2,.2,.3,.3,.4,.5,.5,.5,.4,.4,.3,.2,.1)) - s2 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', adjust = factor(AGEGROUP, 1:18, 1:18), weights = wv) - #s3 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', adjust = factor(AGEGROUP, 1:18, 1:18), weights = NULL) ??? - expect_is(s1, 'rate') - - expect_equal(sum(s1$PYRS), p18[,sum(PYRS)]) - expect_equal(sum(s2$OBS), p18[,sum(OBS)]) - #expect_equal(sum(s3$OBS), p18[,sum(OBS)]) - - - # non working syntaxes - expect_error( - rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = NULL, adjust = 'AGEGROUP', weights = list(1:18)) - ) # non named list - expect_error( - rate(data = p18, obs = 'OBS', pyrs = 'PYRS', adjust = factor(AGEGROUP, 1:18, 1:18), weights = list(c(.1,.1,.1,.2,.2,.2,.2,.3,.3,.4,.5,.5,.5,.4,.4,.3,.2,.1))) - ) - expect_error( - rate(data = p18, obs = 'OBS', pyrs = 'PYRS', adjust = list(factor(AGEGROUP, 1:18, 1:18), COV), weights = list( wv, c(0.5,0.5))) - ) # a list length of 2 - expect_error( - rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = c('AGEGROUP','COV'), weights = list(COV = c(.5,.5), AGEGROUP = 1:18)) - ) # duplicated names - - # working - s10 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', - adjust = list(agegr = factor(AGEGROUP, 1:18, 1:18)), - weights = list(agegr = c(.1,.1,.1,.2,.2,.2,.2,.3,.3,.4,.5,.5,.5,.4,.4,.3,.2,.1))) - - s11 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', - adjust = list(agegr = factor(AGEGROUP, 1:18, 1:18), gender = factor(COV, 1:2, 1:2)), - weights = list(agegr = c(.1,.1,.1,.2,.2,.2,.2,.3,.3,.4,.5,.5,.5,.4,.4,.3,.2,.1), gender = c(1,1))) - - s12 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', - adjust = list(agegr = factor(AGEGROUP, 1:18, 1:18)), - weights = list(agegr = wv)) - - - s13 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = NULL, adjust = 'AGEGROUP', weights = list(AGEGROUP = 1:18)) - s14a <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = NULL, adjust = c('AGEGROUP','COV'), weights = list(AGEGROUP = 1:18, COV = c(.5,.5))) # SAMA1 - s14b <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = NULL, adjust = c('AGEGROUP','COV'), weights = list(COV = c(.5,.5), AGEGROUP = 1:18)) # SAMA1 - - s16a <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = 'world_2000_20of5') # - s16b <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = list(AGEGROUP), weights = 'world_2000_20of5') # - s16c <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = AGEGROUP, weights = 'world_2000_20of5') # - - - - # Works - s21 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = (1:18), subset = COV==1) - s22 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = 'world_2000_20of5', subset = COV == 2) - s23 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = NULL, weights = NULL, subset = AGEGROUP != 1) - s24 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = NULL, adjust = NULL, weights = NULL, subset = COV == 1) - - ## internal weights - s23a <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = 'internal') - s23b <- rate(data = p20, obs = 'OBS', pyrs = PYRS, print = 'COV', adjust = 'AGEGROUP', weights = 'internal') - s23c <- rate(data = p20, obs = OBS, pyrs = PYRS, print = COV, adjust = AGEGROUP, weights = 'internal') - s24a <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', adjust = list(AGEGROUP, COV), weights = "internal") - s24b <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', adjust = c('AGEGROUP', 'COV'), weights = "internal") - - ## update and getCall - s2B <- update(s0, adjust = factor(AGEGROUP, 1:18, 1:18), weights = wv) - expect_equal(data.table(s2B), data.table(s2)) - -}) - - -test_that("in rate levels of adjust are printable", { - w1 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = list(COV, AG=findInterval(AGEGROUP, c(0,8,20))), adjust ='AGEGROUP', weights = 'cohort') - w2 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = list(COV, ag=findInterval(AGEGROUP, c(1,8,20))), adjust ='AGEGROUP', weights = 'world_2000_20of5') - expect_equal(w1[,.N] ,4) -}) - -test_that("rate.plot doesnt throw an error", { - w1 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = list(COV, AG=findInterval(AGEGROUP, c(0,8,20))), adjust ='AGEGROUP', weights = 'cohort') - w2 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = list(COV), adjust ='AGEGROUP', weights = 'world_2000_20of5') - w3 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS') - expect_silent(plot(w1)) - expect_silent(plot(w2, col = 4:5, conf.int = FALSE)) - expect_silent(plot(w3, eps = 0.01)) -}) - - -test_that("rate works with missing values", { - p18c <- copy(p18) - p18c[c(1,6), PYRS := NA] - expect_warning( rate(data = p18c, obs = 'OBS', pyrs = 'PYRS', adjust = 'AGEGROUP', weights = 1:18), "Data contains 2 NA values." ) -}) - -test_that("rate standard error and CIS works", { - se <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', adjust = 'AGEGROUP', weights = 1:18) - - expect_equal(se[, SE.rate.adj] , se[, SE.rate], tolerance = 0.00002) - expect_lt(se[, rate.adj.lo], se[, rate.adj]) - expect_lt(se[, rate.adj], se[, rate.adj.hi]) - expect_lt(se[, rate.lo], se[, rate]) - expect_lt(se[, rate], se[, rate.hi]) - expect_equal(se[, sqrt(OBS/PYRS^2)], se[, SE.rate]) - expect_equal( se[, rate.adj - SE.rate.adj*2], se[,rate.adj.lo], tolerance = 0.00002) - expect_equal( se[, rate.adj + SE.rate.adj*2], se[,rate.adj.hi], tolerance = 0.00002) -}) - - - -test_that("rate_ratio works", { - - w1 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = c(1:18)) - w2 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = 'world_2000_20of5') - w3 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = 'COV') - cru <- round((88/109818)/(78/110421),3) - - - # rate obj, one line - expect_equal( rate_ratio( w1[1], w1[2], crude = FALSE, SE.method = FALSE), - rate_ratio( w1[1], w1[2], crude = TRUE, SE.method = FALSE)) # oikein, crude ignoorattu - expect_equal( rate_ratio( w1[1], w1[2], crude = FALSE, SE.method = TRUE)$rate_ratio, - round(w1[1, rate.adj]/ w1[2, rate.adj],3)) - - expect_equal( rate_ratio( w1[1], w1[2], crude = TRUE, SE.method = TRUE), - data.frame(rate_ratio = 1.134, lower=1.087, upper=1.182)) - - # rate obj crude - suppressMessages( - expect_equal( rate_ratio( w3[1], w3[2], crude = TRUE, SE.method = TRUE), # oikein, hiljaa - rate_ratio( w3[1], w3[2], crude = FALSE, SE.method = TRUE)) # oikein, message - ) - # rate + SE - x <- c(w1[1, rate], w1[1, SE.rate]) - y <- c(w1[2, rate], w1[2, SE.rate]) - - expect_error( rate_ratio( x, y, crude = FALSE, SE.method = FALSE) ) # error, käyttäjän vastuu? - expect_error( rate_ratio( x, y, crude = TRUE, SE.method = FALSE) ) # error, käyttäjän vastuu? - expect_equal( rate_ratio( x, y, crude = FALSE, SE.method = TRUE), - rate_ratio( x, y, crude = TRUE, SE.method = TRUE)) - - # Obs + Pyrs - expect_warning( rate_ratio( c(88,78), c(109818,110421), crude = FALSE, SE.method = TRUE) ) # oikein, message - a <- rate_ratio( c(88,78), c(109818,110421), crude = TRUE, SE.method = FALSE)$rate_ratio - expect_equal(a, (c(88,78)/c(109818,110421))[1]/(c(88,78)/c(109818,110421))[2], tolerance = 0.001 ) - - # multi row rate objects - a0 <- rate_ratio( w1, w2, crude = FALSE, SE.method = TRUE) - b0 <- rate_ratio( w1[1], w2[1], crude = FALSE, SE.method = TRUE) - c0 <- rate_ratio( w1[2], w2[2], crude = FALSE, SE.method = TRUE) - - expect_equal(a0[1,], b0) - expect_equal(c(a0[2,]), c(c0)) - - #rate_ratio( w1, w2[1], crude = FALSE, SE.method = TRUE) # väärä, pitäisi tulla data.frame - #rate_ratio( w1, c(0.0005,0.00002), crude = FALSE, SE.method = TRUE) # väärä, pitäisi tulla data.frame - -}) - - -test_that("warnings and stops works properly", { - expect_error( - rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = list(1:18, 2:19)) - ) - expect_error( stdr.weights(c('wold00_1','world66_5')) ) - expect_error( stdr.weights(c('wold00_20of5')) ) -}) - - - -test_that("stdr.weights returns correct datasets", { - al <- c('world_1966_18of5','europe','nordic', - "world_2000_18of5","world_2000_101of1", - "world_2000_20of5") - le <- c(18,18,18,18,101,20) - expect_equal( stdr.weights(al[1])[,.N], le[1]) - expect_equal( stdr.weights(al[2])[,.N], le[2]) - expect_equal( stdr.weights(al[3])[,.N], le[3]) - expect_equal( stdr.weights(al[4])[,.N], le[4]) - expect_equal( stdr.weights(al[5])[,.N], le[5]) - expect_equal( stdr.weights(al[6])[,.N], le[6]) -}) +context('rate') + +# simultate test data +set.seed(5) +p18 <- data.table( OBS=round(runif(36)*10), PYRS=round(runif(36)*10000), AGEGROUP=1:18, COV = rep(c(1,2), each = 18)) +set.seed(5) +p20 <- data.table( OBS=round(runif(20)*10), PYRS=round(runif(20)*10000), AGEGROUP=1:20, COV = rep(c(1,2), each = 20)) +set.seed(5) +p101 <- data.table( OBS=round(runif(101)*10), PYRS=round(runif(101)*10000), AGEGROUP=1:101, COV = rep(c(1,2), each = 101)) +p18b <- data.table(p18) +setnames(p18b, c('OBS','PYRS','AGEGROUP'), c('obs','pyrs','agegroup')) +wv <- c(.1,.1,.1,.2,.2,.2,.2,.3,.3,.4,.5,.5,.5,.4,.4,.3,.2,.1) + +# test_that("ratesplines", { +# library(data.table) +# library(splines) +# data <- ratesplines(data = p18, obs = 'OBS', pyrs = 'PYRS', print = 'COV', spline = 'AGEGROUP') +# +# +# +# +# +# +# }) + +test_that("rate works with different weights", { + w1 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = c(1:18)) + w2 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = 'world_2000_20of5') + w3 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = 'cohort') + w4 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = NULL, weights = NULL) + w5 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = NULL, adjust = NULL, weights = NULL) + + expect_equal(sum(w1$PYRS), p18[,sum(PYRS)]) + expect_equal(sum(w2$OBS), p20[,sum(OBS)]) + expect_equal(sum(w3$OBS), p20[,sum(OBS)]) + expect_equal(w4$rate, p20[,list(sum(OBS)/sum(PYRS)), by ='COV'][, V1]) + expect_equal(w5$rate, p20[,list(sum(OBS)/sum(PYRS))][, V1]) + expect_is(w1, 'rate') + expect_is(w2, 'rate') + expect_is(w2, 'data.frame') + if(getOption("popEpi.datatable")) { + expect_is(w2, 'data.table') + } +}) + +test_that("rate CIs and SEs are correct", { + + ci <-data.table(agegroup = c(1,2,3,4), + obs=c(4,13,8,7), + pyrs=c(96,237,105,32), + rate=c(4.2,5.5,7.6,21.9), + std.pop=c(2773,2556,1113,184)) + + a1 <- ci[,sum(obs/pyrs*std.pop)/sum(std.pop)] # oikea estimaatti + a2 <- ci[,sqrt(sum(std.pop^2*((obs/pyrs)*(1-obs/pyrs))/pyrs))/sum(std.pop)] # myös oikea tulos + + ci0 <- rate(data = ci, obs = 'obs', pyrs = 'pyrs', print = NULL, adjust = 'agegroup', weights = list(agegroup = c(2773,2556,1113,184))) + expect_equal(ci0[,SE.rate.adj], a2, tolerance=0.0005) # test + expect_equal(ci0[,rate.adj],a1) + + # another... + ci <-data.table(agegroup = c(1,2,3,4), + obs=c(4,13,8,7), + pyrs=c(960,2370,1050,320), + rate=c(4.2,5.5,7.6,21.9), + std.pop=c(2773,2556,1113,184)) + + a1 <- ci[,sum(obs/pyrs*std.pop)/sum(std.pop)] # oikea estimaatti + a2 <- ci[,sqrt(sum(std.pop^2*((obs/pyrs)*(1-obs/pyrs))/pyrs))/sum(std.pop)] # myös oikea tulos + + ci0 <- rate(data = ci, obs = 'obs', pyrs = 'pyrs', print = NULL, adjust = 'agegroup', weights = list(agegroup = c(2773,2556,1113,184))) + expect_equal(c(ci0$rate.adj.lo, ci0$rate.adj.hi),c(a1 - a2*1.96, a1 + a2*1.96), tolerance = 0.0006) + expect_gt(ci0[,SE.rate.adj], a2) # WHYY?! + expect_equal(ci0[,rate.adj],a1) # ok +}) + +test_that("makeWeightsDT works in rate", { + set.seed(5) + p18 <- data.table( OBS=round(runif(36)*10), PYRS=round(runif(36)*10000), AGEGROUP=1:18, COV = rep(c(1,2), each = 18)) + op <- c('OBS', 'PYRS') + mw1 <- makeWeightsDT(p18, adjust = substitute(factor(AGEGROUP, 1:18, 1:18)), weights = wv, print = NULL, values = list(op)) + mw2 <- makeWeightsDT(p18, adjust = NULL, weights = NULL, print = substitute(COV), values = list(op)) + + attlist <- attr( mw1, 'makeWeightsDT') + + expect_equal(c(attlist$adVars, attlist$vaVars, 'weights'), c('factor','OBS','PYRS','weights') ) + + expect_equal(mw1[,as.character(factor)], as.character(1:18)) + expect_equal(mw2[,OBS], p18[,sum(OBS), by=COV][,V1]) +}) + + +test_that("names dont cause problems", { + w1 <- rate(data = p18b, obs = 'obs', pyrs = 'pyrs', print = 'COV', adjust = 'agegroup', weights = 'nordic') + w2 <- rate(data = p18b, obs = 'obs', pyrs = 'pyrs', print = 'COV', adjust = 'agegroup', weights = 'cohort') + w3 <- rate(data = p18b, obs = obs, pyrs = pyrs, print = COV, adjust = agegroup, weights = 'cohort') + w5 <- rate(data = p18b, obs = 'obs', pyrs = 'pyrs', print = 'COV', adjust = 'agegroup', weights = 'world_1966_18of5') + w6 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = 'world_2000_20of5') + w7a <- rate(data = p20, obs = OBS, pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = 'world_2000_20of5') + w7b <- rate(data = p20, obs = OBS, pyrs = PYRS, print = 'COV', adjust = 'AGEGROUP', weights = 'world_2000_20of5') + + wr <- p18b[,list(obs=sum(obs),pyrs =sum(pyrs)), by ='COV'] + + expect_equal(w2$obs, w1$obs) + expect_equal(w2$pyrs, w1$pyrs) + expect_equal(wr$obs, w1$obs) + expect_equal(wr$pyrs, w1$pyrs) + expect_equal(w2, w3, check.attributes = FALSE) +}) + + +test_that("rate works with different weights an subset", { + s0 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = NULL, adjust = 'AGEGROUP', weights = c(1:18)) + s0 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = NULL, adjust = 'AGEGROUP', weights = c(1:18), subset = COV==1) + s1 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = c(1:18), subset = COV==1) + s2 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = 'world_2000_20of5', subset = COV == 2) + s3 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = 'cohort', subset = COV==1) + s4 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = NULL, weights = NULL, subset = AGEGROUP != 1) + s5 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = NULL, adjust = NULL, weights = NULL, subset = COV == 1) + + + expect_equal(sum(s1$PYRS), p18[COV==1,sum(PYRS)]) + expect_equal(sum(s2$OBS), p20[COV==2,sum(OBS)]) + expect_equal(sum(s3$OBS), p20[COV==1,sum(OBS)]) + expect_equal(s4$rate, p20[AGEGROUP!= 1,list(sum(OBS)/sum(PYRS)), by ='COV'][, V1]) + expect_equal(s5$rate, p20[COV==1,list(sum(OBS)/sum(PYRS))][, V1]) + expect_is(s3, 'rate') +}) + +test_that("rate works with different weights and syntaxies", { + + + wv <- c(.1,.1,.1,.2,.2,.2,.2,.3,.3,.4,.5,.5,.5,.4,.4,.3,.2,.1) + + s0 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = NULL, adjust = NULL) + s1 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', adjust = factor(AGEGROUP, 1:18, 1:18), weights = c(.1,.1,.1,.2,.2,.2,.2,.3,.3,.4,.5,.5,.5,.4,.4,.3,.2,.1)) + s2 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', adjust = factor(AGEGROUP, 1:18, 1:18), weights = wv) + #s3 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', adjust = factor(AGEGROUP, 1:18, 1:18), weights = NULL) ??? + expect_is(s1, 'rate') + + expect_equal(sum(s1$PYRS), p18[,sum(PYRS)]) + expect_equal(sum(s2$OBS), p18[,sum(OBS)]) + #expect_equal(sum(s3$OBS), p18[,sum(OBS)]) + + + # non working syntaxes + expect_error( + rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = NULL, adjust = 'AGEGROUP', weights = list(1:18)) + ) # non named list + expect_error( + rate(data = p18, obs = 'OBS', pyrs = 'PYRS', adjust = factor(AGEGROUP, 1:18, 1:18), weights = list(c(.1,.1,.1,.2,.2,.2,.2,.3,.3,.4,.5,.5,.5,.4,.4,.3,.2,.1))) + ) + expect_error( + rate(data = p18, obs = 'OBS', pyrs = 'PYRS', adjust = list(factor(AGEGROUP, 1:18, 1:18), COV), weights = list( wv, c(0.5,0.5))) + ) # a list length of 2 + expect_error( + rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = c('AGEGROUP','COV'), weights = list(COV = c(.5,.5), AGEGROUP = 1:18)) + ) # duplicated names + + # working + s10 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', + adjust = list(agegr = factor(AGEGROUP, 1:18, 1:18)), + weights = list(agegr = c(.1,.1,.1,.2,.2,.2,.2,.3,.3,.4,.5,.5,.5,.4,.4,.3,.2,.1))) + + s11 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', + adjust = list(agegr = factor(AGEGROUP, 1:18, 1:18), gender = factor(COV, 1:2, 1:2)), + weights = list(agegr = c(.1,.1,.1,.2,.2,.2,.2,.3,.3,.4,.5,.5,.5,.4,.4,.3,.2,.1), gender = c(1,1))) + + s12 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', + adjust = list(agegr = factor(AGEGROUP, 1:18, 1:18)), + weights = list(agegr = wv)) + + + s13 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = NULL, adjust = 'AGEGROUP', weights = list(AGEGROUP = 1:18)) + s14a <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = NULL, adjust = c('AGEGROUP','COV'), weights = list(AGEGROUP = 1:18, COV = c(.5,.5))) # SAMA1 + s14b <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = NULL, adjust = c('AGEGROUP','COV'), weights = list(COV = c(.5,.5), AGEGROUP = 1:18)) # SAMA1 + + s16a <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = 'world_2000_20of5') # + s16b <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = list(AGEGROUP), weights = 'world_2000_20of5') # + s16c <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = AGEGROUP, weights = 'world_2000_20of5') # + + + + # Works + s21 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = (1:18), subset = COV==1) + s22 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = 'world_2000_20of5', subset = COV == 2) + s23 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = NULL, weights = NULL, subset = AGEGROUP != 1) + s24 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = NULL, adjust = NULL, weights = NULL, subset = COV == 1) + + ## internal weights + s23a <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = 'internal') + s23b <- rate(data = p20, obs = 'OBS', pyrs = PYRS, print = 'COV', adjust = 'AGEGROUP', weights = 'internal') + s23c <- rate(data = p20, obs = OBS, pyrs = PYRS, print = COV, adjust = AGEGROUP, weights = 'internal') + s24a <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', adjust = list(AGEGROUP, COV), weights = "internal") + s24b <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', adjust = c('AGEGROUP', 'COV'), weights = "internal") + + ## update and getCall + s2B <- update(s0, adjust = factor(AGEGROUP, 1:18, 1:18), weights = wv) + expect_equal(data.table(s2B), data.table(s2)) + +}) + + +test_that("in rate levels of adjust are printable", { + w1 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = list(COV, AG=findInterval(AGEGROUP, c(0,8,20))), adjust ='AGEGROUP', weights = 'cohort') + w2 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = list(COV, ag=findInterval(AGEGROUP, c(1,8,20))), adjust ='AGEGROUP', weights = 'world_2000_20of5') + expect_equal(w1[,.N] ,4) +}) + +test_that("rate.plot doesnt throw an error", { + w1 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = list(COV, AG=findInterval(AGEGROUP, c(0,8,20))), adjust ='AGEGROUP', weights = 'cohort') + w2 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = list(COV), adjust ='AGEGROUP', weights = 'world_2000_20of5') + w3 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS') + expect_silent(plot(w1)) + expect_silent(plot(w2, col = 4:5, conf.int = FALSE)) + expect_silent(plot(w3, eps = 0.01)) +}) + + +test_that("rate works with missing values", { + p18c <- copy(p18) + p18c[c(1,6), PYRS := NA] + expect_warning( rate(data = p18c, obs = 'OBS', pyrs = 'PYRS', adjust = 'AGEGROUP', weights = 1:18), "Data contains 2 NA values." ) +}) + +test_that("rate standard error and CIS works", { + se <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', adjust = 'AGEGROUP', weights = 1:18) + + expect_equal(se[, SE.rate.adj] , se[, SE.rate], tolerance = 0.00002) + expect_lt(se[, rate.adj.lo], se[, rate.adj]) + expect_lt(se[, rate.adj], se[, rate.adj.hi]) + expect_lt(se[, rate.lo], se[, rate]) + expect_lt(se[, rate], se[, rate.hi]) + expect_equal(se[, sqrt(OBS/PYRS^2)], se[, SE.rate]) + expect_equal( se[, rate.adj - SE.rate.adj*2], se[,rate.adj.lo], tolerance = 0.00002) + expect_equal( se[, rate.adj + SE.rate.adj*2], se[,rate.adj.hi], tolerance = 0.00002) +}) + + + +test_that("rate_ratio works", { + + w1 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = c(1:18)) + w2 <- rate(data = p20, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = 'world_2000_20of5') + w3 <- rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = 'COV') + cru <- round((88/109818)/(78/110421),3) + + + # rate obj, one line + expect_equal( rate_ratio( w1[1], w1[2], crude = FALSE, SE.method = FALSE), + rate_ratio( w1[1], w1[2], crude = TRUE, SE.method = FALSE)) # oikein, crude ignoorattu + expect_equal( rate_ratio( w1[1], w1[2], crude = FALSE, SE.method = TRUE)$rate_ratio, + round(w1[1, rate.adj]/ w1[2, rate.adj],3)) + + expect_equal( rate_ratio( w1[1], w1[2], crude = TRUE, SE.method = TRUE), + data.frame(rate_ratio = 1.134, lower=1.087, upper=1.182)) + + # rate obj crude + suppressMessages( + expect_equal( rate_ratio( w3[1], w3[2], crude = TRUE, SE.method = TRUE), # oikein, hiljaa + rate_ratio( w3[1], w3[2], crude = FALSE, SE.method = TRUE)) # oikein, message + ) + # rate + SE + x <- c(w1[1, rate], w1[1, SE.rate]) + y <- c(w1[2, rate], w1[2, SE.rate]) + + expect_error( rate_ratio( x, y, crude = FALSE, SE.method = FALSE) ) # error, käyttäjän vastuu? + expect_error( rate_ratio( x, y, crude = TRUE, SE.method = FALSE) ) # error, käyttäjän vastuu? + expect_equal( rate_ratio( x, y, crude = FALSE, SE.method = TRUE), + rate_ratio( x, y, crude = TRUE, SE.method = TRUE)) + + # Obs + Pyrs + expect_warning( rate_ratio( c(88,78), c(109818,110421), crude = FALSE, SE.method = TRUE) ) # oikein, message + a <- rate_ratio( c(88,78), c(109818,110421), crude = TRUE, SE.method = FALSE)$rate_ratio + expect_equal(a, (c(88,78)/c(109818,110421))[1]/(c(88,78)/c(109818,110421))[2], tolerance = 0.001 ) + + # multi row rate objects + a0 <- rate_ratio( w1, w2, crude = FALSE, SE.method = TRUE) + b0 <- rate_ratio( w1[1], w2[1], crude = FALSE, SE.method = TRUE) + c0 <- rate_ratio( w1[2], w2[2], crude = FALSE, SE.method = TRUE) + + expect_equal(a0[1,], b0) + expect_equal(c(a0[2,]), c(c0)) + + #rate_ratio( w1, w2[1], crude = FALSE, SE.method = TRUE) # väärä, pitäisi tulla data.frame + #rate_ratio( w1, c(0.0005,0.00002), crude = FALSE, SE.method = TRUE) # väärä, pitäisi tulla data.frame + +}) + + +test_that("warnings and stops works properly", { + expect_error( + rate(data = p18, obs = 'OBS', pyrs = 'PYRS', print = 'COV', adjust = 'AGEGROUP', weights = list(1:18, 2:19)) + ) + expect_error( stdr.weights(c('wold00_1','world66_5')) ) + expect_error( stdr.weights(c('wold00_20of5')) ) +}) + + + +test_that("stdr.weights returns correct datasets", { + al <- c('world_1966_18of5','europe','nordic', + "world_2000_18of5","world_2000_101of1", + "world_2000_20of5") + le <- c(18,18,18,18,101,20) + expect_equal( stdr.weights(al[1])[,.N], le[1]) + expect_equal( stdr.weights(al[2])[,.N], le[2]) + expect_equal( stdr.weights(al[3])[,.N], le[3]) + expect_equal( stdr.weights(al[4])[,.N], le[4]) + expect_equal( stdr.weights(al[5])[,.N], le[5]) + expect_equal( stdr.weights(al[6])[,.N], le[6]) +}) diff --git a/tests/testthat/test_relpois_mean_curve.R b/tests/testthat/test_relpois_mean_curve.R index 503fe50..f3652ec 100644 --- a/tests/testthat/test_relpois_mean_curve.R +++ b/tests/testthat/test_relpois_mean_curve.R @@ -1,68 +1,68 @@ -context("test rpcurve vs. survtab congruence") - -test_that("rpcurve and survtab e2 are approximately congruent", { - skip_on_cran() - - sire2 <- copy(sire)[dg_date < ex_date, ] - sire2[, agegr := cut(dg_age, breaks = c(0,45,70,Inf))] - - fb <- c(0,3/12,6/12,1:8,10) - x <- lexpand(sire2, birth = bi_date, entry = dg_date, exit = ex_date, - status = status %in% 1:2, - fot=fb, pophaz=data.table(popEpi::popmort)) - setDT(x) - setattr(x, "class", c("Lexis", "data.table", "data.frame")) - rp <- relpois(x, formula = lex.Xst %in% 1:2 ~ -1 + FOT+agegr) - mc <- rpcurve(rp) - - x[, pop.haz := NULL] - pm <- data.table(popEpi::popmort) - setnames(pm, c("year", "agegroup"), c("per", "age")) - w <- as.numeric(table(x$agegr)) - st <- survtab(Surv(fot, lex.Xst) ~ adjust(agegr), - pophaz = pm, weights = w, - relsurv.method = "e2", - data= x, breaks = list(fot = seq(0, 10, 1/12))) - setDT(mc) - setDT(st) - - expect_equal(st[Tstop %in% fb]$r.e2.as, mc$est, tolerance=0.0136, scale=1L) - - ## added old results on 2016-03-19, - ## ref = 4feb1ca37489737332cebf33d24550a9951a7630 - old_res <- c(0.9253749, 0.8801775, 0.8123319, 0.7237591, 0.6679470, - 0.6315218, 0.6035761, 0.5826368, 0.5645760, 0.5519045, 0.5368186) - - expect_equal(old_res, mc$est, tolerance=1e-5, scale=1L) -}) - - - -# comparison with flexsurv; maybe not needed ---------------------------------- -# library(flexsurv) -# sire2 <- lexpand(sire2, fot=c(0, 10), status) -# sire2[, year := year(ex_date)] -# sire2[, agegroup := as.integer(as.integer(ex_date-bi_date)/365.25)] -# sire2[agegroup > 100, agegroup := 100L] -# -# sire2 <- data.table:::merge.data.table(sire2, popmort, all.x=F, all.y=F, by=c("sex","year","agegroup")) -# sire2[lex.Xst == 0, haz := 0] ## not really needed, nothing changes even when it works -# -# ## spline model does not work with bhazard (nothing changes) -# fl <- flexsurvspline(Surv(lex.dur, lex.Xst %in% 1:2) ~ agegr, data=sire2, k = 2, bhazard=sire2$haz) -# ## this works -# fl <- flexsurvreg(Surv(lex.dur, lex.Xst %in% 1:2) ~ agegr, data=sire2, dist="gengamma", bhazard=sire2$haz) -# su <- summary.flexsurvreg(fl, newdata = sire2, ci = FALSE, t = fb, B = 0) -# su <- rbindlist(su) -# su <- su[, list(netsurv = mean(est)), by=time] -# -# plot(netsurv~time, data=su, type="l") -# lines(est~Tstop, data=mc, col="red") -# # lines(lo~Tstop, data=mc, col="red") -# # lines(hi~Tstop, data=mc, col="red") -# lines(st, "r.e2.as", col="blue", conf.int=FALSE) - - - - - +context("test rpcurve vs. survtab congruence") + +test_that("rpcurve and survtab e2 are approximately congruent", { + skip_on_cran() + + sire2 <- copy(sire)[dg_date < ex_date, ] + sire2[, agegr := cut(dg_age, breaks = c(0,45,70,Inf))] + + fb <- c(0,3/12,6/12,1:8,10) + x <- lexpand(sire2, birth = bi_date, entry = dg_date, exit = ex_date, + status = status %in% 1:2, + fot=fb, pophaz=data.table(popEpi::popmort)) + setDT(x) + setattr(x, "class", c("Lexis", "data.table", "data.frame")) + rp <- relpois(x, formula = lex.Xst %in% 1:2 ~ -1 + FOT+agegr) + mc <- rpcurve(rp) + + x[, pop.haz := NULL] + pm <- data.table(popEpi::popmort) + setnames(pm, c("year", "agegroup"), c("per", "age")) + w <- as.numeric(table(x$agegr)) + st <- survtab(Surv(fot, lex.Xst) ~ adjust(agegr), + pophaz = pm, weights = w, + relsurv.method = "e2", + data= x, breaks = list(fot = seq(0, 10, 1/12))) + setDT(mc) + setDT(st) + + expect_equal(st[Tstop %in% fb]$r.e2.as, mc$est, tolerance=0.0136, scale=1L) + + ## added old results on 2016-03-19, + ## ref = 4feb1ca37489737332cebf33d24550a9951a7630 + old_res <- c(0.9253749, 0.8801775, 0.8123319, 0.7237591, 0.6679470, + 0.6315218, 0.6035761, 0.5826368, 0.5645760, 0.5519045, 0.5368186) + + expect_equal(old_res, mc$est, tolerance=1e-5, scale=1L) +}) + + + +# comparison with flexsurv; maybe not needed ---------------------------------- +# library(flexsurv) +# sire2 <- lexpand(sire2, fot=c(0, 10), status) +# sire2[, year := year(ex_date)] +# sire2[, agegroup := as.integer(as.integer(ex_date-bi_date)/365.25)] +# sire2[agegroup > 100, agegroup := 100L] +# +# sire2 <- data.table:::merge.data.table(sire2, popmort, all.x=F, all.y=F, by=c("sex","year","agegroup")) +# sire2[lex.Xst == 0, haz := 0] ## not really needed, nothing changes even when it works +# +# ## spline model does not work with bhazard (nothing changes) +# fl <- flexsurvspline(Surv(lex.dur, lex.Xst %in% 1:2) ~ agegr, data=sire2, k = 2, bhazard=sire2$haz) +# ## this works +# fl <- flexsurvreg(Surv(lex.dur, lex.Xst %in% 1:2) ~ agegr, data=sire2, dist="gengamma", bhazard=sire2$haz) +# su <- summary.flexsurvreg(fl, newdata = sire2, ci = FALSE, t = fb, B = 0) +# su <- rbindlist(su) +# su <- su[, list(netsurv = mean(est)), by=time] +# +# plot(netsurv~time, data=su, type="l") +# lines(est~Tstop, data=mc, col="red") +# # lines(lo~Tstop, data=mc, col="red") +# # lines(hi~Tstop, data=mc, col="red") +# lines(st, "r.e2.as", col="blue", conf.int=FALSE) + + + + + diff --git a/tests/testthat/test_sir.R b/tests/testthat/test_sir.R index 2e514d9..f3901f7 100644 --- a/tests/testthat/test_sir.R +++ b/tests/testthat/test_sir.R @@ -1,441 +1,441 @@ -context("SIR") - - -test_that("SIR w/ coh=ref=popEpi::sire", { - ## don't skip on CRAN - sire2 <- copy(popEpi::sire) - sire2[, agegroup := cut(dg_age, breaks = c(0:17*5, Inf))] - levels(sire2$agegroup) <- 1:18 - sire2[, ex_y := year(ex_date)] - sire2[, dur := as.integer(ex_date-dg_date)/365.242199] - ltre <- ltable(sire2[status != 0], c("ex_y", "agegroup"), - expr = list(obs=.N, pyrs=sum(dur))) - setDT(ltre) - ltre[is.na(pyrs), pyrs := 0] - - sibr2 <- copy(popEpi::sibr) - sibr2[, agegroup := cut(dg_age, breaks = c(0:17*5, Inf))] - levels(sibr2$agegroup) <- 1:18 - sibr2[, ex_y := year(ex_date)] - sibr2[, dur := as.integer(ex_date-dg_date)/365.242199] - ltbr <- ltable(sibr2[status != 0], c("ex_y", "agegroup"), - expr = list(obs=.N, pyrs=sum(dur))) - setDT(ltbr) - ltbr[is.na(pyrs), pyrs := 0] - - - - sl <- sir(coh.data=ltre, coh.obs="obs", coh.pyrs="pyrs", - ref.data=ltre, ref.obs="obs", ref.pyrs="pyrs", - adjust= c("agegroup","ex_y")) - plot(sl) - ## SIR w/ coh=ref=popEpi::sire - ## don't skip on CRAN - expect_equal(sl$sir, 1) - expect_equal(sl$pyrs, 13783.81, tolerance=0.01) - expect_equal(sl$expected, 4595) - expect_equal(sl$observed, 4595) - - - sl <- sir(coh.data=ltre, coh.obs="obs", coh.pyrs="pyrs", - ref.data=ltbr, ref.obs="obs", ref.pyrs="pyrs", - adjust= c("agegroup","ex_y")) - - ## SIR w/ coh=ref=popEpi::sire" - expect_equal(sl$sir, 1.39, tolerance=0.01) - expect_equal(sl$pyrs, 13783.81, tolerance=0.01) - expect_equal(sl$expected, 3305.04, tolerance=0.01) - expect_equal(sl$observed, 4595) -}) - - - -# SIR mstate, subset + lexpand aggre --------------------------------------- - -# same model -c <- lexpand( popEpi::sire[dg_date0, labels=c("Alive","Dead")), - entry.status = factor(0, 0:1, labels = c("Alive", "Dead")), - data = nickel ) - - set.seed(1337) - lex$lex.id <- sample(paste0("abcd_", 1:nrow(lex)), size = nrow(lex)) - - lex_copy <- copy(lex) - - BL <- list( - per = 1920:1990, - age = 0:100 - ) - - epi_s1 <- splitLexis(lex, breaks = BL$per, time.scale = "per") - epi_s2 <- splitLexis(epi_s1, breaks = BL$age, time.scale = "age") - - pop_s1 <- splitLexisDT(lex, breaks=BL$per, timeScale="per", drop = FALSE) - pop_s2 <- splitLexisDT(pop_s1, breaks=BL$age, timeScale="age" , drop = FALSE) - - pop_sm <- splitMulti(lex, breaks = BL, drop = FALSE) - - expect_equal( - setDT(epi_s2), setDT(pop_sm), check.attributes = FALSE - ) - expect_equal( - setDT(epi_s2), setDT(pop_s2), check.attributes = FALSE - ) - expect_identical( - lex, lex_copy - ) - -}) - - - - - - - - - +context("Compare splitMulti results with splitLexis results") + + + + + +test_that("splitMulti and splitLexis are congruent", { + expect_identical(1L, 1L) ## to trigger testing... + skip_usually() + library(Epi) + + sire2 <- copy(sire) + sire2[, dg_yrs := get.yrs(dg_date, "actual")] + sire2[, ex_yrs := get.yrs(ex_date, "actual")] + sire2[, bi_yrs := get.yrs(bi_date, "actual")] + sire2[, id := 1:.N] + + BL1 <- list(fot = 0:5, per = 1990:1995, age = c(0, 60, Inf)) + + BL2 <- list(fot = c(10,Inf), age = seq(0,150,5)) + + BL3 <- list(fot = c(5, Inf), per = c(1900, 2100), age = c(25,100)) + + BL4 <- list(fot = 0:10) + + BL5 <- list(fot = 5:10) + + BL6 <- list(per = 1990:2000, age = c(50,70)) + + + BL <- list(BL1, BL2, BL3, BL4, BL5, BL6) + + x <- Lexis(data=sire2[dg_date < ex_date], entry=list(fot=0, per=dg_yrs, age=dg_age), + exit=list(per=ex_yrs), merge=TRUE, exit.status=1L, entry.status = 0L) + setDT(x) + setattr(x, "class", c("Lexis", "data.table", "data.frame")) + + + # one row per id --------------------------------------------------------------- + + test_that("splitMulti and splitLexis congruent with one row per id", { + expect_identical(1L, 1L) ## to trigger testing... + for (sc in seq_along(BL)) { + compareSMWithEpi(x, BL[[sc]]) + } + }) + + + + # multiple rows per id --------------------------------------------------------- + + sire2 <- sire2[rep(1:.N, each = 2)] + + x <- Lexis(data=sire2[dg_date < ex_date], entry=list(fot=0, per=dg_yrs, age=dg_age), + exit=list(per=ex_yrs), merge=TRUE, exit.status=1L, entry.status = 0L, id = id) + setDT(x) + setattr(x, "class", c("Lexis", "data.table", "data.frame")) + + for (sc in seq_along(BL)) { + test_that(paste0("splitLexisDT and splitLexis congruent with multiple rows per id with breaks no. ", sc), { + expect_identical(1L, 1L) ## to trigger testing... + compareSMWithEpi(x, BL[[sc]]) + }) + } + + # multistate using Lexis ----------------------------------------------------- + + + sire2[, "EX" := factor(status, levels = 0:2, ordered = TRUE)] + sire2[, "EN" := factor(0L, levels = 0:2, ordered = TRUE)] + levels(sire2$EX) <- levels(sire2$EN) <- c("ok", "dead", "dead") + + x <- Lexis(data=sire2[dg_date < ex_date & !duplicated(id)], entry=list(fot=0, per=bi_yrs, age=0), + exit=list(per=ex_yrs), merge=TRUE, exit.status=EX, entry.status = EN, id = id) + + x <- cutLexis(x, cut = x$dg_yrs, timescale = "per", new.state = "sick", precursor.state = "ok") + setDT(x) + setattr(x, "class", c("Lexis", "data.table", "data.frame")) + + BL[[1L]] <- NULL ## this would drop all rows in split data + + for (sc in seq_along(BL)) { + test_that(paste0("splitLexisDT and splitLexis congruent with multiple Lexis states per id using breaks list no. ", sc), { + expect_identical(1L, 1L) ## to trigger testing... + compareSMWithEpi(x, BL[[sc]]) + }) + } + + +}) + + + + +test_that("splitMulti agrees with splitLexis, vol. II", { + + library("Epi") + + data(nickel, package = "Epi") + + lex <- Lexis( entry = list(age=agein, + per=agein+dob), + exit = list(age=ageout), + exit.status = factor(icd>0, labels=c("Alive","Dead")), + entry.status = factor(0, 0:1, labels = c("Alive", "Dead")), + data = nickel ) + + set.seed(1337) + lex$lex.id <- sample(paste0("abcd_", 1:nrow(lex)), size = nrow(lex)) + + lex_copy <- copy(lex) + + BL <- list( + per = 1920:1990, + age = 0:100 + ) + + epi_s1 <- splitLexis(lex, breaks = BL$per, time.scale = "per") + epi_s2 <- splitLexis(epi_s1, breaks = BL$age, time.scale = "age") + + pop_s1 <- splitLexisDT(lex, breaks=BL$per, timeScale="per", drop = FALSE) + pop_s2 <- splitLexisDT(pop_s1, breaks=BL$age, timeScale="age" , drop = FALSE) + + pop_sm <- splitMulti(lex, breaks = BL, drop = FALSE) + + expect_equal( + setDT(epi_s2), setDT(pop_sm), check.attributes = FALSE + ) + expect_equal( + setDT(epi_s2), setDT(pop_s2), check.attributes = FALSE + ) + expect_identical( + lex, lex_copy + ) + +}) + + + + + + + + + diff --git a/tests/testthat/test_splitting_attributes.R b/tests/testthat/test_splitting_attributes.R index 9e1319f..ac55d74 100644 --- a/tests/testthat/test_splitting_attributes.R +++ b/tests/testthat/test_splitting_attributes.R @@ -1,94 +1,94 @@ -context("Attributes of data split by popEpi funs") - - - - - -test_that("popEpi splitters produce correct attributes", { - - library("Epi") - library("data.table") - - sire <- setDT(copy(popEpi::sire)) - sire[, "dg_yrs" := get.yrs(dg_date, "actual")] - sire[, "ex_yrs" := get.yrs(ex_date, "actual")] - sire[, "bi_yrs" := get.yrs(bi_date, "actual")] - sire[, "id" := 1:.N] - - sire <- Lexis( - data = sire[dg_date < ex_date], - entry = list(fot=0, per=dg_yrs, age=dg_age), - exit = list(per=ex_yrs), - merge = TRUE, - exit.status = 1L, entry.status = 0L - ) - BL <- list(age = c(0, 50), fot = c(0, 5)) - sm_1 <- splitMulti(sire, breaks = BL["fot"], drop = TRUE) - sm_1 <- splitMulti(sm_1, breaks = BL["age"], drop = TRUE) - - sm_2 <- splitMulti(sire, breaks = BL, drop = TRUE) - - sl <- splitLexisDT(sire, breaks = BL$fot, timeScale = "fot", drop = TRUE) - sl <- splitLexisDT(sl, breaks = BL$age, timeScale = "age", drop = TRUE) - - lp <- lexpand(data.table(sire)[, .(bi_yrs, dg_yrs, ex_yrs, status)], - birth = bi_yrs, entry = dg_yrs, exit = ex_yrs, status = status, - breaks = BL) - - lapply(list(sm_1, sm_2, sl, lp), function(lex) { - expect_identical( - attr(lex, "breaks"), - list(fot = c(0, 5), per = NULL, age = c(0, 50)) - ) - expect_identical( - attr(lex, "time.scales"), - c("fot", "per", "age") - ) - }) - -}) - - - - - -test_that("popEpi splitters retain time.since attribute", { - ## based on simLexis example from Epi 2.19 - library("Epi") - library("data.table") - library("splines") - - data("DMlate", package = "Epi") - dml <- Lexis( entry = list(Per=dodm, Age=dodm-dobth, DMdur=0 ), - exit = list(Per=dox), - exit.status = factor(!is.na(dodth),labels=c("DM","Dead")), - data = DMlate[runif(nrow(DMlate))<0.1,] ) - # Split follow-up at insulin, introduce a new timescale, - # and split non-precursor states - dmi <- cutLexis( dml, cut = dml$doins, - pre = "DM", - new.state = "Ins", - new.scale = "t.Ins", - split.states = TRUE ) - - # Split the follow in 1-year intervals for modelling - Si <- splitLexis( dmi, 0:30/2, "DMdur" ) - - - sldt <- splitLexisDT(dmi, breaks = 0:30/2, timeScale = "DMdur") - sm <- splitMulti(dmi, breaks = list(DMdur = 0:30/2)) - - lex_attr_nms <- c("time.since", "breaks", "time.scales") - expect_identical( - attributes(Si)[lex_attr_nms], attributes(sm)[lex_attr_nms] - ) - expect_identical( - attributes(Si)[lex_attr_nms], attributes(sldt)[lex_attr_nms] - ) - -}) - - - - - +context("Attributes of data split by popEpi funs") + + + + + +test_that("popEpi splitters produce correct attributes", { + + library("Epi") + library("data.table") + + sire <- setDT(copy(popEpi::sire)) + sire[, "dg_yrs" := get.yrs(dg_date, "actual")] + sire[, "ex_yrs" := get.yrs(ex_date, "actual")] + sire[, "bi_yrs" := get.yrs(bi_date, "actual")] + sire[, "id" := 1:.N] + + sire <- Lexis( + data = sire[dg_date < ex_date], + entry = list(fot=0, per=dg_yrs, age=dg_age), + exit = list(per=ex_yrs), + merge = TRUE, + exit.status = 1L, entry.status = 0L + ) + BL <- list(age = c(0, 50), fot = c(0, 5)) + sm_1 <- splitMulti(sire, breaks = BL["fot"], drop = TRUE) + sm_1 <- splitMulti(sm_1, breaks = BL["age"], drop = TRUE) + + sm_2 <- splitMulti(sire, breaks = BL, drop = TRUE) + + sl <- splitLexisDT(sire, breaks = BL$fot, timeScale = "fot", drop = TRUE) + sl <- splitLexisDT(sl, breaks = BL$age, timeScale = "age", drop = TRUE) + + lp <- lexpand(data.table(sire)[, .(bi_yrs, dg_yrs, ex_yrs, status)], + birth = bi_yrs, entry = dg_yrs, exit = ex_yrs, status = status, + breaks = BL) + + lapply(list(sm_1, sm_2, sl, lp), function(lex) { + expect_identical( + attr(lex, "breaks"), + list(fot = c(0, 5), per = NULL, age = c(0, 50)) + ) + expect_identical( + attr(lex, "time.scales"), + c("fot", "per", "age") + ) + }) + +}) + + + + + +test_that("popEpi splitters retain time.since attribute", { + ## based on simLexis example from Epi 2.19 + library("Epi") + library("data.table") + library("splines") + + data("DMlate", package = "Epi") + dml <- Lexis( entry = list(Per=dodm, Age=dodm-dobth, DMdur=0 ), + exit = list(Per=dox), + exit.status = factor(!is.na(dodth),labels=c("DM","Dead")), + data = DMlate[runif(nrow(DMlate))<0.1,] ) + # Split follow-up at insulin, introduce a new timescale, + # and split non-precursor states + dmi <- cutLexis( dml, cut = dml$doins, + pre = "DM", + new.state = "Ins", + new.scale = "t.Ins", + split.states = TRUE ) + + # Split the follow in 1-year intervals for modelling + Si <- splitLexis( dmi, 0:30/2, "DMdur" ) + + + sldt <- splitLexisDT(dmi, breaks = 0:30/2, timeScale = "DMdur") + sm <- splitMulti(dmi, breaks = list(DMdur = 0:30/2)) + + lex_attr_nms <- c("time.since", "breaks", "time.scales") + expect_identical( + attributes(Si)[lex_attr_nms], attributes(sm)[lex_attr_nms] + ) + expect_identical( + attributes(Si)[lex_attr_nms], attributes(sldt)[lex_attr_nms] + ) + +}) + + + + + diff --git a/tests/testthat/test_splitting_breaks.R b/tests/testthat/test_splitting_breaks.R index 75bb53b..a050548 100644 --- a/tests/testthat/test_splitting_breaks.R +++ b/tests/testthat/test_splitting_breaks.R @@ -1,80 +1,80 @@ -context("breaks attributes resulting from splitting") - - -test_that("splitMulti produces intended breaks list", { - skip_on_cran() - x <- data.table(popEpi::sibr) - x <- x[dg_date < ex_date & bi_date < dg_date] - - x <- Lexis( - data = x, - entry = list( - per = get.yrs(dg_date), fot = 0.0, - age = get.yrs(dg_date)-get.yrs(bi_date) - ), - exit = list( - per = get.yrs(ex_date) - ), entry.status = 0L, exit.status = 1L - ) - forceLexisDT(x, breaks = get_breaks(x), allScales = timeScales(x)) - - BL <- list(fot = 2:8, per = 1990:2000, age = seq(0,100, 10)) - xx <- splitMulti(x, breaks = BL, drop = TRUE) - - BL2 <- list(fot = 4:7, per = 1991:1999, age = seq(50,70, 10)) - xxx <- splitMulti(xx, breaks = BL, drop = TRUE) - - expect_equal(breaks(xx, "fot"), BL$fot) - expect_equal(breaks(xx, "per"), BL$per) - expect_equal(breaks(xx, "age"), BL$age) - - expect_equal(breaks(xxx, "fot"), BL$fot) - expect_equal(breaks(xxx, "per"), BL$per) - expect_equal(breaks(xxx, "age"), BL$age) -}) - - - -test_that("splitLexisDT produces intended breaks list", { - skip_on_cran() - x <- data.table(popEpi::sibr)[dg_date < ex_date, ] - - x <- Lexis( - data = x, - entry = list( - per = get.yrs(dg_date), fot = 0.0, - age = get.yrs(dg_date)-get.yrs(bi_date) - ), - exit = list( - per = get.yrs(ex_date) - ), entry.status = 0L, exit.status = 1L - ) - forceLexisDT(x, breaks = get_breaks(x), allScales = timeScales(x)) - - br <- 2:10 - xx <- splitLexisDT(x, breaks = br, timeScale = "fot", drop = TRUE) - - br2 <- 2:12 - xxx <- splitLexisDT(xx, breaks = br, timeScale = "fot", drop = TRUE) - - expect_equal(breaks(xx, "fot"), br) - expect_equal(breaks(xxx, "fot"), unique(br, br2)) - - - br <- 0:8 - xx <- splitLexisDT(x, breaks = br, timeScale = "fot", drop = FALSE) - - br2 <- 2:10 - xxx <- splitLexisDT(xx, breaks = br2, timeScale = "fot", drop = FALSE) - - br3 <- seq(9, 12, 0.5) - xxxx <- splitLexisDT(xxx, breaks = br3, timeScale = "fot", drop = FALSE) - - expect_equal(breaks(xx, "fot"), br) - expect_equal(breaks(xxx, "fot"), unique(c(br, br2))) - expect_equal(breaks(xxxx, "fot"), sort(unique(c(br, br3)))) -}) - - - - +context("breaks attributes resulting from splitting") + + +test_that("splitMulti produces intended breaks list", { + skip_on_cran() + x <- data.table(popEpi::sibr) + x <- x[dg_date < ex_date & bi_date < dg_date] + + x <- Lexis( + data = x, + entry = list( + per = get.yrs(dg_date), fot = 0.0, + age = get.yrs(dg_date)-get.yrs(bi_date) + ), + exit = list( + per = get.yrs(ex_date) + ), entry.status = 0L, exit.status = 1L + ) + forceLexisDT(x, breaks = get_breaks(x), allScales = timeScales(x)) + + BL <- list(fot = 2:8, per = 1990:2000, age = seq(0,100, 10)) + xx <- splitMulti(x, breaks = BL, drop = TRUE) + + BL2 <- list(fot = 4:7, per = 1991:1999, age = seq(50,70, 10)) + xxx <- splitMulti(xx, breaks = BL, drop = TRUE) + + expect_equal(breaks(xx, "fot"), BL$fot) + expect_equal(breaks(xx, "per"), BL$per) + expect_equal(breaks(xx, "age"), BL$age) + + expect_equal(breaks(xxx, "fot"), BL$fot) + expect_equal(breaks(xxx, "per"), BL$per) + expect_equal(breaks(xxx, "age"), BL$age) +}) + + + +test_that("splitLexisDT produces intended breaks list", { + skip_on_cran() + x <- data.table(popEpi::sibr)[dg_date < ex_date, ] + + x <- Lexis( + data = x, + entry = list( + per = get.yrs(dg_date), fot = 0.0, + age = get.yrs(dg_date)-get.yrs(bi_date) + ), + exit = list( + per = get.yrs(ex_date) + ), entry.status = 0L, exit.status = 1L + ) + forceLexisDT(x, breaks = get_breaks(x), allScales = timeScales(x)) + + br <- 2:10 + xx <- splitLexisDT(x, breaks = br, timeScale = "fot", drop = TRUE) + + br2 <- 2:12 + xxx <- splitLexisDT(xx, breaks = br, timeScale = "fot", drop = TRUE) + + expect_equal(breaks(xx, "fot"), br) + expect_equal(breaks(xxx, "fot"), unique(br, br2)) + + + br <- 0:8 + xx <- splitLexisDT(x, breaks = br, timeScale = "fot", drop = FALSE) + + br2 <- 2:10 + xxx <- splitLexisDT(xx, breaks = br2, timeScale = "fot", drop = FALSE) + + br3 <- seq(9, 12, 0.5) + xxxx <- splitLexisDT(xxx, breaks = br3, timeScale = "fot", drop = FALSE) + + expect_equal(breaks(xx, "fot"), br) + expect_equal(breaks(xxx, "fot"), unique(c(br, br2))) + expect_equal(breaks(xxxx, "fot"), sort(unique(c(br, br3)))) +}) + + + + diff --git a/tests/testthat/test_splitting_randomly.R b/tests/testthat/test_splitting_randomly_on_fixed_data.R similarity index 95% rename from tests/testthat/test_splitting_randomly.R rename to tests/testthat/test_splitting_randomly_on_fixed_data.R index 564ccb3..e682fe1 100644 --- a/tests/testthat/test_splitting_randomly.R +++ b/tests/testthat/test_splitting_randomly_on_fixed_data.R @@ -1,140 +1,140 @@ -context("Splitting tests on fixed data and random breaks") - - - - - -test_that("splitting funs congruent with random splitting and fixed data", { - - skip_usually() - - library("Epi") - library("data.table") - - data("occup", package = "Epi") - - occup <- Epi::Lexis( - entry = list(age = AoE, per = DoE), - exit = list(per = DoX), - entry.status = 0L, - exit.status = as.integer(Xst == "D"), - data = occup - ) - - - data("DMlate", package = "Epi") - - DMlate <- Epi::Lexis( - entry = list(fot = 0, per = dodm, age = dodm-dobth), - exit = list(per = dox), - entry.status = 0L, - exit.status = as.integer(!is.na(dodth)), - data = DMlate[DMlate$dox>DMlate$dodm, ] - ) - - data("DMrand", package = "Epi") - - DMrand <- Epi::Lexis( - entry = list(fot = 0, per = dodm, age = dodm-dobth), - exit = list(per = dox), - entry.status = 0L, - exit.status = as.integer(!is.na(dodth)), - data = DMrand[DMrand$dox>DMrand$dodm, ] - ) - - data("thoro", package = "Epi") - - thoro <- Epi::Lexis( - entry = list(fot = 0L, per = injecdat, age = injecdat-birthdat), - exit = list(per = exitdat), - entry.status = 0L, - exit.status = as.integer(exitstat == 1), - data = thoro[thoro$injecdat < thoro$exitdat, ] - ) - - sire <- setDT(copy(popEpi::sire)) - sire[, "dg_yrs" := get.yrs(dg_date, "actual")] - sire[, "ex_yrs" := get.yrs(ex_date, "actual")] - sire[, "bi_yrs" := get.yrs(bi_date, "actual")] - sire[, "id":= 1:.N] - sire <- Lexis( - data=sire[dg_date < ex_date], - entry=list(fot=0, per=dg_yrs, age=dg_age), - exit=list(per=ex_yrs), - merge=TRUE, - exit.status = status, - entry.status = 0L - ) - - - sibr <- setDT(copy(popEpi::sibr)) - sibr[, "dg_yrs" := get.yrs(dg_date, "actual")] - sibr[, "ex_yrs" := get.yrs(ex_date, "actual")] - sibr[, "bi_yrs" := get.yrs(bi_date, "actual")] - sibr[, "id":= 1:.N] - sibr <- Lexis( - data=sibr[dg_date < ex_date], - entry=list(fot=0, per=dg_yrs, age=dg_age), - exit=list(per=ex_yrs), - merge=TRUE, - exit.status = status, - entry.status = 0L - ) - - lex_list <- list( - occup = occup, - DMlate = DMlate, - DMrand = DMrand, - thoro = thoro, - sire = sire, - sibr = sibr - ) - lex_list[] <- lapply(lex_list, function(df) { - df$lex.id <- sample(df$lex.id, nrow(df), replace = FALSE) - df - }) - - n_random_splits <- 500 - invisible(lapply(names(lex_list), function(lex_nm) { - lapply(1:n_random_splits, function(i) { - used_seed <- get_random_seed() - set.seed(used_seed) - l <- random_splitting_on(lex = lex_list[[lex_nm]], n.max.breaks = 50) - # list contents in order: Epi::splitLexis, splitLexisDT, splitMulti - - lapply(2:length(l), function(list_pos) { - - tt_msg <- paste0( - "Epi::splitLexis and ", c("splitLexisDT", "splitMulti")[list_pos-1], - " are in agreement in data '", lex_nm, "' using seed ", used_seed - ) - - test_that(tt_msg, { - expect_equal(l[[1]], l[[list_pos]], check.attributes = FALSE) - }) - - }) - - test_that( - paste0( - "splitLexisDT and splitMulti are in agreement in data '", lex_nm, - "' using seed ", used_seed - ), - expect_equal(l[[2]], l[[3]], check.attributes = TRUE) - ) - - }) - })) - - - -}) - - - - - - - - - +context("Splitting tests on fixed data and random breaks") + + + + + +test_that("splitting funs congruent with random splitting and fixed data", { + + skip_usually() + + library("Epi") + library("data.table") + + data("occup", package = "Epi") + + occup <- Epi::Lexis( + entry = list(age = AoE, per = DoE), + exit = list(per = DoX), + entry.status = 0L, + exit.status = as.integer(Xst == "D"), + data = occup + ) + + + data("DMlate", package = "Epi") + + DMlate <- Epi::Lexis( + entry = list(fot = 0, per = dodm, age = dodm-dobth), + exit = list(per = dox), + entry.status = 0L, + exit.status = as.integer(!is.na(dodth)), + data = DMlate[DMlate$dox>DMlate$dodm, ] + ) + + data("DMrand", package = "Epi") + + DMrand <- Epi::Lexis( + entry = list(fot = 0, per = dodm, age = dodm-dobth), + exit = list(per = dox), + entry.status = 0L, + exit.status = as.integer(!is.na(dodth)), + data = DMrand[DMrand$dox>DMrand$dodm, ] + ) + + data("thoro", package = "Epi") + + thoro <- Epi::Lexis( + entry = list(fot = 0L, per = injecdat, age = injecdat-birthdat), + exit = list(per = exitdat), + entry.status = 0L, + exit.status = as.integer(exitstat == 1), + data = thoro[thoro$injecdat < thoro$exitdat, ] + ) + + sire <- setDT(copy(popEpi::sire)) + sire[, "dg_yrs" := get.yrs(dg_date, "actual")] + sire[, "ex_yrs" := get.yrs(ex_date, "actual")] + sire[, "bi_yrs" := get.yrs(bi_date, "actual")] + sire[, "id":= 1:.N] + sire <- Lexis( + data=sire[dg_date < ex_date], + entry=list(fot=0, per=dg_yrs, age=dg_age), + exit=list(per=ex_yrs), + merge=TRUE, + exit.status = status, + entry.status = 0L + ) + + + sibr <- setDT(copy(popEpi::sibr)) + sibr[, "dg_yrs" := get.yrs(dg_date, "actual")] + sibr[, "ex_yrs" := get.yrs(ex_date, "actual")] + sibr[, "bi_yrs" := get.yrs(bi_date, "actual")] + sibr[, "id":= 1:.N] + sibr <- Lexis( + data=sibr[dg_date < ex_date], + entry=list(fot=0, per=dg_yrs, age=dg_age), + exit=list(per=ex_yrs), + merge=TRUE, + exit.status = status, + entry.status = 0L + ) + + lex_list <- list( + occup = occup, + DMlate = DMlate, + DMrand = DMrand, + thoro = thoro, + sire = sire, + sibr = sibr + ) + lex_list[] <- lapply(lex_list, function(df) { + df$lex.id <- sample(df$lex.id, nrow(df), replace = FALSE) + df + }) + + n_random_splits <- 500 + invisible(lapply(names(lex_list), function(lex_nm) { + lapply(1:n_random_splits, function(i) { + used_seed <- get_random_seed() + set.seed(used_seed) + l <- random_splitting_on(lex = lex_list[[lex_nm]], n.max.breaks = 50) + # list contents in order: Epi::splitLexis, splitLexisDT, splitMulti + + lapply(2:length(l), function(list_pos) { + + tt_msg <- paste0( + "Epi::splitLexis and ", c("splitLexisDT", "splitMulti")[list_pos-1], + " are in agreement in data '", lex_nm, "' using seed ", used_seed + ) + + test_that(tt_msg, { + expect_equal(l[[1]], l[[list_pos]], check.attributes = FALSE) + }) + + }) + + test_that( + paste0( + "splitLexisDT and splitMulti are in agreement in data '", lex_nm, + "' using seed ", used_seed + ), + expect_equal(l[[2]], l[[3]], check.attributes = TRUE) + ) + + }) + })) + + + +}) + + + + + + + + + diff --git a/tests/testthat/test_splitting_randomly_on_random_data.R b/tests/testthat/test_splitting_randomly_on_random_data.R index 700ee5e..db1ce58 100644 --- a/tests/testthat/test_splitting_randomly_on_random_data.R +++ b/tests/testthat/test_splitting_randomly_on_random_data.R @@ -1,59 +1,59 @@ -context("Splitting tests on random data and random breaks") - - - - - -test_that("splitting funs congruent with random splitting and random data", { - - skip_usually() - - library("Epi") - library("data.table") - - n_datasets <- 100 - n_random_splits <- 50 - lapply(seq_len(n_datasets), function(lex_nm) { - - data_seed <- get_random_seed() - lex <- random_Lexis( - n.rows = 100:2000, - n.time.scales = 1:10, - n.statuses = 2:10, - n.other.vars = 1 - ) - - lapply(seq_len(n_random_splits), function(i) { - split_seed <- get_random_seed() - set.seed(split_seed) - l <- random_splitting_on(lex = lex, n.max.breaks = 50) - # list contents in order: Epi::splitLexis, splitLexisDT, splitMulti - - lapply(2:length(l), function(list_pos) { - - tt_msg <- paste0( - "Epi::splitLexis and ", c("splitLexisDT", "splitMulti")[list_pos-1], - " are in agreement in data '", lex_nm, - "' using data seed ", data_seed, " and splitting seed ", split_seed - ) - - test_that(tt_msg, { - expect_equal(l[[1]], l[[list_pos]], check.attributes = FALSE) - }) - - }) - - test_that( - paste0( - "splitLexisDT and splitMulti are in agreement in data '", lex_nm, - "' using data seed ", data_seed, " and splitting seed ", split_seed - ), - expect_equal(l[[2]], l[[3]], check.attributes = TRUE) - ) - - }) - }) - - - -}) +context("Splitting tests on random data and random breaks") + + + + + +test_that("splitting funs congruent with random splitting and random data", { + + skip_usually() + + library("Epi") + library("data.table") + + n_datasets <- 100 + n_random_splits <- 50 + lapply(seq_len(n_datasets), function(lex_nm) { + + data_seed <- get_random_seed() + lex <- random_Lexis( + n.rows = 100:2000, + n.time.scales = 1:10, + n.statuses = 2:10, + n.other.vars = 1 + ) + + lapply(seq_len(n_random_splits), function(i) { + split_seed <- get_random_seed() + set.seed(split_seed) + l <- random_splitting_on(lex = lex, n.max.breaks = 50) + # list contents in order: Epi::splitLexis, splitLexisDT, splitMulti + + lapply(2:length(l), function(list_pos) { + + tt_msg <- paste0( + "Epi::splitLexis and ", c("splitLexisDT", "splitMulti")[list_pos-1], + " are in agreement in data '", lex_nm, + "' using data seed ", data_seed, " and splitting seed ", split_seed + ) + + test_that(tt_msg, { + expect_equal(l[[1]], l[[list_pos]], check.attributes = FALSE) + }) + + }) + + test_that( + paste0( + "splitLexisDT and splitMulti are in agreement in data '", lex_nm, + "' using data seed ", data_seed, " and splitting seed ", split_seed + ), + expect_equal(l[[2]], l[[3]], check.attributes = TRUE) + ) + + }) + }) + + + +}) diff --git a/tests/testthat/test_survmean.R b/tests/testthat/test_survmean.R index 5129787..da397d1 100644 --- a/tests/testthat/test_survmean.R +++ b/tests/testthat/test_survmean.R @@ -1,302 +1,302 @@ -context("mean survival testing") -library("data.table") -library("Epi") -library("survival") - -test_that("survmean() agrees with old results", { - skip_usually() - - sr <- data.table(popEpi::sire)[dg_date < ex_date, ] - sr$agegr <- cut(sr$dg_age, c(0,45,60,Inf), right=FALSE) - - x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), - exit = list(CAL = get.yrs(ex_date)), - data = sr, - exit.status = factor(status, levels = 0:2, - labels = c("alive", "canD", "othD")), - entry.status = factor(0, levels = 0:2, - labels = c("alive", "canD", "othD")), - merge = TRUE) - - ## observed survival - pm <- data.table(popEpi::popmort) - names(pm) <- c("sex", "CAL", "AGE", "haz") - sm <- survmean(Surv(time = FUT, event = lex.Xst != "alive") ~ agegr, - pophaz = pm, data = x, - breaks = list(FUT = seq(0, 10, 1/12)), - e1.breaks = list(FUT = c(seq(0, 10, 1/12), 11:100))) - - ## values to test against computed on 2016-03-04; - ## git ref: 5077677 - expect_equal(sm$est, c(33.951439, 21.611419, 7.604318), tol = 0.005, scale = 1) - expect_equal(sm$exp, c(45.25686, 31.22712, 13.06725), tol = 0.005, scale = 1) - - -}) - - - -# test_that("survmean() agrees with results computed using pkg survival", { -# skip_on_cran() -# -# -# BL <- list(fot= seq(0,15,1/24)) -# eBL <- list(fot = unique(c(BL$fot, seq(15, 115,0.5)))) -# -# sire2 <- data.table(popEpi::sire)[dg_date= "1998-01-01" & dg_date < "2003-01-01", - pophaz = pm, data = x, - r = 1, - breaks = BL, - e1.breaks = eBL) - - - BL <- list(FUT = seq(0, 5, 1/12), CAL = c(1998,2003)) - eBL <- list(FUT = c(BL$FUT, seq(max(BL$FUT), 10, 1/12), seq(10,110,1/2))) - eBL$FUT <- sort(unique(eBL$FUT)) - smp <- survmean(Surv(time = FUT, event = lex.Xst != "alive") ~ 1, - pophaz = pm, data = x, - breaks = BL, - r = 1, - e1.breaks = eBL) - - - - expect_equal(sm$obs, smp$obs) - expect_equal(sm$exp, smp$exp) - expect_equal(smp$est, 10.01542, tol = 0.0005, scale = 1) - expect_equal(sm$est, 10.0216, tol = 0.0005, scale = 1) - -}) - - - -test_that("Dates and frac. yrs produce congruent results", { - skip_on_cran() - - x <- data.table(popEpi::sire) - x <- x[dg_date= "1998-01-01" & dg_date < "2003-01-01", + pophaz = pm, data = x, + r = 1, + breaks = BL, + e1.breaks = eBL) + + + BL <- list(FUT = seq(0, 5, 1/12), CAL = c(1998,2003)) + eBL <- list(FUT = c(BL$FUT, seq(max(BL$FUT), 10, 1/12), seq(10,110,1/2))) + eBL$FUT <- sort(unique(eBL$FUT)) + smp <- survmean(Surv(time = FUT, event = lex.Xst != "alive") ~ 1, + pophaz = pm, data = x, + breaks = BL, + r = 1, + e1.breaks = eBL) + + + + expect_equal(sm$obs, smp$obs) + expect_equal(sm$exp, smp$exp) + expect_equal(smp$est, 10.01542, tol = 0.0005, scale = 1) + expect_equal(sm$est, 10.0216, tol = 0.0005, scale = 1) + +}) + + + +test_that("Dates and frac. yrs produce congruent results", { + skip_on_cran() + + x <- data.table(popEpi::sire) + x <- x[dg_date 8.49), - data = x, surv.type="surv.obs") - }, regexp = NA) - - ## INTENTION: 7.5+ intervals empty for one age group. - ## this should make adjusted estimates missing altogether for 7.5+. - expect_message({ - st2 <- survtab(Surv(fot, lex.Xst) ~ adjust(agegr), - data = x, surv.type="surv.obs", - subset = !(agegr == 1L & fot > 8.49), - weights = list(agegr = c(0.33, 0.33, 0.33))) - }, regexp = NA) - setDT(st1) - setDT(st2) - - - expect_equal(st1[agegr==3 & Tstop>8.5, .N] , 18L) - expect_equal(st1[agegr==1 & Tstop>8.5, .N] , 0L) - expect_equal(st2[Tstop > 8.5, .N] , 0L) -}) - -## non-consecutively bad surv.ints --------------------------------------------- - -test_that("survtab_ag messages & results due to non-consecutively bad surv.ints are OK", { - ## non-consecutively bad surv.ints (missing years 5-6) - sire2 <- sire[dg_date < ex_date, ] - sire2[, agegr := cut(dg_age, c(0,45,60,Inf), right=FALSE, labels=FALSE)] - sire2 <- sire2[!(dg_age > 60 & as.integer(as.integer(ex_date-dg_date)/365.25) %in% 5:6)] - BL <- list(fot= seq(0,10,1/12), per=c(2008,2013)) - x <- lexpand(sire2, birth = bi_date, entry = dg_date, exit = ex_date, - status = status %in% 1:2, - breaks=BL) - tf1 <- quote( - st1 <- survtab(Surv(fot, lex.Xst)~1, data = x, surv.type="surv.obs", - subset=!(fot >= 5 & fot < 7)) - ) - - tf2 <- quote( - st2 <- survtab(Surv(fot, lex.Xst)~adjust(agegr), data = x, surv.type="surv.obs", - subset=!(agegr==3 & fot >= 5 & fot < 7), - weights = list(agegr = c(0.33, 0.33, 0.33))) - ) - - ## NOTE: \\ needed before "(" or ")" - msgs <- c(paste0("The total person-time was zero in some survival ", - "intervals summed to the margins \\(over any stratifying ", - "/ adjusting variables\\) _non-consecutively_, i.e. some ", - "intervals after an empty interval had person-time in ", - "them. Keeping all survival intervals with some estimates ", - "as NA for inspection."), - "Some cumulative surv.obs were zero or NA:") - expect_message(eval(tf1), msgs[1],ignore.case=TRUE) - expect_message(eval(tf1), msgs[2],ignore.case=TRUE) - - setDT(st1) - - expect_equal(st1[is.na(surv.obs), .N], 60L) - - msgs <- c(paste0("The total person-time was zero in some survival ", - "intervals, when summed to the variable\\(s\\) ", - "'agegr' \\(i.e. over all other variables, if any", - "\\) _non-consecutively_, i.e. some intervals after ", - "an empty interval had person-time in them. ", - "Keeping all survival intervals with some ", - "estimates as NA for inspection."), - "Some cumulative surv.obs were zero or NA:") - - expect_message(eval(tf2), msgs[1]) - expect_message(eval(tf2), msgs[2]) - - setDT(st2) - expect_equal(st2[is.na(surv.obs.as), .N], 60L) -}) - - - - - +context("Testing empty survival intervals in survtab") + +test_that("removing consecutively bad surv.ints is logical w/ & w/out adjusting", { + + sire2 <- sire[dg_date < ex_date, ] + sire2[, agegr := cut(dg_age, c(0,45,60,Inf), right=FALSE, labels=FALSE)] + + BL <- list(fot= seq(0,10,1/12), per=c(2008,2013)) + + x <- lexpand(sire2, birth = bi_date, entry = dg_date, exit = ex_date, + status = status %in% 1:2, breaks=BL) + setDT(x) + + setattr(x, "class", c("Lexis", "data.table", "data.frame")) + ## NOTE: neither should give any messages! + expect_message({ + st1 <- survtab(Surv(fot, lex.Xst) ~ agegr, + subset = !(agegr == 1L & fot > 8.49), + data = x, surv.type="surv.obs") + }, regexp = NA) + + ## INTENTION: 7.5+ intervals empty for one age group. + ## this should make adjusted estimates missing altogether for 7.5+. + expect_message({ + st2 <- survtab(Surv(fot, lex.Xst) ~ adjust(agegr), + data = x, surv.type="surv.obs", + subset = !(agegr == 1L & fot > 8.49), + weights = list(agegr = c(0.33, 0.33, 0.33))) + }, regexp = NA) + setDT(st1) + setDT(st2) + + + expect_equal(st1[agegr==3 & Tstop>8.5, .N] , 18L) + expect_equal(st1[agegr==1 & Tstop>8.5, .N] , 0L) + expect_equal(st2[Tstop > 8.5, .N] , 0L) +}) + +## non-consecutively bad surv.ints --------------------------------------------- + +test_that("survtab_ag messages & results due to non-consecutively bad surv.ints are OK", { + ## non-consecutively bad surv.ints (missing years 5-6) + sire2 <- sire[dg_date < ex_date, ] + sire2[, agegr := cut(dg_age, c(0,45,60,Inf), right=FALSE, labels=FALSE)] + sire2 <- sire2[!(dg_age > 60 & as.integer(as.integer(ex_date-dg_date)/365.25) %in% 5:6)] + BL <- list(fot= seq(0,10,1/12), per=c(2008,2013)) + x <- lexpand(sire2, birth = bi_date, entry = dg_date, exit = ex_date, + status = status %in% 1:2, + breaks=BL) + tf1 <- quote( + st1 <- survtab(Surv(fot, lex.Xst)~1, data = x, surv.type="surv.obs", + subset=!(fot >= 5 & fot < 7)) + ) + + tf2 <- quote( + st2 <- survtab(Surv(fot, lex.Xst)~adjust(agegr), data = x, surv.type="surv.obs", + subset=!(agegr==3 & fot >= 5 & fot < 7), + weights = list(agegr = c(0.33, 0.33, 0.33))) + ) + + ## NOTE: \\ needed before "(" or ")" + msgs <- c(paste0("The total person-time was zero in some survival ", + "intervals summed to the margins \\(over any stratifying ", + "/ adjusting variables\\) _non-consecutively_, i.e. some ", + "intervals after an empty interval had person-time in ", + "them. Keeping all survival intervals with some estimates ", + "as NA for inspection."), + "Some cumulative surv.obs were zero or NA:") + expect_message(eval(tf1), msgs[1],ignore.case=TRUE) + expect_message(eval(tf1), msgs[2],ignore.case=TRUE) + + setDT(st1) + + expect_equal(st1[is.na(surv.obs), .N], 60L) + + msgs <- c(paste0("The total person-time was zero in some survival ", + "intervals, when summed to the variable\\(s\\) ", + "'agegr' \\(i.e. over all other variables, if any", + "\\) _non-consecutively_, i.e. some intervals after ", + "an empty interval had person-time in them. ", + "Keeping all survival intervals with some ", + "estimates as NA for inspection."), + "Some cumulative surv.obs were zero or NA:") + + expect_message(eval(tf2), msgs[1]) + expect_message(eval(tf2), msgs[2]) + + setDT(st2) + expect_equal(st2[is.na(surv.obs.as), .N], 60L) +}) + + + + + diff --git a/tests/testthat/test_survtab_observed.R b/tests/testthat/test_survtab_observed.R index d9d00e4..fad7380 100644 --- a/tests/testthat/test_survtab_observed.R +++ b/tests/testthat/test_survtab_observed.R @@ -1,133 +1,142 @@ -context("CIF's & surv.obs's congruence & comparison w/ survival::survfit") - -test_that("surv.obs about the same as Kaplan-Meier & CIFs close to Aalen-Johansen", { - library(survival) - BL <- list(fot= seq(0,19,1/12), per=c(2008,2013)) - sire2 <- sire[dg_date 1", { - data(sire) - set.seed(1) - sire <- sire[sample(1:.N, 100)] - - BL <- list(fot=0:5) - x <- lexpand(sire, - birth = bi_date, entry = dg_date, exit = ex_date, - status = status, - breaks = BL, - pophaz = popmort, - aggre = list(fot)) - - st <- survtab_ag(fot ~ 1, data = x, - surv.method = "lifetable", - n.cens = c("from0to0", "from0to2"), d = "from0to1") - st <- survtab_ag(fot ~ 1, data = x, - surv.method = "lifetable", - relsurv.method = "pp", - n.cens = c("from0to0", "from0to2"), - d = "from0to1", - d.pp = "from0to1.pp", - d.pp.2 = "from0to1.pp.2", - n.pp = "at.risk.pp", - n.cens.pp = c("from0to0.pp", "from0to2.pp")) - st <- survtab_ag(fot ~ 1, data = x, - surv.method = "lifetable", - relsurv.method = "pp", - d = c("from0to0", "from0to2"), - n.cens = "from0to1", - n.cens.pp = "from0to1.pp", - n.pp = "at.risk.pp", - d.pp = c("from0to0.pp", "from0to2.pp"), - d.pp.2 = c("from0to0.pp.2", "from0to2.pp.2")) -}) - - - - - -test_that("update() works with survtab objects", { - data(sire) - set.seed(1) - sire <- sire[sample(1:.N, 100)] - - BL <- list(fot=seq(0, 5, by = 1/12), - per = c("2008-01-01", "2013-01-01")) - x <- lexpand(sire, - birth = bi_date, entry = dg_date, exit = ex_date, - status = status %in% 1:2, - breaks = BL, - pophaz = popmort, - aggre = list(sex, fot)) - - - st <- survtab_ag(fot ~ 1, data = x) - sts <- survtab_ag(fot ~ sex, data = x) - - expect_equal(sts, update(st, formula. = fot ~ sex)) - - - library(Epi) - library(survival) - x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), - exit = list(CAL = get.yrs(ex_date)), - data = sire[sire$dg_date < sire$ex_date, ], - entry.status = 0L, - exit.status = as.integer(status %in% 1:2), - merge = TRUE) - - set.seed(1L) - x$group <- rbinom(nrow(x), 1, 0.5) - - st <- survtab(FUT ~ group, data = x, - surv.type = "surv.obs", - breaks = list(FUT = seq(0, 5, 1/12))) - - sts <- survtab(FUT ~ 1, data = x, - surv.type = "surv.obs", - breaks = list(FUT = seq(0, 5, 1/12))) - - expect_equal(sts, update(st, . ~ -group)) - -}) - - - - - -test_that("internal weights work as intended", { - library("data.table") - data("sire") - sire$agegr <- cut(sire$dg_age,c(0,45,55,65,75,Inf),right=F) - BL <- list(fot=seq(0, 5, by = 1/12), - per = c("2008-01-01", "2013-01-01")) - x <- lexpand(sire, birth = bi_date, entry = dg_date, exit = ex_date, - status = status %in% 1:2, - breaks = BL, - pophaz = popmort, - aggre = list(fot,agegr)) - - ## age standardisation using internal weights (age distribution of - ## patients diagnosed within the period window) - w <- x[fot == 0, .(weights = sum(at.risk)), keyby = agegr] - - st <- survtab_ag(fot ~ adjust(agegr), data = x, weights=w) - - st2 <- survtab_ag(fot ~ adjust(agegr), data = x, weights = "internal") - - expect_equal(st$surv.obs.as.lo, st2$surv.obs.as.lo) - -}) - - - - - -test_that("survtab_ag works with bare data.frames", { - - data(sire) - - BL <- list(fot = 0:5, - per = c("2008-01-01", "2013-01-01")) - x <- lexpand(sire, birth = bi_date, entry = dg_date, exit = ex_date, - status = status %in% 1:2, - breaks = BL, - aggre = list(fot)) - - e <- quote(survtab_ag(fot ~ 1, data = x, surv.type = "surv.obs")) - eb <- quote(survtab_ag(fot ~ 1, data = x, surv.type = "surv.obs", - surv.breaks = 0:5)) - - la <- list(eval(e), eval(eb)) - expect_equal(la[[1]]$surv.obs.hi, la[[2]]$surv.obs.hi) - - - x <- data.frame(x) - er <- paste0("Data did not contain breaks and no breaks ", - "were supplied by hand.") - expect_error(eval(e), regexp = er) - expect_equal(eval(eb)$surv.obs.hi, la[[2]]$surv.obs.hi) - -}) - - - - - - -test_that("confidence intervals are as intended", { - skip_on_cran() - - library(Epi) - library(survival) - - ## NOTE: recommended to use factor status variable - x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), - exit = list(CAL = get.yrs(ex_date)), - data = popEpi::sire[sire$dg_date < sire$ex_date, ], - entry.status = 0L, - exit.status = as.integer(status %in% 1:2), - merge = TRUE) - - ## phony group variable - set.seed(1L) - x$group <- rbinom(nrow(x), 1, 0.5) - - ## log-log transformation - st <- survtab(FUT ~ group, data = x, - surv.type = "surv.obs", - breaks = list(FUT = seq(0, 5, 1/12)), - conf.type = "log-log", conf.level = 0.99) - - dt <- data.table(st) - dt[, "SE.A" := sqrt(SE.surv.obs^2*(1/(surv.obs*log(surv.obs)))^2)] - dt[, "s.lo" := surv.obs^exp(qnorm(0.995)*SE.A)] - dt[, "s.hi" := surv.obs^exp(qnorm(0.005)*SE.A)] - - expect_equal(dt[, .(lo = surv.obs.lo, hi = surv.obs.hi)], - dt[, .(lo = s.lo, hi = s.hi)]) - - ## log transformation - st <- survtab(FUT ~ group, data = x, - surv.type = "surv.obs", - breaks = list(FUT = seq(0, 5, 1/12)), - conf.type = "log", conf.level = 0.80) - - dt <- data.table(st) - dt[, "SE.A" := SE.surv.obs/surv.obs] - dt[, "s.lo" := surv.obs*exp(qnorm(0.10)*SE.A)] - dt[, "s.hi" := surv.obs*exp(qnorm(0.90)*SE.A)] - - expect_equal(dt[, .(lo = surv.obs.lo, hi = surv.obs.hi)], - dt[, .(lo = s.lo, hi = s.hi)]) - - -}) - - - - - - +context("survtab usage") + + + +test_that("Dates and frac. yrs produce congruent results", { + skip_on_cran() + library(Epi) + library(survival) + + x <- data.table(popEpi::sire) + x <- x[dg_date 1", { + data(sire) + set.seed(1) + sire <- sire[sample(1:.N, 100)] + + BL <- list(fot=0:5) + x <- lexpand(sire, + birth = bi_date, entry = dg_date, exit = ex_date, + status = status, + breaks = BL, + pophaz = popmort, + aggre = list(fot)) + + st <- survtab_ag(fot ~ 1, data = x, + surv.method = "lifetable", + n.cens = c("from0to0", "from0to2"), d = "from0to1") + st <- survtab_ag(fot ~ 1, data = x, + surv.method = "lifetable", + relsurv.method = "pp", + n.cens = c("from0to0", "from0to2"), + d = "from0to1", + d.pp = "from0to1.pp", + d.pp.2 = "from0to1.pp.2", + n.pp = "at.risk.pp", + n.cens.pp = c("from0to0.pp", "from0to2.pp")) + st <- survtab_ag(fot ~ 1, data = x, + surv.method = "lifetable", + relsurv.method = "pp", + d = c("from0to0", "from0to2"), + n.cens = "from0to1", + n.cens.pp = "from0to1.pp", + n.pp = "at.risk.pp", + d.pp = c("from0to0.pp", "from0to2.pp"), + d.pp.2 = c("from0to0.pp.2", "from0to2.pp.2")) +}) + + + + + +test_that("update() works with survtab objects", { + data(sire) + set.seed(1) + sire <- sire[sample(1:.N, 100)] + + BL <- list(fot=seq(0, 5, by = 1/12), + per = c("2008-01-01", "2013-01-01")) + x <- lexpand(sire, + birth = bi_date, entry = dg_date, exit = ex_date, + status = status %in% 1:2, + breaks = BL, + pophaz = popmort, + aggre = list(sex, fot)) + + + st <- survtab_ag(fot ~ 1, data = x) + sts <- survtab_ag(fot ~ sex, data = x) + + expect_equal(sts, update(st, formula. = fot ~ sex)) + + + library(Epi) + library(survival) + x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), + exit = list(CAL = get.yrs(ex_date)), + data = sire[sire$dg_date < sire$ex_date, ], + entry.status = 0L, + exit.status = as.integer(status %in% 1:2), + merge = TRUE) + + set.seed(1L) + x$group <- rbinom(nrow(x), 1, 0.5) + + st <- survtab(FUT ~ group, data = x, + surv.type = "surv.obs", + breaks = list(FUT = seq(0, 5, 1/12))) + + sts <- survtab(FUT ~ 1, data = x, + surv.type = "surv.obs", + breaks = list(FUT = seq(0, 5, 1/12))) + + expect_equal(sts, update(st, . ~ -group)) + +}) + + + + + +test_that("internal weights work as intended", { + library("data.table") + data("sire") + sire$agegr <- cut(sire$dg_age,c(0,45,55,65,75,Inf),right=F) + BL <- list(fot=seq(0, 5, by = 1/12), + per = c("2008-01-01", "2013-01-01")) + x <- lexpand(sire, birth = bi_date, entry = dg_date, exit = ex_date, + status = status %in% 1:2, + breaks = BL, + pophaz = popmort, + aggre = list(fot,agegr)) + + ## age standardisation using internal weights (age distribution of + ## patients diagnosed within the period window) + w <- x[fot == 0, .(weights = sum(at.risk)), keyby = agegr] + + st <- survtab_ag(fot ~ adjust(agegr), data = x, weights=w) + + st2 <- survtab_ag(fot ~ adjust(agegr), data = x, weights = "internal") + + expect_equal(st$surv.obs.as.lo, st2$surv.obs.as.lo) + +}) + + + + + +test_that("survtab_ag works with bare data.frames", { + + data(sire) + + BL <- list(fot = 0:5, + per = c("2008-01-01", "2013-01-01")) + x <- lexpand(sire, birth = bi_date, entry = dg_date, exit = ex_date, + status = status %in% 1:2, + breaks = BL, + aggre = list(fot)) + + e <- quote(survtab_ag(fot ~ 1, data = x, surv.type = "surv.obs")) + eb <- quote(survtab_ag(fot ~ 1, data = x, surv.type = "surv.obs", + surv.breaks = 0:5)) + + la <- list(eval(e), eval(eb)) + expect_equal(la[[1]]$surv.obs.hi, la[[2]]$surv.obs.hi) + + + x <- data.frame(x) + er <- paste0("Data did not contain breaks and no breaks ", + "were supplied by hand.") + expect_error(eval(e), regexp = er) + expect_equal(eval(eb)$surv.obs.hi, la[[2]]$surv.obs.hi) + +}) + + + + + + +test_that("confidence intervals are as intended", { + skip_on_cran() + + library(Epi) + library(survival) + + ## NOTE: recommended to use factor status variable + x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), + exit = list(CAL = get.yrs(ex_date)), + data = popEpi::sire[sire$dg_date < sire$ex_date, ], + entry.status = 0L, + exit.status = as.integer(status %in% 1:2), + merge = TRUE) + + ## phony group variable + set.seed(1L) + x$group <- rbinom(nrow(x), 1, 0.5) + + ## log-log transformation + st <- survtab(FUT ~ group, data = x, + surv.type = "surv.obs", + breaks = list(FUT = seq(0, 5, 1/12)), + conf.type = "log-log", conf.level = 0.99) + + dt <- data.table(st) + dt[, "SE.A" := sqrt(SE.surv.obs^2*(1/(surv.obs*log(surv.obs)))^2)] + dt[, "s.lo" := surv.obs^exp(qnorm(0.995)*SE.A)] + dt[, "s.hi" := surv.obs^exp(qnorm(0.005)*SE.A)] + + expect_equal(dt[, .(lo = surv.obs.lo, hi = surv.obs.hi)], + dt[, .(lo = s.lo, hi = s.hi)]) + + ## log transformation + st <- survtab(FUT ~ group, data = x, + surv.type = "surv.obs", + breaks = list(FUT = seq(0, 5, 1/12)), + conf.type = "log", conf.level = 0.80) + + dt <- data.table(st) + dt[, "SE.A" := SE.surv.obs/surv.obs] + dt[, "s.lo" := surv.obs*exp(qnorm(0.10)*SE.A)] + dt[, "s.hi" := surv.obs*exp(qnorm(0.90)*SE.A)] + + expect_equal(dt[, .(lo = surv.obs.lo, hi = surv.obs.hi)], + dt[, .(lo = s.lo, hi = s.hi)]) + + +}) + + + + + + diff --git a/tests/testthat/test_utils.R b/tests/testthat/test_utils.R index 8ee9167..ca0acf2 100644 --- a/tests/testthat/test_utils.R +++ b/tests/testthat/test_utils.R @@ -1,417 +1,417 @@ -context("utility functions") - -test_that("subsetting in ltable works and ltable has no side effects", { - skip_on_cran() - - sr <- popEpi::sire[1:100, ] - set.seed(1L) - sr[, sex := rbinom(.N, 1, prob = 0.5)] - sr[c(1, 50), sex := NA] - - setkeyv(sr, "bi_date") - old_sr <- copy(sr) - - lt1 <- ltable(sr, by = "sex", subset = sex == 0, na.rm = TRUE) - lt2 <- ltable(sr, by = "sex", subset = sex == 1, na.rm = TRUE) - lt3 <- ltable(sr, by = "sex", na.rm = TRUE) - - expect_equal(lt3$obs, c(lt1[1, ]$obs, lt2[2, ]$obs)) - expect_true(all.equal(sr, old_sr)) - -}) - - - -test_that("ltable works with NA values", { - - skip_on_cran() - - sr <- setDT(popEpi::sire[1:100, ]) - set.seed(1L) - sr[, sex := rbinom(.N, 1, prob = 0.5)] - sr[c(1, 50), sex := NA] - - lt1 <- ltable(sr, by = "sex", na.rm = FALSE) - lt2 <- ltable(sr, by = "sex", na.rm = TRUE) - - expect_equal(lt1[!is.na(sex),], lt2) - -}) - - - -test_that("evalPopArg produces intended results",{ - set.seed(1L) - dt <- data.table(a = rbinom(10, 100, 0.25), b = 1:2, c = 1:5) - - tf <- function(x=dt, arg) { - - as <- substitute(arg) - byTab <- evalPopArg(x, arg = as, enclos = parent.frame(1L)) - - x[, list(sum = sum(a)), by = byTab] - - } - - ## symbol - t1 <- tf(arg=b) - - ## name string - t2 <- tf(arg="b") - - expect_equal(t1$sum, c(127, 131)) - expect_equal(t1, t2) - - ## list of symbols / expressions - t3 <- tf(arg=list(b, c)) - - ## name strings - t4 <- tf(arg=c("b", "c")) - - ## object containing name strings - byVars <- c("b", "c") - t5 <- tf(arg=byVars) - - expect_equal(t4$sum, c(22,24,26,31,21, 31,32,27,26,18)) - expect_equal(t4, t3) - expect_equal(t4, t5) - - ## list of symbols / expressions - t6 <- tf(arg=list(var1 = b,c, cut(c,3))) - expect_equal(names(t6), c("var1", "c", "cut", "sum")) - - - ## NULL object - byVars <- NULL - t7 <- tf(arg=byVars) - t8 <- tf(arg=NULL) - expect_equal(t7, t8) - - ## a list of predetermined values - byList <- as.list(dt[, list(b, var1 = c)]) - t9 <- tf(arg=byList) - - ## list without any names - byList <- list(dt$b, dt$c) - t10<- tf(arg=byList) - - ## partially named list - byList <- list(var1 = dt$b, dt$c) - t11<- tf(arg=byList) - - expect_equal(t9$sum, t10$sum) - expect_equal(t10$sum, t11$sum) - expect_equal(names(t11), c("var1", "BV2", "sum")) - - - t12 <- tf(arg=list(V0=dt$b, dt$c)) - byList <- list(V0 = dt$b, dt$c) - t13 <- tf(arg=byList) - expect_equal(t12, t13) - - ## pre-substituted list - bl <- substitute(byList) - t14 <- tf(arg = bl) - expect_equal(t12, t14) - - ## pre-substituted vector of names - nv <- c("a", "b") - nvs <- substitute(nv) - t15a <- tf(arg = nv) - t15b <- tf(arg = nvs) - expect_equal(t15a, t15b) - - ## nested functions - tf2 <- function(a, x = dt) { - tf(x = x, arg = a) - } - - nv <- c("a", "b") - nvs <- substitute(nv) - t15a <- tf2(a = nv) - t15b <- tf2(a = nvs) - expect_equal(t15a, t15b) -}) - - -test_that("cutLowMerge merges succesfully what is intended", { - skip_on_cran() - all_names_present(popEpi::popmort, c("sex", "year", "agegroup", "haz")) - all_names_present(popEpi::sire, c("sex", "bi_date", "dg_date", "ex_date", "status")) - - pm <- copy(popEpi::popmort) - pm[, haz := rbinom(.N, 100, 0.5)/1e5L] - - sr <- popEpi::sire[1:100,] - setDT(sr) - sr1 <- lexpand(sr, birth = bi_date, entry = dg_date, exit = ex_date, - status = status, fot = seq(0, 5, 1/12)) - sr1 <- data.table(sr1) - setattr(sr1, "class", c("Lexis", "data.table", "data.frame")) - - sr1[, year := per + 0.5*lex.dur] - sr1[, agegroup := age + 0.5*lex.dur] - - sr2 <- cutLowMerge(sr1, pm, - by.x = c("sex", "per", "age"), - by.y = c("sex", "year", "agegroup"), - all.x = TRUE, all.y = FALSE, old.nums = TRUE) - - sr3 <- copy(sr2) - sr3[, haz := NULL] - - sr4 <- lexpand(sr, birth = bi_date, entry = dg_date, exit = ex_date, - status = status, fot = seq(0, 5, 1/12), pophaz = pm, pp = FALSE) - expect_equal(sr1, sr3, check.attributes = FALSE) - expect_equal(sr2$haz*1e5L, sr4$pop.haz*1e5L, check.attributes = FALSE) - - sr1[, year := popEpi:::cutLow(year, breaks = sort(unique(pm$year)))] - sr1[, agegroup := popEpi:::cutLow(agegroup, breaks = sort(unique(pm$agegroup)))] - - sr5 <- merge(sr1, pm, by = c("sex", "year", "agegroup")) - setDT(sr5) - setkey(sr5, lex.id, fot) - - expect_equal(sr4$haz*1e5L, sr5$pop.haz*1e5L, check.attributes = FALSE) -}) - -test_that("detectEvents works as intended", { - skip_on_cran() - x <- sire[dg_date - %\VignetteEngine{knitr::rmarkdown} - %\VignetteIndexEntry{Standardised incidence and mortality ratios} - %\usepackage[utf8]{inputenc} ---- - -```{r, echo=TRUE, warning=FALSE, message=FALSE} -library(popEpi) -library(Epi) -library(splines) -``` - -# Introduction - -Standardized incidence ratio (SIR) or mortality ratio (SMR) is a ratio of observed and expected cases. Observed cases is the absolute number of cases in the cohort. The expected cases are derived by multiplying the cohort person-years with reference populations rate. The rate should be stratified or adjusted by confounding factors. Usually these are age group, gender, calendar period and possibly a cancer type or other confounding variable. Also a social economic status or area variable can be used. - -In reference population the expected rate in strata $j$ is $\lambda_j = d_j$ / $n_j$, where $d_j$ is observed cases and $n_j$ is observed person years. Now the SIR can be written as a ratio -$$ -SIR = \frac{ \sum d_j }{\sum n_j \lambda_j} = \frac{D}{E} -$$ -where $D$ is the observed cases in cohort population and $E$ is the expected number. Univariate confidence intervals are based on exact values of Poisson distribution and the formula for p-value is -$$ -\chi^2 = \frac{ (|O - E| -0.5)^2 }{E}. -$$ -Modelled SIR is a Poisson regression model with log-link and cohorts person-years as a offset. - -The homogenity of SIR's can be tested using a likelihood ratio test in Poisson modelled SIRs. - -The same workflow applies for standardised mortality ratios. - -# Splines - -A continuous spline function can be fitted for time variables, e.g. age-group. Idea of the splines is to smooth the SMR estimates and do inference from the curve figure. This requires pre-defined knots/nodes that are used to fit the spline curve. Selecting the number of knots and knot places is a very subjective matter and there are three options to pass spline knots to function. - -It's good practice to try between different knot settings for realistic spline estimates. Overfitting might cause unintentioal artefacts in the estimate and underfitting might smooth away interesting patterns. - -The spline variable should be as continuous as possible, say from 18 to 100 time points. But when splitting time in too narrow intervals, random variation might occur in the expected or population rate values. Therefore it's also possible to do two variables for age or period: first with wider intervals for standardation and second with narrow intervals for the spline. - -## Knots - -There are three options to for assigning knots to the spline: - -1. A vector of numbers of knots for each spline variable. Number of knots includes the boundary knots, so that the minumum number of knots is 2, which is a log linear assosiation. The knots are placed automatically using the quantiles of observed cases. - -2. A list of vectors of predefined knot places. Number of vectors needs to match the length of spline variables. And each vector has to have at least the minimum and maximum for boundary knots. - -3. NULL will automatically finds the optimal number of knots based on AIC. Knots are placed according the quantiles of observed cases. This is usually a good place to start the fitting process. - -Number of knots and knot places are always found in output. - -# SMR - -## Mortality: External cohort and popmort data - -Estimate SMR of a simulated cohort of Finnish female rectal cancer patients, `sire`. -Death rates for each age, period and sex is available in `popmort` dataset. - -For more information about the dataset see `help(popmort)` and `help(sire)`. - -```{r} -data(sire) -data(popmort) -c <- lexpand( sire, status = status, birth = bi_date, exit = ex_date, entry = dg_date, - breaks = list(per = 1950:2013, age = 1:100, fot = c(0,10,20,Inf)), - aggre = list(fot, agegroup = age, year = per, sex) ) - -se <- sir( coh.data = c, coh.obs = 'from0to2', coh.pyrs = 'pyrs', - ref.data = popmort, ref.rate = 'haz', - adjust = c('agegroup','year','sex'), print ='fot') -se -``` - -SMR's for other causes is 1 for both follow-up intervals. Also the p-value suggest that there is no heterogenity between SMR estimates (p=0.735). - - -The total mortality can be estimated by modifying the `status` argument. Now we want to account all deaths, i.e. status is 1 or 2. - -```{r} -c <- lexpand( sire, status = status %in% 1:2, birth = bi_date, exit = ex_date, entry = dg_date, - breaks = list(per = 1950:2013, age = 1:100, fot = c(0,10,20,Inf)), - aggre = list(fot, agegroup = age, year = per, sex) ) - -se <- sir( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs', - ref.data = popmort, ref.rate = 'haz', - adjust = c('agegroup','year','sex'), print ='fot') -se -``` - -Now the estimates for follow-up intervals seems to differ significantly, p = 0. Plotting SMR (S3-method for `sir`-object) is easily done using default plot-function. - -```{r, fig.height=3, fig.width=6} -plot(se, col = 2:3) -title('SMR for follow-up categories') -``` - - -## splines - - -Lets fit splines for the follow-up time and agegroup using two different options: the splines are fitted in different model and in same model, `dependent.splines`. - -```{r, fig.height=5, fig.width=6} -c <- lexpand( sire, status = status %in% 1:2, birth = bi_date, exit = ex_date, entry = dg_date, - breaks = list(per = 1950:2013, age = 1:100, fot = 0:50), - aggre = list(fot, agegroup = age, year = per, sex) ) - -sf <- sirspline( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs', - ref.data = popmort, ref.rate = 'haz', - adjust = c('agegroup','year','sex'), - spline = c('agegroup','fot'), dependent.splines=FALSE) - -st <- sirspline( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs', - ref.data = popmort, ref.rate = 'haz', - adjust = c('agegroup','year','sex'), - spline = c('agegroup','fot'), dependent.splines = TRUE) - -plot(sf, col=2, log=TRUE) -title('Splines fitted in different models') - -plot(st, col=4, log=TRUE) -title('Splines are dependent') -``` - -In dependent spline the `fot` is the ratio with zero time as reference point. Reference points can be alterned. Here agegroup profile is assumed to be same for every follow-up time. SMR is 0.2 times from 0 to 10 years of follow-up. - - -Splines can also be stratified using the `print` argument. For example we split the death time in two time periods and test if the agegroup splines are equal. - -```{r, results='hide', fig.height=5, fig.width=6} -c$year.cat <- ifelse(c$year < 2002, 1, 2) -sy <- sirspline( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs', - ref.data = popmort, ref.rate = 'haz', - adjust = c('agegroup','year','sex'), - spline = c('agegroup'), print = 'year.cat') -plot(sy, log=TRUE) -legend('topright', c('before 2002','after 2002'), lty=1, col=c(1,2)) -``` - -For category before 2002 the SMR seems to be higher after the age of 50. Also the p-value (<0.0001) indicates that there is a difference in age group trends before and after year 2002. P-value is a likelihood ratio test that compares models where splines are fitted together and separately. - -```{r} -print(sy) -``` - - - - - - - +--- +title: "SMR Vignette" +author: "Matti Rantanen" +date: "`r Sys.Date()`" +output: + html_document: + fig_caption: yes + toc: true + toc_depth: 2 +vignette: > + %\VignetteEngine{knitr::rmarkdown} + %\VignetteIndexEntry{Standardised incidence and mortality ratios} + %\usepackage[utf8]{inputenc} +--- + +```{r, echo=TRUE, warning=FALSE, message=FALSE} +library(popEpi) +library(Epi) +library(splines) +``` + +# Introduction + +Standardized incidence ratio (SIR) or mortality ratio (SMR) is a ratio of observed and expected cases. Observed cases is the absolute number of cases in the cohort. The expected cases are derived by multiplying the cohort person-years with reference populations rate. The rate should be stratified or adjusted by confounding factors. Usually these are age group, gender, calendar period and possibly a cancer type or other confounding variable. Also a social economic status or area variable can be used. + +In reference population the expected rate in strata $j$ is $\lambda_j = d_j$ / $n_j$, where $d_j$ is observed cases and $n_j$ is observed person years. Now the SIR can be written as a ratio +$$ +SIR = \frac{ \sum d_j }{\sum n_j \lambda_j} = \frac{D}{E} +$$ +where $D$ is the observed cases in cohort population and $E$ is the expected number. Univariate confidence intervals are based on exact values of Poisson distribution and the formula for p-value is +$$ +\chi^2 = \frac{ (|O - E| -0.5)^2 }{E}. +$$ +Modelled SIR is a Poisson regression model with log-link and cohorts person-years as a offset. + +The homogenity of SIR's can be tested using a likelihood ratio test in Poisson modelled SIRs. + +The same workflow applies for standardised mortality ratios. + +# Splines + +A continuous spline function can be fitted for time variables, e.g. age-group. Idea of the splines is to smooth the SMR estimates and do inference from the curve figure. This requires pre-defined knots/nodes that are used to fit the spline curve. Selecting the number of knots and knot places is a very subjective matter and there are three options to pass spline knots to function. + +It's good practice to try between different knot settings for realistic spline estimates. Overfitting might cause unintentioal artefacts in the estimate and underfitting might smooth away interesting patterns. + +The spline variable should be as continuous as possible, say from 18 to 100 time points. But when splitting time in too narrow intervals, random variation might occur in the expected or population rate values. Therefore it's also possible to do two variables for age or period: first with wider intervals for standardation and second with narrow intervals for the spline. + +## Knots + +There are three options to for assigning knots to the spline: + +1. A vector of numbers of knots for each spline variable. Number of knots includes the boundary knots, so that the minumum number of knots is 2, which is a log linear assosiation. The knots are placed automatically using the quantiles of observed cases. + +2. A list of vectors of predefined knot places. Number of vectors needs to match the length of spline variables. And each vector has to have at least the minimum and maximum for boundary knots. + +3. NULL will automatically finds the optimal number of knots based on AIC. Knots are placed according the quantiles of observed cases. This is usually a good place to start the fitting process. + +Number of knots and knot places are always found in output. + +# SMR + +## Mortality: External cohort and popmort data + +Estimate SMR of a simulated cohort of Finnish female rectal cancer patients, `sire`. +Death rates for each age, period and sex is available in `popmort` dataset. + +For more information about the dataset see `help(popmort)` and `help(sire)`. + +```{r} +data(sire) +data(popmort) +c <- lexpand( sire, status = status, birth = bi_date, exit = ex_date, entry = dg_date, + breaks = list(per = 1950:2013, age = 1:100, fot = c(0,10,20,Inf)), + aggre = list(fot, agegroup = age, year = per, sex) ) + +se <- sir( coh.data = c, coh.obs = 'from0to2', coh.pyrs = 'pyrs', + ref.data = popmort, ref.rate = 'haz', + adjust = c('agegroup','year','sex'), print ='fot') +se +``` + +SMR's for other causes is 1 for both follow-up intervals. Also the p-value suggest that there is no heterogenity between SMR estimates (p=0.735). + + +The total mortality can be estimated by modifying the `status` argument. Now we want to account all deaths, i.e. status is 1 or 2. + +```{r} +c <- lexpand( sire, status = status %in% 1:2, birth = bi_date, exit = ex_date, entry = dg_date, + breaks = list(per = 1950:2013, age = 1:100, fot = c(0,10,20,Inf)), + aggre = list(fot, agegroup = age, year = per, sex) ) + +se <- sir( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs', + ref.data = popmort, ref.rate = 'haz', + adjust = c('agegroup','year','sex'), print ='fot') +se +``` + +Now the estimates for follow-up intervals seems to differ significantly, p = 0. Plotting SMR (S3-method for `sir`-object) is easily done using default plot-function. + +```{r, fig.height=3, fig.width=6} +plot(se, col = 2:3) +title('SMR for follow-up categories') +``` + + +## splines + + +Lets fit splines for the follow-up time and agegroup using two different options: the splines are fitted in different model and in same model, `dependent.splines`. + +```{r, fig.height=5, fig.width=6} +c <- lexpand( sire, status = status %in% 1:2, birth = bi_date, exit = ex_date, entry = dg_date, + breaks = list(per = 1950:2013, age = 1:100, fot = 0:50), + aggre = list(fot, agegroup = age, year = per, sex) ) + +sf <- sirspline( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs', + ref.data = popmort, ref.rate = 'haz', + adjust = c('agegroup','year','sex'), + spline = c('agegroup','fot'), dependent.splines=FALSE) + +st <- sirspline( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs', + ref.data = popmort, ref.rate = 'haz', + adjust = c('agegroup','year','sex'), + spline = c('agegroup','fot'), dependent.splines = TRUE) + +plot(sf, col=2, log=TRUE) +title('Splines fitted in different models') + +plot(st, col=4, log=TRUE) +title('Splines are dependent') +``` + +In dependent spline the `fot` is the ratio with zero time as reference point. Reference points can be alterned. Here agegroup profile is assumed to be same for every follow-up time. SMR is 0.2 times from 0 to 10 years of follow-up. + + +Splines can also be stratified using the `print` argument. For example we split the death time in two time periods and test if the agegroup splines are equal. + +```{r, results='hide', fig.height=5, fig.width=6} +c$year.cat <- ifelse(c$year < 2002, 1, 2) +sy <- sirspline( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs', + ref.data = popmort, ref.rate = 'haz', + adjust = c('agegroup','year','sex'), + spline = c('agegroup'), print = 'year.cat') +plot(sy, log=TRUE) +legend('topright', c('before 2002','after 2002'), lty=1, col=c(1,2)) +``` + +For category before 2002 the SMR seems to be higher after the age of 50. Also the p-value (<0.0001) indicates that there is a difference in age group trends before and after year 2002. P-value is a likelihood ratio test that compares models where splines are fitted together and separately. + +```{r} +print(sy) +``` + + + + + + + diff --git a/vignettes/survtab_examples.Rmd b/vignettes/survtab_examples.Rmd index 66c1851..0db0570 100644 --- a/vignettes/survtab_examples.Rmd +++ b/vignettes/survtab_examples.Rmd @@ -1,296 +1,296 @@ ---- -title: "Examples of using survtab" -author: "Joonas Miettinen" -date: "`r Sys.Date()`" -output: - html_document: - toc: true - toc_depth: 2 - fig_width: 6 - fig_height: 6 -vignette: > - %\VignetteEngine{knitr::rmarkdown} - %\VignetteIndexEntry{survtab examples} - %\usepackage[utf8]{inputenc} ---- - -# Overview - -This vignette aims to clarify the usage of the `survtab_ag` and `survtab` functions included in this package. `survtab_ag` estimates various survival functions and cumulative incidence functions (CIFs) non-parametrically using aggregated data, and `survtab` is a wrapper for `survtab_ag`, to which `Lexis` data is supplied. - -Two methods (`surv.method`) are currently supported: The `"lifetable"` (actuarial) method only makes use of counts when estimating any of the supported survival time functions. The default method (`"hazard"`}) estimates appropriate hazards and transforms them into survival function or CIF estimates. - -For relative survival estimation we need also to enumerate the expected hazard levels for the subjects in the data. This is done by merging expected hazards to individuals' subintervals (which divide their survival time lines to a number of small intervals). For Pohar-Perme-weighted analyses one must additionally compute various weighted figures at the level of split subject data. - -If one has subject-level data, the simplest way of computing survival function estimates with `popEpi` is by defining a `Lexis` object and using `survtab`, which will do the rest. For pre-aggregated data one may use the `survtab_ag` function instead. One can also use the `lexpand` function to split, merge population hazards, and aggregate in a single function call and then use `survtab_ag` if that is convenient. - -# Using `survtab` - -It is straightforward to estimate various survival time functions with `survtab` once a `Lexis` object has been defined (see `?Lexis` in package `Epi` for details): - -```{r pkgs, eval = TRUE, echo = TRUE, message = FALSE} -library(popEpi) -library(Epi) -library(survival) -``` - -```{r} -data(sire) - -## NOTE: recommended to use factor status variable -x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), - exit = list(CAL = get.yrs(ex_date)), - data = sire[sire$dg_date < sire$ex_date, ], - exit.status = factor(status, levels = 0:2, - labels = c("alive", "canD", "othD")), - merge = TRUE) - -## pretend some are male -set.seed(1L) -x$sex <- rbinom(nrow(x), 1, 0.5) - -## observed survival - explicit method -st <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, - surv.type = "surv.obs", - breaks = list(FUT = seq(0, 5, 1/12))) - -## observed survival - easy method (assumes lex.Xst in x is the status variable) -st <- survtab(FUT ~ sex, data = x, - surv.type = "surv.obs", - breaks = list(FUT = seq(0, 5, 1/12))) - -## printing gives the used settings and -## estimates at the middle and end of the estimated -## curves; more information available using summary() -st - -``` - -Plotting by strata (men = blue, women = red): - -```{r} -plot(st, col = c("blue", "red")) -``` - -Note that the correct usage of the `formula` argument in `survtab` specifies the time scale in the `Lexis` object over which survival is computed (here `"FUT"` for follow-up time). This is used to identify the appropriate time scale in the data. When only supplying the survival time scale as the right-hand-side of the formula, the column `lex.Xst` in the supplied `Lexis` object is assumed to be the (correctly formatted!) status variable. When using `Surv()` to be explicit, we effectively (and exceptionally) pass the starting times to the `time` argument in `Surv()`, and `time2` is ignored entirely. The function will fail if `time` does not match exactly with a time scale in data. - -When using `Surv()`, one must also pass the status variable, which can be something other than the `lex.Xst` variable created by `Lexis()`, though usually ``lex.Xst` is what you want to use (especially if the data has already been split using e.g. `splitLexis` or `splitMulti`, which is allowed). It is recommended to use a factor status variable to pass to `Surv()`, though a numeric variable will work in simple cases (0 = alive, 1 = dead; also `FALSE` = alive, `TRUE` = dead). Using `Surv()` also allows easy passing of transformations of `lex.Xst`, e.g. `Surv(FUT, lex.Xst %in% 1:2)`. - -The argument `breaks` must be a named list of breaks by which to split the `Lexis` data (see `?splitMulti`). It is mandatory to assign breaks at least to the survival time scale (`"FUT"` in our example) so that `survtab` knows what intervals to use to estimate the requested survival time function(s). The breaks also determine the window used: It is therefore easy to compute so called period estimates by defining the roof and floor along the calendar time scale, e.g. - -`breaks = list(FUT = seq(0, 5, 1/12), CAL = c(2000, 2005))` - -would cause `survtab` to compute period estimates for 2000-2004 (breaks given here as fractional years, so 2005 is effectively 2004.99999...). - -## Relative/net survival - -Relative/net survival estimation requires knowledge of the expected hazard levels for the individuals in the data. In `survtab` this is accomplished by passing a long-formt `data.frame` of population hazards via the `pophaz` argument. E.g. the `popmort` dataset included in `popEpi` (Finnish overall mortality rates for men and women). - -```{r popmort} -data(popmort) -pm <- data.frame(popmort) -names(pm) <- c("sex", "CAL", "AGE", "haz") -head(pm) -``` - -The `data.frame` should contain a variable named `"haz"` indicating the population hazard at the level of one subject-year. Any other variables are considered to be variables, by which to merge population hazards to the (split) subject-level data within `survtab`. These merging variables may correspond to the time scales in the used `Lexis` object. This allows for e.g. merging in different population hazards for the same subject as they get older. - -The following causes `survtab` to estimate EdererII relative survival: - -```{r survtab_e2} -st.e2 <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, - surv.type = "surv.rel", relsurv.method = "e2", - breaks = list(FUT = seq(0, 5, 1/12)), - pophaz = pm) -``` - -```{r} -plot(st.e2, y = "r.e2", col = c("blue", "red")) -``` - -Note that the curves diverge due to merging in the "wrong" population hazards for some individuals which we randomized earlier to be male though all the individuals in data are actually female. Pohar-Perme-weighted estimates can be computed by - -```{r survtab_pp} -st.pp <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, - surv.type = "surv.rel", relsurv.method = "pp", - breaks = list(FUT = seq(0, 5, 1/12)), - pophaz = pm) -``` - -Compare with EdererII estimates: - -```{r} -plot(st.e2, y = "r.e2", col = c("blue", "red"), lty = 1) -lines(st.pp, y = "r.pp", col = c("blue", "red"), lty = 2) -``` - -## Adjusting estimates - -`survtab` also allows for adjusting the survival curves by categorical variables --- typically by age groups. The following demonstrates how: - -```{r survtab_adjust} -## an age group variable -x$agegr <- cut(x$dg_age, c(0, 60, 70, 80, Inf), right = FALSE) - -## using "internal weights" - see ?ICSS for international weights standards -w <- table(x$agegr) -w - -w <- list(agegr = as.numeric(w)) -``` - -```{r survtab_adjust_2} -st.as <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex + adjust(agegr), - data = x, weights = w, - surv.type = "surv.rel", relsurv.method = "e2", - breaks = list(FUT = seq(0, 5, 1/12)), - pophaz = pm) -``` - -```{r} -plot(st.as, y = "r.e2.as", col = c("blue", "red")) -``` - -We now have age-adjusted EdererII relative/net survival estimates. The `weights` argument allows for either a list of weigths (with one or multiple variables to adjust by) or a `data.frame` of weights. Examples: - -```{r weights_examples, eval = TRUE} -list(sex = c(0.4, 0.6), agegr = c(0.2, 0.2, 0.4, 0.2)) - -wdf <- merge(0:1, 1:4) -names(wdf) <- c("sex", "agegr") -wdf$weights <- c(0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.1, 0.1) -wdf -``` - -The weights do not have to sum to one when supplied as they are internally forced to do so within each stratum. In the `data.frame` of weights, the column of actual weights to use must be named "weights". When there are more than one variable to adjust by, and a list of weights has been supplied, the variable-specific weights are first multiplied together (cumulatively) and then scaled to sum to one. - -This adjusting can be done to any survival time function that `survtab` (and `survtab_ag`) estimates. One can also supply adjusting variables via the `adjust` argument if convenient: - -```{r survtab_adjust_3} -st.as <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, - adjust = "agegr", - data = x, weights = w, - surv.type = "surv.rel", relsurv.method = "e2", - breaks = list(FUT = seq(0, 5, 1/12)), - pophaz = pm) -``` - -Where `adjust` could also be `adjust = agegr`, `adjust = list(agegr)` or - -`adjust = list(agegr = cut(dg_age, c(0, 60, 70, 80, Inf), right = FALSE))` - -for exactly the same results. When adjusting by multiple variables, one must supply a vector of variable names in data or a list of multiple elements (as in the base function `aggregate`). - -## Other survival time functions - -One can also estimate cause-specific survival functions, cumulative incidence functions (CIFs, a.k.a. crude risk a.k.a. absolute risk functions), and CIFs based on the excess numbers of events. Cause-specific survival is close to net survival as they are philosophically highly similar concepts: - -```{r survtab_cause} -st.ca <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, - data = x, - surv.type = "surv.cause", - breaks = list(FUT = seq(0, 5, 1/12))) - -st.pp <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, data = x, - surv.type = "surv.rel", relsurv.method = "pp", - breaks = list(FUT = seq(0, 5, 1/12)), - pophaz = pm) - -plot(st.ca, y = "surv.obs.canD", col = "blue") -lines(st.pp, y = "r.pp", col = "red") -``` - -Absolute risk: - -```{r survtab_cif} -st.cif <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, - data = x, - surv.type = "cif.obs", - breaks = list(FUT = seq(0, 5, 1/12))) - -plot(st.cif, y = "CIF_canD", conf.int = FALSE) -lines(st.cif, y = "CIF_othD", conf.int = FALSE, col = "red") -``` - -The "relative CIF" attempts to be close to the true CIF without using knowledge about the types of events, e.g. causes of death: - -```{r survtab_relcif} -st.cir <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, - data = x, - surv.type = "cif.rel", - breaks = list(FUT = seq(0, 5, 1/12)), - pophaz = pm) -plot(st.cif, y = "CIF_canD", conf.int = FALSE, col = "blue") -lines(st.cir, y = "CIF.rel", conf.int = FALSE, col = "red") -``` - - -# Using `survtab_ag` - -Arguments concerning the types and methods of estimating of survival time functions work the same in `survtab_ag` as in `survtab` (the latter uses the former). However, with aggregated data one must explicitly supply the various count and person-time variables. Also, usage of the `formula` argument is different. - -For demonstration purposes we form an aggregated data set using `lexpand`; see `?lexpand` for more information on that function. - -```{r} -sire$sex <- rbinom(nrow(sire), size = 1, prob = 0.5) -ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", - status = "status", breaks = list(fot = seq(0, 5, 1/12)), - aggre = list(sex, fot)) -head(ag) -``` - -Now simply do: - -```{r survtab_ag_example1} -st <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.obs", - surv.method = "hazard", - d = c("from0to1", "from0to2"), pyrs = "pyrs") -``` - -Or: - -```{r survtab_ag_example2} -st <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.obs", - surv.method = "lifetable", - d = c("from0to1", "from0to2"), n = "at.risk", - n.cens = "from0to0") -``` - -Note that e.g. argument `d` could also have been supplied as - -`list(from0to1, from0to2)` - -or - -`list(canD = from0to1, othD = from0to2)` - -for identical results. The last is convenient for e.g. `surv.cause` computations: - -```{r survtab_ag_cause} -st.ca <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.cause", - surv.method = "hazard", - d = list(canD = from0to1, othD = from0to2), pyrs = "pyrs") -plot(st.ca, y = "surv.obs.canD", col = c("blue", "red")) -``` - -One has to supply the most variables when computing Pohar-Perme estimates (though it is probably rare to have third-source aggregated data with Pohar-Perme weighted figures, it is implemented here to be used as a workhorse for `survtab`). For this we must aggregate again to get the Pohar-Perme weighted counts and subject-times: - -```{r} -ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", - status = "status", breaks = list(fot = seq(0, 5, 1/12)), - pophaz = popmort, pp = TRUE, - aggre = list(sex, fot)) - -st.pp <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.rel", - surv.method = "hazard", relsurv.method = "pp", - d = list(from0to1 + from0to2), pyrs = "pyrs", - d.pp = list(from0to1.pp + from0to2.pp), - d.pp.2 = list(from0to1.pp.2 + from0to2.pp.2), - pyrs.pp = "ptime.pp", d.exp.pp = "d.exp.pp") -plot(st.pp, y = "r.pp", col = c("blue", "red")) -``` - -Here it is best to supply only one column to each argument since Pohar-Perme estimates will not be computed for several types of events at the same time. - - - +--- +title: "Examples of using survtab" +author: "Joonas Miettinen" +date: "`r Sys.Date()`" +output: + html_document: + toc: true + toc_depth: 2 + fig_width: 6 + fig_height: 6 +vignette: > + %\VignetteEngine{knitr::rmarkdown} + %\VignetteIndexEntry{survtab examples} + %\usepackage[utf8]{inputenc} +--- + +# Overview + +This vignette aims to clarify the usage of the `survtab_ag` and `survtab` functions included in this package. `survtab_ag` estimates various survival functions and cumulative incidence functions (CIFs) non-parametrically using aggregated data, and `survtab` is a wrapper for `survtab_ag`, to which `Lexis` data is supplied. + +Two methods (`surv.method`) are currently supported: The `"lifetable"` (actuarial) method only makes use of counts when estimating any of the supported survival time functions. The default method (`"hazard"`}) estimates appropriate hazards and transforms them into survival function or CIF estimates. + +For relative survival estimation we need also to enumerate the expected hazard levels for the subjects in the data. This is done by merging expected hazards to individuals' subintervals (which divide their survival time lines to a number of small intervals). For Pohar-Perme-weighted analyses one must additionally compute various weighted figures at the level of split subject data. + +If one has subject-level data, the simplest way of computing survival function estimates with `popEpi` is by defining a `Lexis` object and using `survtab`, which will do the rest. For pre-aggregated data one may use the `survtab_ag` function instead. One can also use the `lexpand` function to split, merge population hazards, and aggregate in a single function call and then use `survtab_ag` if that is convenient. + +# Using `survtab` + +It is straightforward to estimate various survival time functions with `survtab` once a `Lexis` object has been defined (see `?Lexis` in package `Epi` for details): + +```{r pkgs, eval = TRUE, echo = TRUE, message = FALSE} +library(popEpi) +library(Epi) +library(survival) +``` + +```{r} +data(sire) + +## NOTE: recommended to use factor status variable +x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), + exit = list(CAL = get.yrs(ex_date)), + data = sire[sire$dg_date < sire$ex_date, ], + exit.status = factor(status, levels = 0:2, + labels = c("alive", "canD", "othD")), + merge = TRUE) + +## pretend some are male +set.seed(1L) +x$sex <- rbinom(nrow(x), 1, 0.5) + +## observed survival - explicit method +st <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, + surv.type = "surv.obs", + breaks = list(FUT = seq(0, 5, 1/12))) + +## observed survival - easy method (assumes lex.Xst in x is the status variable) +st <- survtab(FUT ~ sex, data = x, + surv.type = "surv.obs", + breaks = list(FUT = seq(0, 5, 1/12))) + +## printing gives the used settings and +## estimates at the middle and end of the estimated +## curves; more information available using summary() +st + +``` + +Plotting by strata (men = blue, women = red): + +```{r} +plot(st, col = c("blue", "red")) +``` + +Note that the correct usage of the `formula` argument in `survtab` specifies the time scale in the `Lexis` object over which survival is computed (here `"FUT"` for follow-up time). This is used to identify the appropriate time scale in the data. When only supplying the survival time scale as the right-hand-side of the formula, the column `lex.Xst` in the supplied `Lexis` object is assumed to be the (correctly formatted!) status variable. When using `Surv()` to be explicit, we effectively (and exceptionally) pass the starting times to the `time` argument in `Surv()`, and `time2` is ignored entirely. The function will fail if `time` does not match exactly with a time scale in data. + +When using `Surv()`, one must also pass the status variable, which can be something other than the `lex.Xst` variable created by `Lexis()`, though usually ``lex.Xst` is what you want to use (especially if the data has already been split using e.g. `splitLexis` or `splitMulti`, which is allowed). It is recommended to use a factor status variable to pass to `Surv()`, though a numeric variable will work in simple cases (0 = alive, 1 = dead; also `FALSE` = alive, `TRUE` = dead). Using `Surv()` also allows easy passing of transformations of `lex.Xst`, e.g. `Surv(FUT, lex.Xst %in% 1:2)`. + +The argument `breaks` must be a named list of breaks by which to split the `Lexis` data (see `?splitMulti`). It is mandatory to assign breaks at least to the survival time scale (`"FUT"` in our example) so that `survtab` knows what intervals to use to estimate the requested survival time function(s). The breaks also determine the window used: It is therefore easy to compute so called period estimates by defining the roof and floor along the calendar time scale, e.g. + +`breaks = list(FUT = seq(0, 5, 1/12), CAL = c(2000, 2005))` + +would cause `survtab` to compute period estimates for 2000-2004 (breaks given here as fractional years, so 2005 is effectively 2004.99999...). + +## Relative/net survival + +Relative/net survival estimation requires knowledge of the expected hazard levels for the individuals in the data. In `survtab` this is accomplished by passing a long-formt `data.frame` of population hazards via the `pophaz` argument. E.g. the `popmort` dataset included in `popEpi` (Finnish overall mortality rates for men and women). + +```{r popmort} +data(popmort) +pm <- data.frame(popmort) +names(pm) <- c("sex", "CAL", "AGE", "haz") +head(pm) +``` + +The `data.frame` should contain a variable named `"haz"` indicating the population hazard at the level of one subject-year. Any other variables are considered to be variables, by which to merge population hazards to the (split) subject-level data within `survtab`. These merging variables may correspond to the time scales in the used `Lexis` object. This allows for e.g. merging in different population hazards for the same subject as they get older. + +The following causes `survtab` to estimate EdererII relative survival: + +```{r survtab_e2} +st.e2 <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, + surv.type = "surv.rel", relsurv.method = "e2", + breaks = list(FUT = seq(0, 5, 1/12)), + pophaz = pm) +``` + +```{r} +plot(st.e2, y = "r.e2", col = c("blue", "red")) +``` + +Note that the curves diverge due to merging in the "wrong" population hazards for some individuals which we randomized earlier to be male though all the individuals in data are actually female. Pohar-Perme-weighted estimates can be computed by + +```{r survtab_pp} +st.pp <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, data = x, + surv.type = "surv.rel", relsurv.method = "pp", + breaks = list(FUT = seq(0, 5, 1/12)), + pophaz = pm) +``` + +Compare with EdererII estimates: + +```{r} +plot(st.e2, y = "r.e2", col = c("blue", "red"), lty = 1) +lines(st.pp, y = "r.pp", col = c("blue", "red"), lty = 2) +``` + +## Adjusting estimates + +`survtab` also allows for adjusting the survival curves by categorical variables --- typically by age groups. The following demonstrates how: + +```{r survtab_adjust} +## an age group variable +x$agegr <- cut(x$dg_age, c(0, 60, 70, 80, Inf), right = FALSE) + +## using "internal weights" - see ?ICSS for international weights standards +w <- table(x$agegr) +w + +w <- list(agegr = as.numeric(w)) +``` + +```{r survtab_adjust_2} +st.as <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex + adjust(agegr), + data = x, weights = w, + surv.type = "surv.rel", relsurv.method = "e2", + breaks = list(FUT = seq(0, 5, 1/12)), + pophaz = pm) +``` + +```{r} +plot(st.as, y = "r.e2.as", col = c("blue", "red")) +``` + +We now have age-adjusted EdererII relative/net survival estimates. The `weights` argument allows for either a list of weigths (with one or multiple variables to adjust by) or a `data.frame` of weights. Examples: + +```{r weights_examples, eval = TRUE} +list(sex = c(0.4, 0.6), agegr = c(0.2, 0.2, 0.4, 0.2)) + +wdf <- merge(0:1, 1:4) +names(wdf) <- c("sex", "agegr") +wdf$weights <- c(0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.1, 0.1) +wdf +``` + +The weights do not have to sum to one when supplied as they are internally forced to do so within each stratum. In the `data.frame` of weights, the column of actual weights to use must be named "weights". When there are more than one variable to adjust by, and a list of weights has been supplied, the variable-specific weights are first multiplied together (cumulatively) and then scaled to sum to one. + +This adjusting can be done to any survival time function that `survtab` (and `survtab_ag`) estimates. One can also supply adjusting variables via the `adjust` argument if convenient: + +```{r survtab_adjust_3} +st.as <- survtab(Surv(time = FUT, event = lex.Xst) ~ sex, + adjust = "agegr", + data = x, weights = w, + surv.type = "surv.rel", relsurv.method = "e2", + breaks = list(FUT = seq(0, 5, 1/12)), + pophaz = pm) +``` + +Where `adjust` could also be `adjust = agegr`, `adjust = list(agegr)` or + +`adjust = list(agegr = cut(dg_age, c(0, 60, 70, 80, Inf), right = FALSE))` + +for exactly the same results. When adjusting by multiple variables, one must supply a vector of variable names in data or a list of multiple elements (as in the base function `aggregate`). + +## Other survival time functions + +One can also estimate cause-specific survival functions, cumulative incidence functions (CIFs, a.k.a. crude risk a.k.a. absolute risk functions), and CIFs based on the excess numbers of events. Cause-specific survival is close to net survival as they are philosophically highly similar concepts: + +```{r survtab_cause} +st.ca <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, + data = x, + surv.type = "surv.cause", + breaks = list(FUT = seq(0, 5, 1/12))) + +st.pp <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, data = x, + surv.type = "surv.rel", relsurv.method = "pp", + breaks = list(FUT = seq(0, 5, 1/12)), + pophaz = pm) + +plot(st.ca, y = "surv.obs.canD", col = "blue") +lines(st.pp, y = "r.pp", col = "red") +``` + +Absolute risk: + +```{r survtab_cif} +st.cif <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, + data = x, + surv.type = "cif.obs", + breaks = list(FUT = seq(0, 5, 1/12))) + +plot(st.cif, y = "CIF_canD", conf.int = FALSE) +lines(st.cif, y = "CIF_othD", conf.int = FALSE, col = "red") +``` + +The "relative CIF" attempts to be close to the true CIF without using knowledge about the types of events, e.g. causes of death: + +```{r survtab_relcif} +st.cir <- survtab(Surv(time = FUT, event = lex.Xst) ~ 1, + data = x, + surv.type = "cif.rel", + breaks = list(FUT = seq(0, 5, 1/12)), + pophaz = pm) +plot(st.cif, y = "CIF_canD", conf.int = FALSE, col = "blue") +lines(st.cir, y = "CIF.rel", conf.int = FALSE, col = "red") +``` + + +# Using `survtab_ag` + +Arguments concerning the types and methods of estimating of survival time functions work the same in `survtab_ag` as in `survtab` (the latter uses the former). However, with aggregated data one must explicitly supply the various count and person-time variables. Also, usage of the `formula` argument is different. + +For demonstration purposes we form an aggregated data set using `lexpand`; see `?lexpand` for more information on that function. + +```{r} +sire$sex <- rbinom(nrow(sire), size = 1, prob = 0.5) +ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", + status = "status", breaks = list(fot = seq(0, 5, 1/12)), + aggre = list(sex, fot)) +head(ag) +``` + +Now simply do: + +```{r survtab_ag_example1} +st <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.obs", + surv.method = "hazard", + d = c("from0to1", "from0to2"), pyrs = "pyrs") +``` + +Or: + +```{r survtab_ag_example2} +st <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.obs", + surv.method = "lifetable", + d = c("from0to1", "from0to2"), n = "at.risk", + n.cens = "from0to0") +``` + +Note that e.g. argument `d` could also have been supplied as + +`list(from0to1, from0to2)` + +or + +`list(canD = from0to1, othD = from0to2)` + +for identical results. The last is convenient for e.g. `surv.cause` computations: + +```{r survtab_ag_cause} +st.ca <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.cause", + surv.method = "hazard", + d = list(canD = from0to1, othD = from0to2), pyrs = "pyrs") +plot(st.ca, y = "surv.obs.canD", col = c("blue", "red")) +``` + +One has to supply the most variables when computing Pohar-Perme estimates (though it is probably rare to have third-source aggregated data with Pohar-Perme weighted figures, it is implemented here to be used as a workhorse for `survtab`). For this we must aggregate again to get the Pohar-Perme weighted counts and subject-times: + +```{r} +ag <- lexpand(sire, birth = "bi_date", entry = "dg_date", exit = "ex_date", + status = "status", breaks = list(fot = seq(0, 5, 1/12)), + pophaz = popmort, pp = TRUE, + aggre = list(sex, fot)) + +st.pp <- survtab_ag(fot ~ sex, data = ag, surv.type = "surv.rel", + surv.method = "hazard", relsurv.method = "pp", + d = list(from0to1 + from0to2), pyrs = "pyrs", + d.pp = list(from0to1.pp + from0to2.pp), + d.pp.2 = list(from0to1.pp.2 + from0to2.pp.2), + pyrs.pp = "ptime.pp", d.exp.pp = "d.exp.pp") +plot(st.pp, y = "r.pp", col = c("blue", "red")) +``` + +Here it is best to supply only one column to each argument since Pohar-Perme estimates will not be computed for several types of events at the same time. + + +