From 782acafdc0d10c0403e06d9319f8f1525f9f5858 Mon Sep 17 00:00:00 2001
From: Keith Goldfeld
Date: Thu, 23 Nov 2023 01:00:02 +0000
Subject: [PATCH] version 0.7.1

---
 DESCRIPTION                            |  14 +-
 MD5                                    | 143 +++---
 NAMESPACE                              |   2 +
 R/RcppExports.R                        |  12 +
 R/add_correlated_data.R                |  24 +-
 R/generate_correlated_data.R           |   2 +
 R/generate_dist.R                      |   1 +
 R/group_data.R                         |  10 +-
 R/internal_utility.R                   |  38 +-
 R/utility.R                            | 161 ++++++-
 build/vignette.rds                     | Bin 487 -> 527 bytes
 inst/doc/clustered.R                   |  13 +-
 inst/doc/clustered.Rmd                 |   4 +
 inst/doc/clustered.html                |  72 +--
 inst/doc/corelationmat.R               |  35 +-
 inst/doc/corelationmat.Rmd             |   3 +
 inst/doc/corelationmat.html            | 320 +++++++------
 inst/doc/correlated.R                  |  11 +-
 inst/doc/correlated.Rmd                |   4 +
 inst/doc/correlated.html               | 310 ++++++------
 inst/doc/double_dot_extension.R        |  13 +-
 inst/doc/double_dot_extension.Rmd      |   8 +-
 inst/doc/double_dot_extension.html     | 190 ++++----
 inst/doc/logisticCoefs.R               | 140 ++++++
 inst/doc/logisticCoefs.Rmd             | 248 ++++++++++
 inst/doc/logisticCoefs.html            | 625 ++++++++++++++++++++++++
 inst/doc/longitudinal.R                |  19 +-
 inst/doc/longitudinal.Rmd              |   4 +
 inst/doc/longitudinal.html             | 129 ++---
 inst/doc/missing.R                     |  17 +-
 inst/doc/missing.Rmd                   |   4 +
 inst/doc/missing.html                  | 168 +++----
 inst/doc/ordinal.R                     |   9 +-
 inst/doc/ordinal.Rmd                   |   4 +
 inst/doc/ordinal.html                  | 116 ++---
 inst/doc/simstudy.R                    |  11 +-
 inst/doc/simstudy.Rmd                  |   4 +
 inst/doc/simstudy.html                 | 170 +++----
 inst/doc/spline.R                      |  15 +-
 inst/doc/spline.Rmd                    |   4 +
 inst/doc/spline.html                   | 118 ++---
 inst/doc/survival.R                    |  39 +-
 inst/doc/survival.Rmd                  |   3 +
 inst/doc/survival.html                 | 628 ++++++++++++-------------
 inst/doc/treat_and_exposure.R          |  21 +-
 inst/doc/treat_and_exposure.Rmd        |   4 +
 inst/doc/treat_and_exposure.html       | 186 ++++----
 man/addCompRisk.Rd                     |   4 +-
 man/addCorGen.Rd                       |   6 +-
 man/genCorFlex.Rd                      |   2 +
 man/logisticCoefs.Rd                   |  86 ++++
 src/RcppExports.cpp                    |  43 ++
 src/srcRcpp.cpp                        | 170 ++++++-
 tests/testthat.R                       |   2 +-
 tests/testthat/test-add_data.R         |   9 +
 tests/testthat/test-conditions.R       |   2 +
 tests/testthat/test-define_data.R      | 130 ++---
 tests/testthat/test-glue.R             |   3 +
 tests/testthat/test-group_data.R       |   2 +
 tests/testthat/test-internal_utility.R |  11 +
 tests/testthat/test-missing_data.R     |   2 +-
 tests/testthat/test-survival.R         |   4 +
 tests/testthat/test-utility.R          | 111 +++++
 vignettes/clustered.Rmd                |   4 +
 vignettes/corelationmat.Rmd            |   3 +
 vignettes/correlated.Rmd               |   4 +
 vignettes/double_dot_extension.Rmd     |   8 +-
 vignettes/logisticCoefs.Rmd            | 248 ++++++++++
 vignettes/longitudinal.Rmd             |   4 +
 vignettes/missing.Rmd                  |   4 +
 vignettes/ordinal.Rmd                  |   4 +
 vignettes/simstudy.Rmd                 |   4 +
 vignettes/spline.Rmd                   |   4 +
 vignettes/survival.Rmd                 |   3 +
 vignettes/treat_and_exposure.Rmd       |   4 +
 75 files changed, 3498 insertions(+), 1459 deletions(-)
 create mode 100644 inst/doc/logisticCoefs.R
 create mode 100644 inst/doc/logisticCoefs.Rmd
 create mode 100644 inst/doc/logisticCoefs.html
 create mode 100644 man/logisticCoefs.Rd
 create mode 100644 vignettes/logisticCoefs.Rmd

diff --git a/DESCRIPTION b/DESCRIPTION
index 8dfb750..cfee215 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
 Type: Package
 Package: simstudy
 Title: Simulation of Study Data
-Version: 0.7.0
-Date: 2023-06-01
+Version: 0.7.1
+Date: 2023-11-22
 Authors@R: 
     c(person(given = "Keith",
              family = "Goldfeld",
@@ -27,19 +27,19 @@ URL: https://github.com/kgoldfeld/simstudy,
         https://kgoldfeld.github.io/simstudy/dev/
 BugReports: https://github.com/kgoldfeld/simstudy/issues
 Depends: R (>= 3.3.0)
-Imports: data.table, glue, methods, mvnfast, Rcpp, backports
+Imports: data.table, glue, methods, mvnfast,
Rcpp, backports, fastglm Suggests: covr, dplyr, formatR, gee, ggplot2, grid, gridExtra, hedgehog, knitr, magrittr, Matrix, mgcv, ordinal, pracma, rmarkdown, scales, splines, survival, testthat, gtsummary, - survminer, katex, dirmult -LinkingTo: Rcpp, pbv (>= 0.4-22) + survminer, katex, dirmult, rms +LinkingTo: Rcpp, pbv (>= 0.4-22), fastglm VignetteBuilder: knitr Encoding: UTF-8 RoxygenNote: 7.2.3 NeedsCompilation: yes -Packaged: 2023-06-01 13:27:29 UTC; keith +Packaged: 2023-11-23 00:44:30 UTC; keith Author: Keith Goldfeld [aut, cre] (), Jacob Wujciak-Jens [aut] () Maintainer: Keith Goldfeld Repository: CRAN -Date/Publication: 2023-06-01 13:40:02 UTC +Date/Publication: 2023-11-23 01:00:02 UTC diff --git a/MD5 b/MD5 index 20b6358..2803407 100644 --- a/MD5 +++ b/MD5 @@ -1,63 +1,66 @@ -5768254ba08b51701d69fdd2ec739442 *DESCRIPTION -7c818e3ed46458778f387d0021e37650 *NAMESPACE -d798fbbc376e4e00846bc064db2c89bd *R/RcppExports.R -fddd9b9ef73053798f26cbb437b98a97 *R/add_correlated_data.R +d33b2516838d83552207da013600848f *DESCRIPTION +0ee5ffbbc49469b7f8ae0afcd2f09929 *NAMESPACE +0793f5e5a9073a18b7977bfa0a788b22 *R/RcppExports.R +226bde196c0a71404f51de322fce1fc3 *R/add_correlated_data.R c38275463a38e976f007c33f5224ad75 *R/add_data.R 4f74c45c37321d387261f3a8ff915d12 *R/asserts.R 397712368768be6ac4559184845126ab *R/conditions.R c00827872741340bf5655a5c3f7f2ac7 *R/define_data.R -8332848d24bee84fe7dcb4d6fe7fa3a2 *R/generate_correlated_data.R +e71ec17618a8d6018dd15f667b8eb936 *R/generate_correlated_data.R cdef8dcc19c4f6e5c347ebbdd9c417cf *R/generate_data.R -9c7049e0dfd761edd3adfcbc8d5bf780 *R/generate_dist.R +126b186bd7939af458ce2b1e5ef017ac *R/generate_dist.R 20be8bf9cdf61bea6e5979cf49ed6b1d *R/glue.R -a820f49c023d7eab5d5613f5db143507 *R/group_data.R +5395e4742d44b7cd999046f2bc3c780e *R/group_data.R 9fd35a4187fadfc5db575f449d5f2872 *R/int_rmult.R -71d2aacb15727f69ae961c5eab154502 *R/internal_utility.R +6d8c27667aa28179728106fd3dcfe0ee *R/internal_utility.R 4ae9457df2c7c58c5b9923a8a9402841 *R/missing_data.R f2b3f8d6adddce59fdbb15409bcbda79 *R/simstudy-package.R -aea873b9d754c14b8bdd983afe320baf *R/utility.R +00f8db22cefd4ecad3a5e3befe263b9f *R/utility.R a1c1829b987daed614b734ae41c62bfe *R/zzz.R -dddf12dfa7e5faa1480b55fe04bfcef5 *build/vignette.rds +32b5ee05365ce8b6513b6a56a2dfbedf *build/vignette.rds c0752245cd4997d2b5ae10f945c9a7e0 *inst/CITATION -41e7ca8e8418a6daf4bc064d7feaedfa *inst/doc/clustered.R -8aaa3531fbd5ad77a75d78401cf466f6 *inst/doc/clustered.Rmd -05cf55122001c32c2c060677087a968c *inst/doc/clustered.html -703d7123ec12ed7fd30649d7c8832aaf *inst/doc/corelationmat.R -9a8dd81ce7dc9733cc4c595b483e7a05 *inst/doc/corelationmat.Rmd -c8a55ff4ea2b0acaf6ce4c3177335e36 *inst/doc/corelationmat.html -397bd3da4fd22f59d78440e07ffdaabd *inst/doc/correlated.R -adb2db9f7c4c38973f11cfac5765281e *inst/doc/correlated.Rmd -a51eb9f8453947d8da78c265f8800ab4 *inst/doc/correlated.html -28a70873f0a07d6369ea7f15c46ab411 *inst/doc/double_dot_extension.R -78e0fd72c2c05cfe43c18890b86e2486 *inst/doc/double_dot_extension.Rmd -b3b4d160cb3f99b05e2570ed7628f6db *inst/doc/double_dot_extension.html -eaa25902628ff3eae63e57e3b58c20b3 *inst/doc/longitudinal.R -2a041d7743427d8db336c55d199df886 *inst/doc/longitudinal.Rmd -b1744ecfd6611e5a8cf3af0603d0eca1 *inst/doc/longitudinal.html -511e07928b943bc9cca2e427c52d65ca *inst/doc/missing.R -1981dbfcbbe076ee038ad227c66b72e5 *inst/doc/missing.Rmd -5c3c5e5f7e72a0a71796299d8f217040 *inst/doc/missing.html -3fc54947ad99f028dd1339e2d99b43cd *inst/doc/ordinal.R -240fbaee7ade537a9cb378d6bc5fb193 
*inst/doc/ordinal.Rmd -1bc7f8d0771a9aaf6df3dc2c7eb3e20c *inst/doc/ordinal.html -cd6189f165b144b478ab380015f8015a *inst/doc/simstudy.R -e5c3cc948999d1a03e9c7e2cf4f9ae42 *inst/doc/simstudy.Rmd -c7575dfa0e8a2336c122ee87c85d5d50 *inst/doc/simstudy.html -7fb1f079865d9b05d14040222b4b38b0 *inst/doc/spline.R -aaed7cd27bc967a5d18cdc23cab161a7 *inst/doc/spline.Rmd -5d9bfc39388628cdb5d3b485b50097e1 *inst/doc/spline.html -7da5d217d06d02b559f5b0f9282315ff *inst/doc/survival.R -12a6af36a4d5bd088bce0adb8ec7c7a6 *inst/doc/survival.Rmd -7a01153a2357789923558c9f8f7356af *inst/doc/survival.html -d8bc90687992b28e6a3f85936e21ddbd *inst/doc/treat_and_exposure.R -3350c862513e10e2a084f76715f12458 *inst/doc/treat_and_exposure.Rmd -c50e2cfc364c5bfe1ddab2da9556ff3f *inst/doc/treat_and_exposure.html +82e6a66acf4d128f59f4dbca792b52f1 *inst/doc/clustered.R +a7c1690aa805bef67443f91ce6af7fda *inst/doc/clustered.Rmd +c89893cfdbb94c6d58a6536c634814c8 *inst/doc/clustered.html +1fda968770cd092b31509b55145574d8 *inst/doc/corelationmat.R +9f12fa63b29be1db9f01ef4cf6ee59d2 *inst/doc/corelationmat.Rmd +5327f208e1227155084a66f87bf8d1d0 *inst/doc/corelationmat.html +567dba017dfce3050a3d7f6a6877432e *inst/doc/correlated.R +d2fb54e88cc53d7de715012983360304 *inst/doc/correlated.Rmd +beffa3b0b5f539223422b988b842b089 *inst/doc/correlated.html +960f2c6356514aa4096bd3ad3bb31bcd *inst/doc/double_dot_extension.R +a347d8e7f4f6037a2731967f416ebc64 *inst/doc/double_dot_extension.Rmd +93d6c50f6595c95562632711b395350d *inst/doc/double_dot_extension.html +a3f34ff207784d9d494c550879ef9825 *inst/doc/logisticCoefs.R +38d645ae895e8895115675910688da14 *inst/doc/logisticCoefs.Rmd +da7681f46b930b2918294eed7b7c5779 *inst/doc/logisticCoefs.html +c0340225cb49ea81c0fe7a152ae78a7a *inst/doc/longitudinal.R +943b1d35dbf0a819b9d4aca8c717092d *inst/doc/longitudinal.Rmd +a54fd3f0dbd1cdb4c32157a482689919 *inst/doc/longitudinal.html +5f4a6d4c92b21babeabc5d2cab6e2392 *inst/doc/missing.R +d84699592547359f6c966a2f030c4df1 *inst/doc/missing.Rmd +fc000f70853ae7271c2eb0397ae63f4f *inst/doc/missing.html +7d34a631e09da3b73696de2688021be6 *inst/doc/ordinal.R +79959e2396041ba2f9867a6da0b55af8 *inst/doc/ordinal.Rmd +097613c4d2fa04fe6e1e75f75f0750bc *inst/doc/ordinal.html +9d868713bbfaca29c45bfb3f8237ee3c *inst/doc/simstudy.R +0abcfc017e37b948fc0487001b5324fa *inst/doc/simstudy.Rmd +0400abf1fce6d523cc21963b494821e0 *inst/doc/simstudy.html +8d9b5edf2e86df6b9e7db25e44210b42 *inst/doc/spline.R +dc65c760f7496457cb4b9df927126b33 *inst/doc/spline.Rmd +e7af52c2c99aa4fc7a4cb3c150a12fb5 *inst/doc/spline.html +66b21648ed0b826de41a51ac3e987c37 *inst/doc/survival.R +151bd73a42b2ed0d7983f065ad64be8b *inst/doc/survival.Rmd +d4ccec14bb5b4fbd64f1628fa60baf67 *inst/doc/survival.html +e802a42f93bcb39c55f42d21709a7de8 *inst/doc/treat_and_exposure.R +bec20fc20eabe441e985ce9e78ab1fea *inst/doc/treat_and_exposure.Rmd +f5fad5b3767b5b65d8cd2f8b059916cf *inst/doc/treat_and_exposure.html 82ad2ba791ac3f301d31f693e7f0b192 *man/addColumns.Rd -b8fe8dbda053bab4de25a7982c902db6 *man/addCompRisk.Rd +ce3708a1c4688e44e2b1c84830a1b0c5 *man/addCompRisk.Rd ccc90c9d8bbad64478fb1ed165a890cd *man/addCondition.Rd 9f9ef16e4d0e673db29fd70df0a82c3b *man/addCorData.Rd f5b591e50e3c548f2ded3580830d3070 *man/addCorFlex.Rd -aa4290bb92888309a37758dcf7c63838 *man/addCorGen.Rd +4fce1621a4406b13a3e8c39a8126b928 *man/addCorGen.Rd 8264df0e86ea5d2b4fc51482ee3c3dec *man/addMarkov.Rd a39d95e37be81ab69dd0fdf2eed8c8ee *man/addMultiFac.Rd 5e2bdf680d956c0d9b0de87200fc84fd *man/addPeriods.Rd @@ -82,7 +85,7 @@ 
abf59c096fcd80225c446d9031f37c23 *man/defSurv.Rd 3fc5f5253687495283bbdb5e43387689 *man/genCatFormula.Rd a99f72af23d75cb5c4e4984fa9f6767c *man/genCluster.Rd 829b432bb6e30659fc351a5aeb94a16e *man/genCorData.Rd -e191bc171f4cac1b245f1b549671d43a *man/genCorFlex.Rd +6aa96601e4e2f10e92f05bfb553cdac8 *man/genCorFlex.Rd e437092a733c25b18519aae9a198b633 *man/genCorGen.Rd 6963d47b0f5d988f5352a0d0aaf9d26f *man/genCorMat.Rd 47b0d9c2c412f1ec2a1caa0b95b5eaaa *man/genCorOrdCat.Rd @@ -101,6 +104,7 @@ d6081f1b0a6c9e0b086f074d1495db3f *man/genMultiFac.Rd 9d3e53edbdd8e964fe5fb5cad6d16dc7 *man/genSurv.Rd 35336e46708eeaf40cfc5c5e9cbf6b7c *man/genSynthetic.Rd 5bf623d6ef0d0b3904d9b4afab5d9b91 *man/iccRE.Rd +641309c64ac0c60337cb86ec8740f2f9 *man/logisticCoefs.Rd 87a1909ef3202d4e679b6143aec90c40 *man/mergeData.Rd ebc958b81f5f032812f073422b952523 *man/negbinomGetSizeProb.Rd 92f0ab22b64230d5a326e751e0ca0c3a *man/simstudy-deprecated.Rd @@ -115,34 +119,35 @@ d011ad2c7578351b236592eb647bf0cd *man/trtStepWedge.Rd 7877937fcd60b9a021b4a116221e6bbe *man/updateDefAdd.Rd 088304c650398c7673fe4ad5fa63c12a *man/viewBasis.Rd 72e81e22e6c3c420376fc8f3619cc128 *man/viewSplines.Rd -b0584b1a995935a0d9b834b9848975aa *src/RcppExports.cpp -fcc9c7eafae2c55b8544c6da3b51d1f7 *src/srcRcpp.cpp -04ea6e7e596e981740c9e106967f40b4 *tests/testthat.R +f776fc930b68e04afbbe837b7523c765 *src/RcppExports.cpp +2b9e60ba542d4e40f7dcf280344c9791 *src/srcRcpp.cpp +6cb666e94d3c78146d2b61404fda504e *tests/testthat.R 42739c520a2d491d5d91b487d1fa35d5 *tests/testthat/helper-gen_def.R 2992fb1dcf628cc0cc5c1d9674497b7f *tests/testthat/setup-general.R 00461059e1127afcaf8bd19bf37f6136 *tests/testthat/teardown-general.R dd9a5240a2a099acbecbb1f9392efa95 *tests/testthat/test-actual-distributions.R -affc6368c549185ca35ac26692eeb2be *tests/testthat/test-add_data.R +c803bdcc61840bf4f042ec99fa5cb3ab *tests/testthat/test-add_data.R 0bed67b2a92bb8eb11a968ff2dc0df44 *tests/testthat/test-asserts.R -cc919f0958707a177a364a3ce4e90dbc *tests/testthat/test-conditions.R -2c6b4d670b0944da6c5dd6af49bb6d19 *tests/testthat/test-define_data.R +f8275b9cf492aa91e5d9ed29f56d0c81 *tests/testthat/test-conditions.R +97c953eb7904798c2813461ddc292e34 *tests/testthat/test-define_data.R a32649391332d185fada6e0634d1ba9d *tests/testthat/test-generate_correlated_data.R 8bbf6af48678ae4a531bd89a950bc676 *tests/testthat/test-generate_data.R 5b830c66b2d245ae07865ddacdca13fc *tests/testthat/test-generate_dist.R -6ae813fc5dbadc45c5f20e098e953df4 *tests/testthat/test-glue.R -783a9031a97e76dae1cb27e6db209ca4 *tests/testthat/test-group_data.R -61816b5f7f670d0fd7dce82b63a11641 *tests/testthat/test-internal_utility.R -2027ef487e4f99c3f40d8fe0b3ff8097 *tests/testthat/test-missing_data.R -37125167c88104b5aef1416fb2326a42 *tests/testthat/test-survival.R -c882be058535f2258fc91062645046bb *tests/testthat/test-utility.R -8aaa3531fbd5ad77a75d78401cf466f6 *vignettes/clustered.Rmd -9a8dd81ce7dc9733cc4c595b483e7a05 *vignettes/corelationmat.Rmd -adb2db9f7c4c38973f11cfac5765281e *vignettes/correlated.Rmd -78e0fd72c2c05cfe43c18890b86e2486 *vignettes/double_dot_extension.Rmd -2a041d7743427d8db336c55d199df886 *vignettes/longitudinal.Rmd -1981dbfcbbe076ee038ad227c66b72e5 *vignettes/missing.Rmd -240fbaee7ade537a9cb378d6bc5fb193 *vignettes/ordinal.Rmd -e5c3cc948999d1a03e9c7e2cf4f9ae42 *vignettes/simstudy.Rmd -aaed7cd27bc967a5d18cdc23cab161a7 *vignettes/spline.Rmd -12a6af36a4d5bd088bce0adb8ec7c7a6 *vignettes/survival.Rmd -3350c862513e10e2a084f76715f12458 *vignettes/treat_and_exposure.Rmd +9019dfbdd828d384a8445a7397d76b24 
*tests/testthat/test-glue.R
+5c5033cda5f23203154952e427d65c8a *tests/testthat/test-group_data.R
+f628f16708a4a39c6298249f1fc2fed2 *tests/testthat/test-internal_utility.R
+051d4be80b67ef225680d32b27f735da *tests/testthat/test-missing_data.R
+6bdc91f8f440d9ae16877118b0afaf62 *tests/testthat/test-survival.R
+b21dc531dcce6ecdacc4549b1f4c656d *tests/testthat/test-utility.R
+a7c1690aa805bef67443f91ce6af7fda *vignettes/clustered.Rmd
+9f12fa63b29be1db9f01ef4cf6ee59d2 *vignettes/corelationmat.Rmd
+d2fb54e88cc53d7de715012983360304 *vignettes/correlated.Rmd
+a347d8e7f4f6037a2731967f416ebc64 *vignettes/double_dot_extension.Rmd
+38d645ae895e8895115675910688da14 *vignettes/logisticCoefs.Rmd
+943b1d35dbf0a819b9d4aca8c717092d *vignettes/longitudinal.Rmd
+d84699592547359f6c966a2f030c4df1 *vignettes/missing.Rmd
+79959e2396041ba2f9867a6da0b55af8 *vignettes/ordinal.Rmd
+0abcfc017e37b948fc0487001b5324fa *vignettes/simstudy.Rmd
+dc65c760f7496457cb4b9df927126b33 *vignettes/spline.Rmd
+151bd73a42b2ed0d7983f065ad64be8b *vignettes/survival.Rmd
+bec20fc20eabe441e985ce9e78ab1fea *vignettes/treat_and_exposure.Rmd
diff --git a/NAMESPACE b/NAMESPACE
index 8bec3a9..05ecad4 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -48,6 +48,7 @@ export(genSpline)
 export(genSurv)
 export(genSynthetic)
 export(iccRE)
+export(logisticCoefs)
 export(mergeData)
 export(negbinomGetSizeProb)
 export(survGetParams)
@@ -64,6 +65,7 @@ import(data.table)
 import(glue)
 importFrom(Rcpp,evalCpp)
 importFrom(Rcpp,sourceCpp)
+importFrom(fastglm,fastglm)
 importFrom(methods,is)
 useDynLib(simstudy)
 useDynLib(simstudy, .registration = TRUE)
diff --git a/R/RcppExports.R b/R/RcppExports.R
index 5a13df6..29fb3c7 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -29,3 +29,15 @@ getRhoMat <- function(N, P, TCORR) {
     .Call(`_simstudy_getRhoMat`, N, P, TCORR)
 }
 
+getBeta0 <- function(lvec, popPrev, tolerance) {
+    .Call(`_simstudy_getBeta0`, lvec, popPrev, tolerance)
+}
+
+estAUC <- function(dmatrix, y) {
+    .Call(`_simstudy_estAUC`, dmatrix, y)
+}
+
+getBeta_auc <- function(covmat, coefs, auc, popPrev, tolerance) {
+    .Call(`_simstudy_getBeta_auc`, covmat, coefs, auc, popPrev, tolerance)
+}
+
diff --git a/R/add_correlated_data.R b/R/add_correlated_data.R
index a3e8b7e..ce17362 100644
--- a/R/add_correlated_data.R
+++ b/R/add_correlated_data.R
@@ -317,7 +317,6 @@ addCorFlex <- function(dt, defs, rho = 0, tau = NULL, corstr = "cs",
 #' # Long example
 #'
 #' def <- defData(varname = "xbase", formula = 5, variance = .4, dist = "gamma", id = "cid")
-#' def <- defData(def, "nperiods", formula = 3, dist = "noZeroPoisson")
 #'
 #' def2 <- defDataAdd(
 #'   varname = "p", formula = "-3+.2*period + .3*xbase",
@@ -328,6 +327,11 @@
 #'
 #' dtLong <- addPeriods(dt, idvars = "cid", nPeriods = 3)
 #' dtLong <- addColumns(def2, dtLong)
+#'
+#' addCorGen(
+#'   dtOld = dtLong, idvar = "cid", nvars = NULL, rho = .7, corstr = "cs",
+#'   dist = "binary", param1 = "p"
+#' )
 #'
 #' @concept correlated
 #' @export
@@ -468,7 +472,8 @@ addCorGen <- function(dtOld, nvars=NULL, idvar = "id", rho=NULL, corstr=NULL, co
   }
 
   dtTemp[, seq_ := 1:.N, keyby = .id]
-
+  nvars <- dtTemp[.id == 1, .N] # only permits the case where the number of records per id is the same
+
   ####
 
   if (method == "copula") {
@@ -478,15 +483,18 @@ addCorGen <- function(dtOld, nvars=NULL, idvar = "id", rho=NULL, corstr=NULL, co
       dtM <- rbindlist(
         lapply(ns, function(x) .genQuantU(x$N, 1, rho, corstr, corMatrix[[x$.id]]))
       )
+      dtTemp[, .U := dtM$Unew]
     } else {
-      ns <- as.list(dtTemp[, .N, keyby = .id][,N])
-      dtM <- rbindlist(
-        lapply(ns, function(x) .genQuantU(x, 1, rho, corstr, corMatrix))
-      )
+      if (is.null(corMatrix)) {
+        corMatrix <- .buildCorMat(nvars, corMatrix = NULL, rho = rho, corstr = corstr)
+      }
+      ns <- nrow(dtTemp[, .N, keyby = .id])
+      Unew <- c(t(mvnfast::rmvn(n = ns, mu = rep(0, nvars), sigma = corMatrix)))
+
+      dtTemp[, .U := stats::pnorm(Unew)]
     }
 
-    dtTemp[, .U := dtM$Unew]
-    dtTemp[, seq := dtM$seq]
+    # dtTemp[, seq := dtM$seq]
 
     if (dist == "poisson") {
       setnames(dtTemp, param1, ".param1")
diff --git a/R/generate_correlated_data.R b/R/generate_correlated_data.R
index fd7cc37..5cf0fe0 100644
--- a/R/generate_correlated_data.R
+++ b/R/generate_correlated_data.R
@@ -97,6 +97,7 @@ genCorData <- function(n, mu, sigma, corMatrix = NULL, rho, corstr = "ind",
 #' is not specified.
 #' @return data.table with added column(s) of correlated data
 #' @examples
+#' \dontrun{
 #' def <- defData(varname = "xNorm", formula = 0, variance = 4, dist = "normal")
 #' def <- defData(def, varname = "xGamma1", formula = 15, variance = 2, dist = "gamma")
 #' def <- defData(def, varname = "xBin", formula = 0.5, dist = "binary")
@@ -113,6 +114,7 @@ genCorData <- function(n, mu, sigma, corMatrix = NULL, rho, corstr = "ind",
 #' cor(dt[, -"id"], method = "kendall")
 #' var(dt[, -"id"])
 #' apply(dt[, -"id"], 2, mean)
+#' }
 #' @export
 #' @concept correlated
 genCorFlex <- function(n, defs, rho = 0, tau = NULL, corstr = "cs", corMatrix = NULL) {
diff --git a/R/generate_dist.R b/R/generate_dist.R
index 60c098e..5d67106 100644
--- a/R/generate_dist.R
+++ b/R/generate_dist.R
@@ -2,6 +2,7 @@
 #'
 #' @useDynLib simstudy
 #' @importFrom Rcpp sourceCpp
+#' @importFrom fastglm fastglm
 #'
 #' @param args One row from data definitions data.table
 #' @param n The number of observations required in the data set
diff --git a/R/group_data.R b/R/group_data.R
index cca0cb8..e2a5222 100644
--- a/R/group_data.R
+++ b/R/group_data.R
@@ -362,7 +362,7 @@ trtAssign <- function(dtName, nTrt = 2, balanced = TRUE,
   assertNotMissing(dtName = missing(dtName))
   assertNotInDataTable(grpName, dtName)
   if (!is.null(ratio)) {
-    assertLengthEqual(var1 = length(ratio), var2 = nTrt)
+    assertEqual(nTrt = nTrt, val = length(ratio))
   }
 
   dt <- copy(dtName)
@@ -387,16 +387,14 @@
   } else { # balanced is FALSE - strata are not relevant
 
     if (is.null(ratio)) {
-      if (nTrt == 2) {
-        formula <- .5
-      } else {
         formula <- rep(1 / nTrt, nTrt)
-      }
     } else { # ratio not null
       formula <- round(ratio / sum(ratio), 8)
     }
 
     dt <- trtObserve(dt, formulas = formula, logit.link = FALSE, grpName)
+    if (nTrt == 2) dt[, (grpName) := get(grpName) - 1]
+
   }
 
   return(dt[])
@@ -637,4 +635,4 @@ trtStepWedge <- function(dtName, clustID, nWaves, lenWaves,
   if (lag == 0) dd[, `:=`(xr = NULL)]
 
   return(dd[])
-}
\ No newline at end of file
+}
diff --git a/R/internal_utility.R b/R/internal_utility.R
index 0f88e1c..1dbf389 100644
--- a/R/internal_utility.R
+++ b/R/internal_utility.R
@@ -81,22 +81,28 @@
   evalFormula <- function(formula) {
     e$formula2parse <- formula
-
-    res <- with(e, {
-      expr <- parse(text = as.character(formula2parse))
-      tryCatch(
-        expr = dtSim[, newVar := eval(expr), keyby = def_id],
-        error = function(err) {
-          if (grepl("RHS length must either be 1", gettext(err), fixed = T)) {
-            dtSim[, newVar := eval(expr)]
-          } else {
-            stop(gettext(err))
-          }
-        }
-      )
-      copy(dtSim$newVar)
-    })
-
+
+    if ( grepl("%.*%|\\[.*,.*\\]", formula) ) {
+      res <- with(e, {
+        expr <- parse(text = as.character(formula2parse))
+        tryCatch(
+          expr = dtSim[, newVar := eval(expr), keyby = def_id],
+          error = function(err) stop(gettext(err))
+        )
+        copy(dtSim$newVar)
+
+      })
+    } else {
+      res <- with(e, {
+        expr <- parse(text = as.character(formula2parse))
+        tryCatch(
+          expr = dtSim[, newVar := eval(expr)],
+          error = function(err) stop(gettext(err))
+        )
+        copy(dtSim$newVar)
+      })
+    }
+
     if (length(res) == 1) {
       rep(res, n)
     } else {
diff --git a/R/utility.R b/R/utility.R
index 52eba3d..493ed06 100644
--- a/R/utility.R
+++ b/R/utility.R
@@ -884,7 +884,7 @@ survParamPlot <- function(formula, shape, points = NULL, n = 100, scale = 1,
 }
 
-#' Generating single competing risk survival varible
+#' Generating single competing risk survival variable
 #'
 #' @param dtName Name of complete data set to be updated
 #' @param events Vector of column names that include
@@ -971,5 +971,164 @@ addCompRisk <- function(dtName, events, timeName,
   setnames(dtSurv, "id", idName)
   dtSurv[]
 }
+
+#' Determine intercept, treatment/exposure and covariate coefficients that can
+#' be used for binary data generation with a logit link and a set of covariates
+#' @description This is an implementation of an iterative bisection procedure
+#' that can be used to determine coefficient values for a target population
+#' prevalence as well as a target risk ratio, risk difference, or AUC. These
+#' coefficients can be used in a subsequent data generation process to simulate
+#' data with these desired characteristics.
+#' @param defCovar A definition table for the covariates in the underlying
+#' population. This table specifies the distribution of the covariates.
+#' @param coefs A vector of coefficients that reflect the relationship between
+#' each of the covariates and the log-odds of the outcome.
+#' @param popPrev The target population prevalence of the outcome.
+#' A value between 0 and 1.
+#' @param rr The target risk ratio, which must be a value between 0 and
+#' 1/popPrev. Defaults to NULL.
+#' @param rd The target risk difference, which must be between
+#' -(popPrev) and (1 - popPrev). Defaults to NULL.
+#' @param auc The target AUC, which must be a value between 0.5 and 1.0.
+#' Defaults to NULL.
+#' @param tolerance The minimum stopping distance between the adjusted low and high
+#' endpoints. Defaults to 0.001.
+#' @param sampleSize The number of units to generate for the bisection algorithm.
+#' The default is 1e+05. To get a reliable estimate, the value
+#' should be no smaller than the default; larger values can be used, though
+#' computing time will increase.
+#' @param trtName If either a risk ratio or risk difference is the target statistic,
+#' a treatment/exposure variable name can be provided. Defaults to "A".
+#' @details If no specific target statistic is specified, then only the intercept
+#' is returned along with the original coefficients. Only one target statistic (risk ratio, risk
+#' difference or AUC) can be specified with a single function call; in all three cases, a target
+#' prevalence is still required.
+#' @references Austin, Peter C. "The iterative bisection procedure: a useful
+#' tool for determining parameter values in data-generating processes in
+#' Monte Carlo simulations." BMC Medical Research Methodology 23,
+#' no. 1 (2023): 1-10.
+#' @return A vector of parameters including the intercept and covariate
+#' coefficients for the logistic model data generating process.
+#' @examples
+#' \dontrun{
+#' d1 <- defData(varname = "x1", formula = 0, variance = 1)
+#' d1 <- defData(d1, varname = "b1", formula = 0.5, dist = "binary")
+#'
+#' coefs <- log(c(1.2, 0.8))
+#'
+#' logisticCoefs(d1, coefs, popPrev = 0.20)
+#' logisticCoefs(d1, coefs, popPrev = 0.20, rr = 1.50, trtName = "rx")
+#' logisticCoefs(d1, coefs, popPrev = 0.20, rd = 0.30, trtName = "rx")
+#' logisticCoefs(d1, coefs, popPrev = 0.20, auc = 0.80)
+#' }
+#' @export
+#' @concept utility
+#'
+logisticCoefs <- function(defCovar, coefs, popPrev, rr = NULL, rd = NULL,
+  auc = NULL, tolerance = 0.001, sampleSize = 1e+05, trtName = "A") {
+
+  ### "initialize" variables
+
+  varname <- NULL
+  y <- NULL
+  py <- NULL
+
+  ###
+
+  num_notNull <- sum(sapply(list(rr, rd, auc), function(x) !is.null(x)))
+
+  if (num_notNull > 1) {
+    stop("Specify only one target statistic")
+  }
+
+  assertNotMissing(popPrev = missing(popPrev),
+    defCovar = missing(defCovar),
+    coefs = missing(coefs))
+
+  assertLength(coefs = coefs, length = nrow(defCovar))
+
+  if (!is.null(rr)) {
+    assertAtLeast(rr = rr, minVal = 0)
+    if (rr > 1 / popPrev) {
+      stop(paste("rr is", rr, "but must be at most", formatC(1 / popPrev, digits = 3), "(1/popPrev)"))
+    }
+  }
+
+  if (!is.null(rd)) {
+    assertInRange(rd = rd, range = c(-popPrev, 1 - popPrev))
+  }
+
+  if (!is.null(auc)) {
+    assertInRange(auc = auc, range = c(0.5, 1))
+  }
+
+  assertInRange(popPrev = popPrev, range = c(0, 1))
+  assertNumeric(coefs = coefs)
+
+  ####
+
+  if (num_notNull == 0) targetStat <- "prev"
+  if (!is.null(rr)) targetStat <- "rr"
+  if (!is.null(rd)) targetStat <- "rd"
+  if (!is.null(auc)) targetStat <- "auc"
+
+  dd <- genData(sampleSize, defCovar)
+
+  if (targetStat == "prev") {
+
+    B0 <- getBeta0(lvec = as.matrix(dd[, -1]) %*% coefs, popPrev, tolerance)
+    B <- c(B0, coefs)
+    names(B) <- c("B0", defCovar$varname)
+
+  } else if (targetStat %in% c("rr", "rd")) {
+
+    if (targetStat == "rr") {
+      statValue <- rr
+    } else {
+      statValue <- rd
+    }
+
+    B0 <- getBeta0(lvec = as.matrix(dd[, -1]) %*% coefs, popPrev, tolerance)
+
+    lvec <- cbind(1, as.matrix(dd[, -1])) %*% c(B0, coefs)
+
+    intLow <- -10
+    intHigh <- 10
+
+    while (abs(intHigh - intLow) > tolerance) {
+
+      Delta <- (intLow + intHigh) / 2
+
+      if (targetStat == "rr") {
+        rStat <- mean(stats::plogis(lvec + Delta)) / mean(stats::plogis(lvec))
+      } else {
+        rStat <- mean(stats::plogis(lvec + Delta)) - mean(stats::plogis(lvec))
+      }
+
+      if (rStat < statValue) {
+        intLow <- Delta
+      } else {
+        intHigh <- Delta
+      }
+
+    }
+
+    Delta <- (intLow + intHigh) / 2
+
+    B <- c(B0, Delta, coefs)
+    names(B) <- c("B0", trtName, defCovar$varname)
+
+  } else if (targetStat == "auc") {
+
+    dmatrix <- as.matrix(dd[, -1])
+    results <- getBeta_auc(dmatrix, coefs, auc = auc, popPrev = popPrev, tolerance = tolerance)
+
+    B <- c(results[1], results[2] * coefs)
+    names(B) <- c("B0", defCovar$varname)
+
+  }
+
+  return(B)
+
+}
+
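For intuition, getBeta0() (declared in the R/RcppExports.R hunk above and called by logisticCoefs()) performs the same kind of bisection that the Delta search above does, but over the intercept. Here is a minimal R sketch of that idea under stated assumptions: findB0() is a hypothetical name, not part of the simstudy API, and the (-10, 10) interval simply mirrors the Delta endpoints used in logisticCoefs().

```r
# Hypothetical sketch of the bisection behind getBeta0(): find the intercept
# B0 such that mean(plogis(B0 + lvec)) matches the target prevalence.
findB0 <- function(lvec, popPrev, tolerance = 0.001) {
  low <- -10   # assumed endpoints, mirroring the Delta search in logisticCoefs()
  high <- 10
  while (abs(high - low) > tolerance) {
    B0 <- (low + high) / 2
    prev <- mean(stats::plogis(B0 + lvec))         # prevalence implied by this B0
    if (prev < popPrev) low <- B0 else high <- B0  # plogis() is monotone in B0
  }
  (low + high) / 2
}

# usage sketch: covariate effects on the log-odds scale, target prevalence 20%
set.seed(123)
lvec <- 0.25 * rnorm(1e5)
findB0(lvec, popPrev = 0.20)
```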
zFZqMN?V)O?x!4NL@y~MhS?pfnYuL^~qsqL_@vs@J{u^xj{r*?Cqe3x@9U(a8VCng~ Rf_!-T{Rh~7aQQI?000=N0^?(GE z-uP**lx^ZQ>#A1{c9Pkh*gKke?nP15iaKrBVMXl|*mhtY!iu9#)P?6Dla;Z&5@xkg;oW#q_7W#x)QWs=DK>3e41-J<)4--14L3klv>S%t#YB5BzRm1V}zPf zNSEtfeIp73vJ!_D<4LU!g$5HR4+YPkR{9iN*#tC7uAN5f`usWM%@)$3$Du_?<$!pE zz*iehLK=<&mykjqZ9*~h3Bd;5C5i;8p zcO2Lm3(2wJNm&2Uen`jebqiJ6P5GRw9CN>S!xtZ*zF9k{Tk4M?9yWm0`hsn_Tz;?L dD3i?KjS%eRkjUX{34Hka`30gbr;xS<002$K \usepackage[utf8]{inputenc} --- +```{r chunkname, echo=-1} +data.table::setDTthreads(2) +``` + ```{r, echo = FALSE, message = FALSE} library(simstudy) library(ggplot2) diff --git a/inst/doc/clustered.html b/inst/doc/clustered.html index 33c683a..39fd9fe 100644 --- a/inst/doc/clustered.html +++ b/inst/doc/clustered.html @@ -356,16 +356,16 @@

diff --git a/inst/doc/clustered.html b/inst/doc/clustered.html
index 33c683a..39fd9fe 100644
--- a/inst/doc/clustered.html
+++ b/inst/doc/clustered.html
[Markup-stripped HTML diff omitted. This is the re-rendered "Clustered Data" vignette: the displayed R code and output of the "Clustered Data" and "Setting cluster sizes" sections (school-, class-, and student-level definitions built with defData/defDataAdd, trtAssign, genCluster, and addColumns, plus the clusterSize distribution examples) are identical in the old and new renderings. Only the knitr-generated syntax-highlighting markup changed, which is why each code chunk appeared twice in the raw diff.]
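Since the rendered diff above is condensed, here is the generation scheme that the vignette displays, taken from its rendered code (recovered from the stripped markup) and lightly abridged; it uses only documented simstudy functions.

```r
library(simstudy)
set.seed(282721)

# school level: random intercept s0 and a random number of classes per school
gen.school <- defData(varname = "s0", dist = "normal", formula = 0,
  variance = 3, id = "idSchool")
gen.school <- defData(gen.school, varname = "nClasses",
  dist = "noZeroPoisson", formula = 3)

dtSchool <- genData(8, gen.school)
dtSchool <- trtAssign(dtSchool, n = 2)    # school-level intervention assignment

# class level: random intercept c0 and a random class size
gen.class <- defDataAdd(varname = "c0", dist = "normal", formula = 0, variance = 2)
gen.class <- defDataAdd(gen.class, varname = "nStudents",
  dist = "noZeroPoisson", formula = 20)

dtClass <- genCluster(dtSchool, "idSchool", numIndsVar = "nClasses",
  level1ID = "idClass")
dtClass <- addColumns(gen.class, dtClass)

# student level: the outcome picks up both random effects plus a treatment effect
gen.student <- defDataAdd(varname = "Male", dist = "binary", formula = 0.5)
gen.student <- defDataAdd(gen.student, varname = "age", dist = "uniform",
  formula = "9.5; 10.5")
gen.student <- defDataAdd(gen.student, varname = "test", dist = "normal",
  formula = "50 - 5*Male + s0 + c0 + 8 * trtGrp", variance = 2)

dtStudent <- genCluster(dtClass, cLevelVar = "idClass",
  numIndsVar = "nStudents", level1ID = "idChild")
dtStudent <- addColumns(gen.student, dtStudent)
```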
diff --git a/inst/doc/corelationmat.R b/inst/doc/corelationmat.R
index 3981185..6406a4a 100644
--- a/inst/doc/corelationmat.R
+++ b/inst/doc/corelationmat.R
@@ -1,4 +1,7 @@
-## ---- echo = FALSE, message = FALSE-------------------------------------------
+## ----chunkname, echo=-1-------------------------------------------------------
+data.table::setDTthreads(2)
+
+## ----echo = FALSE, message = FALSE--------------------------------------------
 library(simstudy)
 library(ggplot2)
 library(scales)
@@ -28,14 +31,14 @@ ggtheme <- function(panelback = "white") {
   
 }
 
-## ---- message=FALSE-----------------------------------------------------------
+## ----message=FALSE------------------------------------------------------------
 library(simstudy)
 library(data.table)
 set.seed(37265)
 
 genCorMat(4)
 
-## ---- message=FALSE-----------------------------------------------------------
+## ----message=FALSE------------------------------------------------------------
 R <- genCorMat(4, cors = c(0.6, 0.4, 0.2, 0.5, 0.3, 0.8))
 R
 
@@ -48,7 +51,7 @@ head(dd)
 ## -----------------------------------------------------------------------------
 round(cor(as.matrix(dd[, -1])), 1)
 
-## ---- echo=FALSE--------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
 matform <- "
 R = \\left (
 \\begin{matrix} 
@@ -65,7 +68,7 @@ katex::katex_html(matform, include_css = TRUE)
 ## -----------------------------------------------------------------------------
 genCorMat(nvars = 4, rho = 0.6, corstr = "cs")
 
-## ---- echo=FALSE--------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
 matform <- "
 R = \\left (
 \\begin{matrix} 
@@ -82,7 +85,7 @@ katex::katex_html(matform)
 ## -----------------------------------------------------------------------------
 genCorMat(nvars = 4, rho = 0.6, corstr = "ar1")
 
-## ---- echo=FALSE--------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
 matform <- "\\footnotesize{
 
 R = \\left ( \\begin{array}{c|c|c|c}
@@ -212,7 +215,7 @@ dd <- addCorGen(dc, idvar = "site", param1 = "lambda", corMatrix = RC,
 
 dd
 
-## ---- eval=FALSE--------------------------------------------------------------
+## ----eval=FALSE---------------------------------------------------------------
 #  replicate <- function(R, dc) {
 #    reps <- lapply(1:5000, function(x)
 #    addCorGen(dc, idvar = "site", param1 = "lambda", corMatrix = R,
@@ -228,7 +231,7 @@ dd
 #  replicate(R = RC, dc = dc)
 #  
 
-## ---- eval=TRUE---------------------------------------------------------------
+## ----eval=TRUE----------------------------------------------------------------
 d1 <- defData(varname = "n", formula = 20, dist = "noZeroPoisson")
 d1 <- defData(d1, varname = "mu", formula = 10, variance = 8, dist = "normal")
 d1 <- defData(d1, varname = "s2", formula = 4, dist = "nonrandom")
@@ -252,7 +255,7 @@ dd <- addCorGen(dc, idvar = "site", param1 = "mu", param2 = "s2",
 
 dd
 
-## ---- echo=FALSE--------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
 library(katex)
 matform <- "\\footnotesize{
 
@@ -311,7 +314,7 @@ R = \\left ( \\begin{array}{c|c|c}
 
 katex_html(matform)
 
-## ---- echo=FALSE--------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
 matform <- "\\footnotesize{
 
 R = \\left ( \\begin{array}{c|c|c}
@@ -369,7 +372,7 @@ R = \\left ( \\begin{array}{c|c|c}
 
 katex_html(matform)
 
-## ---- echo=FALSE--------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
 matform <- "\\footnotesize{
 
 R = \\left ( \\begin{array}{c|c|c}
@@ -427,7 +430,7 @@ R = \\left ( \\begin{array}{c|c|c}
 
 katex_html(matform)
 
-## ---- echo=FALSE--------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
 matform <- "\\footnotesize{
 
 R = \\left ( \\begin{array}{c|c|c}
@@ -509,7 +512,7 @@ R_XD
 dd <- genCorGen(n = 5000, nvars = 6, corMatrix = R_XD,
   dist = "poisson", params1 = 7, wide = TRUE)
 
-## ---- echo=FALSE--------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
 round(cor(as.matrix(dd[, -1])), 2)
 
 ## -----------------------------------------------------------------------------
@@ -521,7 +524,7 @@ R_CE
 dd <- genCorGen(n = 5000, nvars = 6, corMatrix = R_CE,
   dist = "poisson", params1 = 7, wide = TRUE)
 
-## ---- echo=FALSE--------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
 round(cor(as.matrix(dd[, -1])), 2)
 
 ## -----------------------------------------------------------------------------
@@ -533,7 +536,7 @@ R_CD
 dd <- genCorGen(n = 5000, nvars = 6, corMatrix = R_CD,
   dist = "poisson", params1 = 7, wide = TRUE)
 
-## ---- echo=FALSE--------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
 round(cor(as.matrix(dd[, -1])), 2)
 
 ## -----------------------------------------------------------------------------
@@ -563,7 +566,7 @@ R <- blockDecayMat(ninds = N , nperiods = 3, rho_w = 0.6, r = r, nclusters = 10)
 
 lapply(R, function(x) round(x,2))[c(1, 3, 7)]
 
-## ---- eval=FALSE--------------------------------------------------------------
+## ----eval=FALSE---------------------------------------------------------------
 #  reps <- lapply(1:5000,
 #    function(x) addCorGen(dd, idvar = "site", corMatrix = R,
 #      dist = "poisson", param1 = "lambda", cnames = "y")
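To make the chunk-option changes above easier to check in context, this is the core pattern those chunks run, assembled from the vignette code itself into a single runnable sketch.

```r
library(simstudy)
library(data.table)
set.seed(37265)

# the six coefficients fill the lower (and upper) triangle, reading down the columns
R <- genCorMat(4, cors = c(0.6, 0.4, 0.2, 0.5, 0.3, 0.8))

# 1000 draws of 4 correlated Poisson outcomes with means 3, 5, 8, 9
dd <- genCorGen(n = 1000, nvars = 4, corMatrix = R,
  params1 = c(3, 5, 8, 9), dist = "poisson", wide = TRUE)

round(cor(as.matrix(dd[, -1])), 1)   # should come out close to R
```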
diff --git a/inst/doc/corelationmat.Rmd b/inst/doc/corelationmat.Rmd
index 816af70..60f2e07 100644
--- a/inst/doc/corelationmat.Rmd
+++ b/inst/doc/corelationmat.Rmd
@@ -7,6 +7,9 @@ vignette: >
   \usepackage[utf8]{inputenc}
 ---
 
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
 
 ```{r, echo = FALSE, message = FALSE}
 library(simstudy)
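The chunkname chunk added at the top of each vignette caps data.table parallelism at two threads, in line with CRAN's limit on thread use during checks; with echo=-1, the call itself is hidden in the rendered document. A sketch of the general pattern (the save-and-restore step is an optional addition, not part of the vignette chunk):

```r
# limit data.table to 2 threads; setDTthreads() returns the prior count invisibly
old_threads <- data.table::setDTthreads(2)

# ... vignette code runs here ...

data.table::setDTthreads(old_threads)   # optionally restore the previous setting
```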
diff --git a/inst/doc/corelationmat.html b/inst/doc/corelationmat.html
index 558eab7..6e60d2e 100644
--- a/inst/doc/corelationmat.html
+++ b/inst/doc/corelationmat.html
@@ -356,11 +356,11 @@ 

Simple correlation matrix generation

correlation matrix with a set of specified coefficients.

Here is an example of the first, a randomly generated correlation matrix:

-
library(simstudy)
-library(data.table)
-set.seed(37265)
-
-genCorMat(4)
+
library(simstudy)
+library(data.table)
+set.seed(37265)
+
+genCorMat(4)
##             [,1]        [,2]        [,3]       [,4]
 ## [1,]  1.00000000 -0.22742403  0.01285282 -0.3201579
 ## [2,] -0.22742403  1.00000000 -0.04973973 -0.1218070
@@ -370,8 +370,8 @@ 

Simple correlation matrix generation

well get an error message if it is not positive semidefinite!). These coefficients define the lower triangle (and upper) triangle of the matrix, reading down the columns:

-
R <- genCorMat(4, cors = c(0.6, 0.4, 0.2, 0.5, 0.3, 0.8))
-R
+
R <- genCorMat(4, cors = c(0.6, 0.4, 0.2, 0.5, 0.3, 0.8))
+R
##      [,1] [,2] [,3] [,4]
 ## [1,]  1.0  0.6  0.4  0.2
 ## [2,]  0.6  1.0  0.5  0.3
@@ -379,10 +379,10 @@ 

Simple correlation matrix generation

## [4,] 0.2 0.3 0.8 1.0

This matrix can be used to generate data using functions genCorData or genCorGen:

-
dd <- genCorGen(n = 1000, nvars = 4, corMatrix = R, params1 = c(3, 5, 8, 9), 
-  dist = "poisson", wide = TRUE)
-
-head(dd)
+
dd <- genCorGen(n = 1000, nvars = 4, corMatrix = R, params1 = c(3, 5, 8, 9), 
+  dist = "poisson", wide = TRUE)
+
+head(dd)
##    id V1 V2 V3 V4
 ## 1:  1  3  3  5  6
 ## 2:  2  3  9 12 10
@@ -392,7 +392,7 @@ 

Simple correlation matrix generation

## 6: 6 4 5 6 7

And the correlation from this data set is quite close to the specified matrix R.

-
round(cor(as.matrix(dd[, -1])), 1)
+
round(cor(as.matrix(dd[, -1])), 1)
##     V1  V2  V3  V4
 ## V1 1.0 0.6 0.4 0.2
 ## V2 0.6 1.0 0.5 0.3
@@ -429,7 +429,7 @@ 

Specifying a structure

c242.7,-294.7,395.3,-681.7,458,-1161c21.3,-164.7,33.3,-350.7,36,-558 l0,-1344c-2,-159.3,-10,-310.7,-24,-454c-53.3,-528,-210,-949.7, -470,-1265c-4.7,-6,-9.7,-11.7,-15,-17c-0.7,-0.7,-6.7,-1,-18,-1z"> -
genCorMat(nvars = 4, rho = 0.6, corstr = "cs")
+
genCorMat(nvars = 4, rho = 0.6, corstr = "cs")
##      [,1] [,2] [,3] [,4]
 ## [1,]  1.0  0.6  0.6  0.6
 ## [2,]  0.6  1.0  0.6  0.6
@@ -462,7 +462,7 @@ 

Specifying a structure

c242.7,-294.7,395.3,-681.7,458,-1161c21.3,-164.7,33.3,-350.7,36,-558 l0,-1344c-2,-159.3,-10,-310.7,-24,-454c-53.3,-528,-210,-949.7, -470,-1265c-4.7,-6,-9.7,-11.7,-15,-17c-0.7,-0.7,-6.7,-1,-18,-1z"> -
genCorMat(nvars = 4, rho = 0.6, corstr = "ar1")
+
genCorMat(nvars = 4, rho = 0.6, corstr = "ar1")
##       [,1] [,2] [,3]  [,4]
 ## [1,] 1.000 0.60 0.36 0.216
 ## [2,] 0.600 1.00 0.60 0.360
@@ -619,10 +619,10 @@ 

Cluster-specific correlation matrices

clusters (though either or both can be scalars, in which case the values are shared across the clusters). The output is a list of correlation matrices, one for each cluster.

-
RC <- genCorMat(nvars = c(2, 3, 4, 3), rho = c(0.6, 0.7, 0.5, 0.4), 
-  corstr = "cs", nclusters = 4)
-
-RC
+
RC <- genCorMat(nvars = c(2, 3, 4, 3), rho = c(0.6, 0.7, 0.5, 0.4), 
+  corstr = "cs", nclusters = 4)
+
+RC
## $`1`
 ##      [,1] [,2]
 ## [1,]  1.0  0.6
@@ -649,17 +649,17 @@ 

Cluster-specific correlation matrices

To create the correlated data, first we can generate a data set of individuals that are clustered in groups. The outcome will be Poisson distributed, so we are specifying mean \(\lambda\) for each cluster:

-
d1 <- defData(varname = "n", formula = "c(2, 3, 4, 3)", dist = "nonrandom")
-d1 <- defData(d1, varname = "lambda", formula = "c(6, 7, 9, 8)", dist = "nonrandom")
-
-ds <- genData(4, d1, id = "site")
-dc <- genCluster(dtClust = ds, cLevelVar = "site", numIndsVar = "n", "id")
+
d1 <- defData(varname = "n", formula = "c(2, 3, 4, 3)", dist = "nonrandom")
+d1 <- defData(d1, varname = "lambda", formula = "c(6, 7, 9, 8)", dist = "nonrandom")
+
+ds <- genData(4, d1, id = "site")
+dc <- genCluster(dtClust = ds, cLevelVar = "site", numIndsVar = "n", "id")

Now, we can generate data using the correlation matrix RC:

-
dd <- addCorGen(dc, idvar = "site", param1 = "lambda", corMatrix = RC,
-          dist = "poisson", cnames = "y", method = "copula")
-
-dd
+
dd <- addCorGen(dc, idvar = "site", param1 = "lambda", corMatrix = RC,
+          dist = "poisson", cnames = "y", method = "copula")
+
+dd
##     site n lambda id  y
 ##  1:    1 2      6  1 11
 ##  2:    1 2      6  2  7
@@ -676,19 +676,19 @@ 

Cluster-specific correlation matrices

If we want to confirm that everything is working as expected, we can recover the overall correlation matrix by generating a large number of data sets (in this case 5000):

-
replicate <- function(R, dc) {
-  reps <- lapply(1:5000, function(x)
-  addCorGen(dc, idvar = "site", param1 = "lambda", corMatrix = R,
-    dist = "poisson", cnames = "y", method = "copula")
-  )
-
-  drep <- data.table::rbindlist(reps, idcol = "rep")
-  drep[, seq := 1:.N, keyby = rep]
-  dmat <- as.matrix(dcast(drep, rep ~ seq, value.var = "y")[, -1])
-  round(cor(dmat), 1) 
-}
-
-replicate(R = RC, dc = dc)
+
replicate <- function(R, dc) {
+  reps <- lapply(1:5000, function(x)
+  addCorGen(dc, idvar = "site", param1 = "lambda", corMatrix = R,
+    dist = "poisson", cnames = "y", method = "copula")
+  )
+
+  drep <- data.table::rbindlist(reps, idcol = "rep")
+  drep[, seq := 1:.N, keyby = rep]
+  dmat <- as.matrix(dcast(drep, rep ~ seq, value.var = "y")[, -1])
+  round(cor(dmat), 1) 
+}
+
+replicate(R = RC, dc = dc)

It seems to have worked quite well - the empirical matrix matches the hypothetical matrix above. In the next post, I’ll describe how block matrices for different clusters over different time periods can also be @@ -700,21 +700,21 @@

More elaborate example

coefficients) themselves are randomly generated. By providing this flexibility, we induce extra variability in the data generation process.

-
d1 <- defData(varname = "n", formula = 20, dist = "noZeroPoisson")
-d1 <- defData(d1, varname = "mu", formula = 10, variance = 8, dist = "normal")
-d1 <- defData(d1, varname = "s2", formula = 4, dist = "nonrandom")
-
-ds <- genData(100, d1, id = "site")
-dc <- genCluster(dtClust = ds, cLevelVar = "site", numIndsVar = "n", "id")
-
-n <- dc[, .N, keyby = site][, N]
-nsites <- length(n)
-rho <- rbeta(nsites, 25, 15)
-
-RM <- genCorMat(nvars = n, rho = rho, corstr = "cs", nclusters = nsites)
+
d1 <- defData(varname = "n", formula = 20, dist = "noZeroPoisson")
+d1 <- defData(d1, varname = "mu", formula = 10, variance = 8, dist = "normal")
+d1 <- defData(d1, varname = "s2", formula = 4, dist = "nonrandom")
+
+ds <- genData(100, d1, id = "site")
+dc <- genCluster(dtClust = ds, cLevelVar = "site", numIndsVar = "n", "id")
+
+n <- dc[, .N, keyby = site][, N]
+nsites <- length(n)
+rho <- rbeta(nsites, 25, 15)
+
+RM <- genCorMat(nvars = n, rho = rho, corstr = "cs", nclusters = nsites)

Here are the first three rows and columns of the correlation matrices for three clusters, as well as the dimensions for each matrix.

-
lapply(RM[c(1, 38, 97)], function(x) x[1:3, 1:3])
+
lapply(RM[c(1, 38, 97)], function(x) x[1:3, 1:3])
## $`1`
 ##           [,1]      [,2]      [,3]
 ## [1,] 1.0000000 0.6155452 0.6155452
@@ -732,7 +732,7 @@ 

More elaborate example

## [1,] 1.0000000 0.6292113 0.6292113 ## [2,] 0.6292113 1.0000000 0.6292113 ## [3,] 0.6292113 0.6292113 1.0000000
-
lapply(RM[c(1, 38, 97)], function(x) dim(x))
+
lapply(RM[c(1, 38, 97)], function(x) dim(x))
## $`1`
 ## [1] 20 20
 ## 
@@ -742,10 +742,10 @@ 

More elaborate example

## $`97` ## [1] 24 24

And here is how we generate the data

-
dd <- addCorGen(dc, idvar = "site", param1 = "mu", param2 = "s2",
-                corMatrix = RM, dist = "normal", cnames = "y", method = "copula")
-
-dd
+
dd <- addCorGen(dc, idvar = "site", param1 = "mu", param2 = "s2",
+                corMatrix = RM, dist = "normal", cnames = "y", method = "copula")
+
+dd
##       site  n        mu s2   id         y
 ##    1:    1 20  6.141913  4    1  5.104369
 ##    2:    1 20  6.141913  4    2  7.747311
@@ -1138,13 +1138,13 @@ 

Generating block matrices and simulating data

Cross-sectional data with exchangeable correlation

In the first example, we specify \(\rho_w = 0.5\) and \(\rho_b = 0.3\):

-
library(simstudy)
-library(data.table)
-
-R_XE <- blockExchangeMat(ninds = 2 , nperiods = 3, rho_w = 0.5,
-  rho_b = 0.3, pattern = "xsection")
-
-R_XE
+
library(simstudy)
+library(data.table)
+
+R_XE <- blockExchangeMat(ninds = 2 , nperiods = 3, rho_w = 0.5,
+  rho_b = 0.3, pattern = "xsection")
+
+R_XE
##      [,1] [,2] [,3] [,4] [,5] [,6]
 ## [1,]  1.0  0.5  0.3  0.3  0.3  0.3
 ## [2,]  0.5  1.0  0.3  0.3  0.3  0.3
@@ -1157,10 +1157,10 @@ 

Cross-sectional data with exchangeable correlation

effectively generating 5000 data sets with 6 observations each, all based on a Poisson distribution with mean = 7. I then report the empirical correlation matrix.

-
dd <- genCorGen(n = 5000, nvars = 6, corMatrix = R_XE,
-  dist = "poisson", params1 = 7, wide = TRUE)
-
-round(cor(as.matrix(dd[, -1])), 2)
+
dd <- genCorGen(n = 5000, nvars = 6, corMatrix = R_XE,
+  dist = "poisson", params1 = 7, wide = TRUE)
+
+round(cor(as.matrix(dd[, -1])), 2)
##      V1   V2   V3   V4   V5   V6
 ## V1 1.00 0.48 0.29 0.29 0.29 0.30
 ## V2 0.48 1.00 0.25 0.28 0.29 0.28
@@ -1173,10 +1173,10 @@ 

Cross-sectional data with exchangeable correlation

Cross-sectional data with correlation decay

Here, there is a decay parameter \(r = 0.8\) and no parameter \(\rho_b\).

-
R_XD <- blockDecayMat(ninds = 2 , nperiods = 3, rho_w = 0.5,
-  r = 0.8, pattern = "xsection")
-
-R_XD
+
R_XD <- blockDecayMat(ninds = 2 , nperiods = 3, rho_w = 0.5,
+  r = 0.8, pattern = "xsection")
+
+R_XD
##      [,1] [,2] [,3] [,4] [,5] [,6]
 ## [1,] 1.00 0.50  0.4  0.4 0.32 0.32
 ## [2,] 0.50 1.00  0.4  0.4 0.32 0.32
@@ -1184,8 +1184,8 @@ 

Cross-sectional data with correlation decay

## [4,] 0.40 0.40 0.5 1.0 0.40 0.40 ## [5,] 0.32 0.32 0.4 0.4 1.00 0.50 ## [6,] 0.32 0.32 0.4 0.4 0.50 1.00
-
dd <- genCorGen(n = 5000, nvars = 6, corMatrix = R_XD,
-  dist = "poisson", params1 = 7, wide = TRUE)
+
dd <- genCorGen(n = 5000, nvars = 6, corMatrix = R_XD,
+  dist = "poisson", params1 = 7, wide = TRUE)
##      V1   V2   V3   V4   V5   V6
 ## V1 1.00 0.51 0.38 0.41 0.31 0.31
 ## V2 0.51 1.00 0.38 0.39 0.30 0.30
@@ -1197,10 +1197,10 @@ 

Cross-sectional data with correlation decay

Cohort data with exchangeable correlation

Since we have a cohort, we introduce \(\rho_a\) = 0.4, and specify \(pattern = \text{"cohort"}\):

-
R_CE <- blockExchangeMat(ninds = 2 , nperiods = 3, rho_w = 0.5, 
-  rho_b = 0.3, rho_a = 0.4, pattern = "cohort")
-
-R_CE
+
R_CE <- blockExchangeMat(ninds = 2 , nperiods = 3, rho_w = 0.5, 
+  rho_b = 0.3, rho_a = 0.4, pattern = "cohort")
+
+R_CE
##      [,1] [,2] [,3] [,4] [,5] [,6]
 ## [1,]  1.0  0.5  0.4  0.3  0.4  0.3
 ## [2,]  0.5  1.0  0.3  0.4  0.3  0.4
@@ -1208,8 +1208,8 @@ 

Cohort data with exchangeable correlation

## [4,] 0.3 0.4 0.5 1.0 0.3 0.4 ## [5,] 0.4 0.3 0.4 0.3 1.0 0.5 ## [6,] 0.3 0.4 0.3 0.4 0.5 1.0
-
dd <- genCorGen(n = 5000, nvars = 6, corMatrix = R_CE,
-  dist = "poisson", params1 = 7, wide = TRUE)
+
dd <- genCorGen(n = 5000, nvars = 6, corMatrix = R_CE,
+  dist = "poisson", params1 = 7, wide = TRUE)
##      V1   V2   V3   V4   V5   V6
 ## V1 1.00 0.49 0.39 0.27 0.39 0.29
 ## V2 0.49 1.00 0.29 0.40 0.30 0.41
@@ -1224,10 +1224,10 @@ 

Cohort data with correlation decay

a cohort is the same as a decay in the case of a cross sectional design; the only difference that we set \(pattern = \text{"cohort"}\):

-
R_CD <- blockDecayMat(ninds = 2 , nperiods = 3, rho_w = 0.5, 
-  r = 0.8, pattern = "cohort")
-
-R_CD
+
R_CD <- blockDecayMat(ninds = 2 , nperiods = 3, rho_w = 0.5, 
+  r = 0.8, pattern = "cohort")
+
+R_CD
##      [,1] [,2] [,3] [,4] [,5] [,6]
 ## [1,] 1.00 0.50  0.8  0.4 0.64 0.32
 ## [2,] 0.50 1.00  0.4  0.8 0.32 0.64
@@ -1235,8 +1235,8 @@ 

Cohort data with correlation decay

## [4,] 0.40 0.80 0.5 1.0 0.40 0.80 ## [5,] 0.64 0.32 0.8 0.4 1.00 0.50 ## [6,] 0.32 0.64 0.4 0.8 0.50 1.00
-
dd <- genCorGen(n = 5000, nvars = 6, corMatrix = R_CD,
-  dist = "poisson", params1 = 7, wide = TRUE)
+
dd <- genCorGen(n = 5000, nvars = 6, corMatrix = R_CD,
+  dist = "poisson", params1 = 7, wide = TRUE)
##      V1   V2   V3   V4   V5   V6
 ## V1 1.00 0.47 0.79 0.38 0.63 0.30
 ## V2 0.47 1.00 0.38 0.78 0.30 0.62
@@ -1259,96 +1259,94 @@ 

Varying correlation matrices by cluster

beta distribution with shape parameters 6 and 2). The parameter \(\rho_w\) is constant across all clusters and is 0.6

-
defC <- defData(varname = "lambda", formula = "sample(5:10, 1)", dist = "nonrandom")
-defP <- defDataAdd(varname = "n", formula = "2;4", dist="uniformInt")
-
-dc <- genData(n = 10, dtDefs = defC, id = "site")
-dc <- addPeriods(dtName = dc, nPeriods = 3, 
-                 idvars = "site", perName = "period")
-dc <- addColumns(defP, dc)
-
-dd <- genCluster(dtClust = dc, cLevelVar = "timeID", 
-  numIndsVar = "n", level1ID = "id")
+
defC <- defData(varname = "lambda", formula = "sample(5:10, 1)", dist = "nonrandom")
+defP <- defDataAdd(varname = "n", formula = "2;4", dist="uniformInt")
+
+dc <- genData(n = 10, dtDefs = defC, id = "site")
+dc <- addPeriods(dtName = dc, nPeriods = 3, 
+                 idvars = "site", perName = "period")
+dc <- addColumns(defP, dc)
+
+dd <- genCluster(dtClust = dc, cLevelVar = "timeID", 
+  numIndsVar = "n", level1ID = "id")

In this example, the 10 clusters will have varying numbers of observations per period. Here are the counts for three sites:

-
dc[site %in% c(1, 3, 7), .(site, period, n)]
+
dc[site %in% c(1, 3, 7), .(site, period, n)]
##    site period n
 ## 1:    1      0 4
 ## 2:    1      1 2
-## 3:    1      2 2
+## 3:    1      2 3
 ## 4:    3      0 3
-## 5:    3      1 2
+## 5:    3      1 3
 ## 6:    3      2 3
-## 7:    7      0 4
-## 8:    7      1 4
-## 9:    7      2 4
+## 7: 7 0 3 +## 8: 7 1 2 +## 9: 7 2 3

The sites will also have unique decay rates:

-
r <- round(rbeta(10, 6, 2), 2)
-r[c(1, 3, 7)]
-
## [1] 0.76 0.94 0.87
+
r <- round(rbeta(10, 6, 2), 2)
+r[c(1, 3, 7)]
+
## [1] 0.66 0.85 0.81

Here are the correlation matrices for these three sites:

-
N <- dd[, .N, keyby = .(site, period)][, N]
-
-R <- blockDecayMat(ninds = N , nperiods = 3, rho_w = 0.6, r = r, nclusters = 10)
-
-lapply(R, function(x) round(x,2))[c(1, 3, 7)]
+
N <- dd[, .N, keyby = .(site, period)][, N]
+
+R <- blockDecayMat(ninds = N , nperiods = 3, rho_w = 0.6, r = r, nclusters = 10)
+
+lapply(R, function(x) round(x,2))[c(1, 3, 7)]
## [[1]]
-##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
-## [1,] 1.00 0.60 0.60 0.60 0.46 0.46 0.35 0.35
-## [2,] 0.60 1.00 0.60 0.60 0.46 0.46 0.35 0.35
-## [3,] 0.60 0.60 1.00 0.60 0.46 0.46 0.35 0.35
-## [4,] 0.60 0.60 0.60 1.00 0.46 0.46 0.35 0.35
-## [5,] 0.46 0.46 0.46 0.46 1.00 0.60 0.46 0.46
-## [6,] 0.46 0.46 0.46 0.46 0.60 1.00 0.46 0.46
-## [7,] 0.35 0.35 0.35 0.35 0.46 0.46 1.00 0.60
-## [8,] 0.35 0.35 0.35 0.35 0.46 0.46 0.60 1.00
+##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
+##  [1,] 1.00 0.60 0.60 0.60  0.4  0.4 0.26 0.26 0.26
+##  [2,] 0.60 1.00 0.60 0.60  0.4  0.4 0.26 0.26 0.26
+##  [3,] 0.60 0.60 1.00 0.60  0.4  0.4 0.26 0.26 0.26
+##  [4,] 0.60 0.60 0.60 1.00  0.4  0.4 0.26 0.26 0.26
+##  [5,] 0.40 0.40 0.40 0.40  1.0  0.6 0.40 0.40 0.40
+##  [6,] 0.40 0.40 0.40 0.40  0.6  1.0 0.40 0.40 0.40
+##  [7,] 0.26 0.26 0.26 0.26  0.4  0.4 1.00 0.60 0.60
+##  [8,] 0.26 0.26 0.26 0.26  0.4  0.4 0.60 1.00 0.60
+##  [9,] 0.26 0.26 0.26 0.26  0.4  0.4 0.60 0.60 1.00
 ## 
 ## [[2]]
-##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
-## [1,] 1.00 0.60 0.60 0.56 0.56 0.53 0.53 0.53
-## [2,] 0.60 1.00 0.60 0.56 0.56 0.53 0.53 0.53
-## [3,] 0.60 0.60 1.00 0.56 0.56 0.53 0.53 0.53
-## [4,] 0.56 0.56 0.56 1.00 0.60 0.56 0.56 0.56
-## [5,] 0.56 0.56 0.56 0.60 1.00 0.56 0.56 0.56
-## [6,] 0.53 0.53 0.53 0.56 0.56 1.00 0.60 0.60
-## [7,] 0.53 0.53 0.53 0.56 0.56 0.60 1.00 0.60
-## [8,] 0.53 0.53 0.53 0.56 0.56 0.60 0.60 1.00
+##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
+##  [1,] 1.00 0.60 0.60 0.51 0.51 0.51 0.43 0.43 0.43
+##  [2,] 0.60 1.00 0.60 0.51 0.51 0.51 0.43 0.43 0.43
+##  [3,] 0.60 0.60 1.00 0.51 0.51 0.51 0.43 0.43 0.43
+##  [4,] 0.51 0.51 0.51 1.00 0.60 0.60 0.51 0.51 0.51
+##  [5,] 0.51 0.51 0.51 0.60 1.00 0.60 0.51 0.51 0.51
+##  [6,] 0.51 0.51 0.51 0.60 0.60 1.00 0.51 0.51 0.51
+##  [7,] 0.43 0.43 0.43 0.51 0.51 0.51 1.00 0.60 0.60
+##  [8,] 0.43 0.43 0.43 0.51 0.51 0.51 0.60 1.00 0.60
+##  [9,] 0.43 0.43 0.43 0.51 0.51 0.51 0.60 0.60 1.00
 ## 
 ## [[3]]
-##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
-##  [1,] 1.00 0.60 0.60 0.60 0.52 0.52 0.52 0.52 0.45  0.45  0.45  0.45
-##  [2,] 0.60 1.00 0.60 0.60 0.52 0.52 0.52 0.52 0.45  0.45  0.45  0.45
-##  [3,] 0.60 0.60 1.00 0.60 0.52 0.52 0.52 0.52 0.45  0.45  0.45  0.45
-##  [4,] 0.60 0.60 0.60 1.00 0.52 0.52 0.52 0.52 0.45  0.45  0.45  0.45
-##  [5,] 0.52 0.52 0.52 0.52 1.00 0.60 0.60 0.60 0.52  0.52  0.52  0.52
-##  [6,] 0.52 0.52 0.52 0.52 0.60 1.00 0.60 0.60 0.52  0.52  0.52  0.52
-##  [7,] 0.52 0.52 0.52 0.52 0.60 0.60 1.00 0.60 0.52  0.52  0.52  0.52
-##  [8,] 0.52 0.52 0.52 0.52 0.60 0.60 0.60 1.00 0.52  0.52  0.52  0.52
-##  [9,] 0.45 0.45 0.45 0.45 0.52 0.52 0.52 0.52 1.00  0.60  0.60  0.60
-## [10,] 0.45 0.45 0.45 0.45 0.52 0.52 0.52 0.52 0.60  1.00  0.60  0.60
-## [11,] 0.45 0.45 0.45 0.45 0.52 0.52 0.52 0.52 0.60  0.60  1.00  0.60
-## [12,] 0.45 0.45 0.45 0.45 0.52 0.52 0.52 0.52 0.60  0.60  0.60  1.00
+##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
+## [1,] 1.00 0.60 0.60 0.49 0.49 0.39 0.39 0.39
+## [2,] 0.60 1.00 0.60 0.49 0.49 0.39 0.39 0.39
+## [3,] 0.60 0.60 1.00 0.49 0.49 0.39 0.39 0.39
+## [4,] 0.49 0.49 0.49 1.00 0.60 0.49 0.49 0.49
+## [5,] 0.49 0.49 0.49 0.60 1.00 0.49 0.49 0.49
+## [6,] 0.39 0.39 0.39 0.49 0.49 1.00 0.60 0.60
+## [7,] 0.39 0.39 0.39 0.49 0.49 0.60 1.00 0.60
+## [8,] 0.39 0.39 0.39 0.49 0.49 0.60 0.60 1.00
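
These values follow directly from the block-decay structure: two individuals measured in periods i and j of the same site are correlated at rho_w * r^|i-j|, using that site's decay rate. A quick check (a sketch, not vignette code, using the rates 0.66, 0.85, and 0.81 drawn above):

```r
# expected between-period correlations under block decay:
# rho_w * r for adjacent periods, rho_w * r^2 for periods two apart
rho_w <- 0.6
decay <- c(site1 = 0.66, site3 = 0.85, site7 = 0.81)

sapply(decay, function(r) round(rho_w * r^(1:2), 2))
##      site1 site3 site7
## [1,]  0.40  0.51  0.49
## [2,]  0.26  0.43  0.39
```

These match the off-diagonal blocks of the three matrices printed above.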

And here is code to generate the empirical correlation matrices for the three sites, based on 5000 replications of the data:

-
reps <- lapply(1:5000, 
-  function(x) addCorGen(dd, idvar = "site", corMatrix = R,
-    dist = "poisson", param1 = "lambda", cnames = "y")
-)
-
-drep <- data.table::rbindlist(reps, idcol = "rep")
-
-empir_corr <- function(cluster) {
-  dcrep <- drep[site == cluster, ]
-  dcrep[, seq := 1:.N, keyby = rep]
-  dmat <- as.matrix(dcast(dcrep, rep ~ seq, value.var = "y")[, -1])
-  
-  return(round(cor(dmat), 2))
-}
-
-
-empir_corr(cluster = 1)
-empir_corr(cluster = 3)
-empir_corr(cluster = 7)
+
reps <- lapply(1:5000, 
+  function(x) addCorGen(dd, idvar = "site", corMatrix = R,
+    dist = "poisson", param1 = "lambda", cnames = "y")
+)
+
+drep <- data.table::rbindlist(reps, idcol = "rep")
+
+empir_corr <- function(cluster) {
+  dcrep <- drep[site == cluster, ]
+  dcrep[, seq := 1:.N, keyby = rep]
+  dmat <- as.matrix(dcast(dcrep, rep ~ seq, value.var = "y")[, -1])
+  
+  return(round(cor(dmat), 2))
+}
+
+
+empir_corr(cluster = 1)
+empir_corr(cluster = 3)
+empir_corr(cluster = 7)
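
If the generation is working as intended, each empirical matrix should sit close to its target. A compact check (a sketch, assuming the R and empir_corr objects defined above) is the largest absolute discrepancy per site:

```r
# maximum absolute difference between empirical and target correlations
sapply(c(1, 3, 7), function(s) max(abs(empir_corr(cluster = s) - R[[s]])))
```

With 5000 replications, these differences should be small (on the order of a few hundredths).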

Reference:

Li, Fan, James P. Hughes, Karla Hemming, Monica Taljaard, Edward R.
diff --git a/inst/doc/correlated.R b/inst/doc/correlated.R
index 7bcd2ad..c45a4fd 100644
--- a/inst/doc/correlated.R
+++ b/inst/doc/correlated.R
@@ -1,4 +1,7 @@
-## ---- echo = FALSE, message = FALSE-------------------------------------------
+## ----chunkname, echo=-1-------------------------------------------------------
+data.table::setDTthreads(2)
+
+## ----echo = FALSE, message = FALSE--------------------------------------------
 library(simstudy)
 library(ggplot2)
 library(scales)
@@ -28,7 +31,7 @@ ggtheme <- function(panelback = "white") {
 
 }
 
-## ---- tidy=TRUE---------------------------------------------------------------
+## ----tidy=TRUE----------------------------------------------------------------
 # specifying a specific correlation matrix C
 C <- matrix(c(1,.7,.2, .7, 1, .8, .2, .8, 1),nrow = 3)
 C
@@ -45,7 +48,7 @@ dt[,round(cor(cbind(V1, V2, V3)),1)]
 # estimate standard deviation
 dt[,round(sqrt(diag(var(cbind(V1, V2, V3)))),1)]
 
-## ---- tidy=TRUE---------------------------------------------------------------
+## ----tidy=TRUE----------------------------------------------------------------
 # generate 3 correlated variables with different location but same standard deviation
 # and compound symmetry (cs) correlation matrix with correlation coefficient = 0.4.
 # Other correlation matrix structures are "independent" ("ind") and "auto-regressive" ("ar1").
@@ -60,7 +63,7 @@ dt[,round(cor(cbind(x0, x1, x2)),1)]
 # estimate standard deviation
 dt[,round(sqrt(diag(var(cbind(x0, x1, x2)))),1)]
 
-## ---- tidy = TRUE--------------------------------------------------------------
+## ----tidy = TRUE---------------------------------------------------------------
 # define and generate the original data set
 def <- defData(varname = "x", dist = "normal", formula = 0, variance = 1, id = "cid")
 dt <- genData(1000, def)
diff --git a/inst/doc/correlated.Rmd b/inst/doc/correlated.Rmd
index 9fd6ffe..5e0b48e 100644
--- a/inst/doc/correlated.Rmd
+++ b/inst/doc/correlated.Rmd
@@ -7,6 +7,10 @@ vignette: >
   \usepackage[utf8]{inputenc}
 ---
 
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
+
 ```{r, echo = FALSE, message = FALSE}
 library(simstudy)
 library(ggplot2)
diff --git a/inst/doc/correlated.html b/inst/doc/correlated.html
index 12d76bb..9bad485 100644
--- a/inst/doc/correlated.html
+++ b/inst/doc/correlated.html
@@ -353,19 +353,19 @@

Correlated Data

corMatrix or a correlation coefficient rho and a correlation structure corstr. Here are a few examples:

-
# specifying a specific correlation matrix C
-C <- matrix(c(1, 0.7, 0.2, 0.7, 1, 0.8, 0.2, 0.8, 1), nrow = 3)
-C
+
# specifying a specific correlation matrix C
+C <- matrix(c(1, 0.7, 0.2, 0.7, 1, 0.8, 0.2, 0.8, 1), nrow = 3)
+C
##      [,1] [,2] [,3]
 ## [1,]  1.0  0.7  0.2
 ## [2,]  0.7  1.0  0.8
 ## [3,]  0.2  0.8  1.0
-
set.seed(282726)
-
-# generate 3 correlated variables with different location and scale for each
-# field
-dt <- genCorData(1000, mu = c(4, 12, 3), sigma = c(1, 2, 3), corMatrix = C)
-dt
+
set.seed(282726)
+
+# generate 3 correlated variables with different location and scale for each
+# field
+dt <- genCorData(1000, mu = c(4, 12, 3), sigma = c(1, 2, 3), corMatrix = C)
+dt
##         id       V1       V2        V3
 ##    1:    1 4.125728 12.92567  3.328106
 ##    2:    2 4.712100 14.26502  8.876664
@@ -378,24 +378,24 @@ 

Correlated Data

##  998:  998 3.856643 13.17697  4.720628
##  999:  999 4.738479 12.64438  2.979415
## 1000: 1000 5.766867 13.51827  1.693172
-
# estimate correlation matrix
-dt[, round(cor(cbind(V1, V2, V3)), 1)]
+
# estimate correlation matrix
+dt[, round(cor(cbind(V1, V2, V3)), 1)]
##     V1  V2  V3
 ## V1 1.0 0.7 0.2
 ## V2 0.7 1.0 0.8
 ## V3 0.2 0.8 1.0
-
# estimate standard deviation
-dt[, round(sqrt(diag(var(cbind(V1, V2, V3)))), 1)]
+
# estimate standard deviation
+dt[, round(sqrt(diag(var(cbind(V1, V2, V3)))), 1)]
##  V1  V2  V3 
 ## 0.9 1.9 3.0
-
# generate 3 correlated variables with different location but same standard
-# deviation and compound symmetry (cs) correlation matrix with correlation
-# coefficient = 0.4.  Other correlation matrix structures are 'independent'
-# ('ind') and 'auto-regressive' ('ar1').
-
-dt <- genCorData(1000, mu = c(4, 12, 3), sigma = 3, rho = 0.4, corstr = "cs", cnames = c("x0",
-    "x1", "x2"))
-dt
+
# generate 3 correlated variables with different location but same standard
+# deviation and compound symmetry (cs) correlation matrix with correlation
+# coefficient = 0.4.  Other correlation matrix structures are 'independent'
+# ('ind') and 'auto-regressive' ('ar1').
+
+dt <- genCorData(1000, mu = c(4, 12, 3), sigma = 3, rho = 0.4, corstr = "cs", cnames = c("x0",
+    "x1", "x2"))
+dt
##         id        x0        x1         x2
 ##    1:    1 7.1160161 14.294748  4.0251237
 ##    2:    2 3.5429823  8.299333  4.5620657
@@ -408,28 +408,28 @@ 

Correlated Data

##  998:  998 3.3079075 11.909745 -0.7375013
##  999:  999 3.7934154 10.515881  2.6021325
## 1000: 1000 5.6413141 13.513672  7.5321371
-
# estimate correlation matrix
-dt[, round(cor(cbind(x0, x1, x2)), 1)]
+
# estimate correlation matrix
+dt[, round(cor(cbind(x0, x1, x2)), 1)]
##     x0  x1  x2
 ## x0 1.0 0.5 0.4
 ## x1 0.5 1.0 0.4
 ## x2 0.4 0.4 1.0
-
# estimate standard deviation
-dt[, round(sqrt(diag(var(cbind(x0, x1, x2)))), 1)]
+
# estimate standard deviation
+dt[, round(sqrt(diag(var(cbind(x0, x1, x2)))), 1)]
##  x0  x1  x2 
 ## 2.9 3.0 3.0

The new data generated by genCorData can be merged with an existing data set. Alternatively, addCorData will do this directly:

-
# define and generate the original data set
-def <- defData(varname = "x", dist = "normal", formula = 0, variance = 1, id = "cid")
-dt <- genData(1000, def)
-
-# add new correlate fields a0 and a1 to 'dt'
-dt <- addCorData(dt, idname = "cid", mu = c(0, 0), sigma = c(2, 0.2), rho = -0.2,
-    corstr = "cs", cnames = c("a0", "a1"))
-
-dt
+
# define and generate the original data set
+def <- defData(varname = "x", dist = "normal", formula = 0, variance = 1, id = "cid")
+dt <- genData(1000, def)
+
+# add new correlated fields a0 and a1 to 'dt'
+dt <- addCorData(dt, idname = "cid", mu = c(0, 0), sigma = c(2, 0.2), rho = -0.2,
+    corstr = "cs", cnames = c("a0", "a1"))
+
+dt
##        cid          x          a0          a1
 ##    1:    1 -0.4707940  0.97711194 -0.09127123
 ##    2:    2 -1.8723668  2.70498417 -0.27102780
@@ -442,13 +442,13 @@ 

Correlated Data

##  998:  998  0.1015744 -0.09821567  0.06440723
##  999:  999 -0.5788317  0.16870967 -0.03890117
## 1000: 1000 -1.6175613  3.61182553 -0.46220263
-
# estimate correlation matrix
-dt[, round(cor(cbind(a0, a1)), 1)]
+
# estimate correlation matrix
+dt[, round(cor(cbind(a0, a1)), 1)]
##      a0   a1
 ## a0  1.0 -0.2
 ## a1 -0.2  1.0
-
# estimate standard deviation
-dt[, round(sqrt(diag(var(cbind(a0, a1)))), 1)]
+
# estimate standard deviation
+dt[, round(sqrt(diag(var(cbind(a0, a1)))), 1)]
##  a0  a1 
 ## 2.0 0.2
@@ -466,10 +466,10 @@

Correlated data: additional distributions

distribution. We start by specifying the mean of the Poisson distribution for each new variable, and then we specify the correlation structure, just as we did with the normal distribution.

-
l <- c(8, 10, 12) # lambda for each new variable
-
-dx <- genCorGen(1000, nvars = 3, params1 = l, dist = "poisson", rho = .3, corstr = "cs", wide = TRUE)
-dx
+
l <- c(8, 10, 12) # lambda for each new variable
+
+dx <- genCorGen(1000, nvars = 3, params1 = l, dist = "poisson", rho = .3, corstr = "cs", wide = TRUE)
+dx
##         id V1 V2 V3
 ##    1:    1  5 16 13
 ##    2:    2  9  9  6
@@ -482,14 +482,14 @@ 

Correlated data: additional distributions

##  998:  998  6  8 12
##  999:  999 10 12 11
## 1000: 1000  9  9 12
-
round(cor(as.matrix(dx[, .(V1, V2, V3)])), 2)
+
round(cor(as.matrix(dx[, .(V1, V2, V3)])), 2)
##     V1   V2   V3
 ## V1 1.0 0.30 0.30
 ## V2 0.3 1.00 0.24
 ## V3 0.3 0.24 1.00

We can also generate correlated binary data by specifying the probabilities:

-
genCorGen(1000, nvars = 3, params1 = c(.3, .5, .7), dist = "binary", rho = .8, corstr = "cs", wide = TRUE)
+
genCorGen(1000, nvars = 3, params1 = c(.3, .5, .7), dist = "binary", rho = .8, corstr = "cs", wide = TRUE)
##         id V1 V2 V3
 ##    1:    1  0  1  1
 ##    2:    2  0  1  1
@@ -505,8 +505,8 @@ 

Correlated data: additional distributions

The gamma distribution requires two parameters - the mean and dispersion. (These are converted into the more commonly used shape and rate parameters.)
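
For reference, the conversion itself is simple: shape = 1/dispersion and rate = 1/(mean * dispersion), so that the mean is shape/rate and the variance is mean^2 * dispersion. A small sketch (mean = 8 and dispersion = 1 are arbitrary values, matching the first elements of l and params2 below):

```r
# convert a mean/dispersion pair to the shape/rate parameterization
mu <- 8; disp <- 1
c(shape = 1 / disp, rate = 1 / (mu * disp))

# simstudy also ships a helper for this conversion; the call below is
# assumed from the package documentation rather than shown in this vignette
# simstudy::gammaGetShapeRate(mean = mu, dispersion = disp)
```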

-
dx <- genCorGen(1000, nvars = 3, params1 = l, params2 = c(1,1,1), dist = "gamma", rho = .7, corstr = "cs", wide = TRUE, cnames="a, b, c")
-dx
+
dx <- genCorGen(1000, nvars = 3, params1 = l, params2 = c(1,1,1), dist = "gamma", rho = .7, corstr = "cs", wide = TRUE, cnames="a, b, c")
+dx
##         id            a          b           c
 ##    1:    1 4.137889e+00  1.9736693  5.73317661
 ##    2:    2 6.230611e-04  0.1790216  0.01098133
@@ -519,7 +519,7 @@ 

Correlated data: additional distributions

##  998:  998 3.492819e+00  4.1504352  2.37403911
##  999:  999 9.336809e+00 21.2184483 25.17933311
## 1000: 1000 2.044966e+01 32.3326247 23.81715119
-
round(cor(as.matrix(dx[, .(a, b, c)])), 2)
+
round(cor(as.matrix(dx[, .(a, b, c)])), 2)
##      a    b    c
 ## a 1.00 0.63 0.67
 ## b 0.63 1.00 0.67
@@ -530,8 +530,8 @@ 

Correlated data: additional distributions

the long form, where the correlated data are on different rows, so that there are repeated measurements for each id. An id will have multiple records (i.e. one id will appear on multiple rows):

-
dx <- genCorGen(1000, nvars = 3, params1 = l, params2 = c(1,1,1), dist = "gamma", rho = .7, corstr = "cs", wide = FALSE, cnames="NewCol")
-dx
+
dx <- genCorGen(1000, nvars = 3, params1 = l, params2 = c(1,1,1), dist = "gamma", rho = .7, corstr = "cs", wide = FALSE, cnames="NewCol")
+dx
##         id period     NewCol
 ##    1:    1      0 0.08868527
 ##    2:    1      1 0.17558015
@@ -553,14 +553,14 @@ 

Correlated data: additional distributions

of correlated data with means that are a function of the variable xbase, which varies by id.

First we define the data and generate a data set:

-
def <- defData(varname = "xbase", formula = 5, variance = .2, dist = "gamma", id = "cid")
-def <- defData(def, varname = "lambda", formula = ".5 + .1*xbase", dist="nonrandom", link = "log")
-def <- defData(def, varname = "p", formula = "-2 + .3*xbase", dist="nonrandom", link = "logit")
-def <- defData(def, varname = "gammaMu", formula = ".5 + .2*xbase", dist="nonrandom", link = "log")
-def <- defData(def, varname = "gammaDis", formula = 1, dist="nonrandom")
-
-dt <- genData(10000, def)
-dt
+
def <- defData(varname = "xbase", formula = 5, variance = .2, dist = "gamma", id = "cid")
+def <- defData(def, varname = "lambda", formula = ".5 + .1*xbase", dist="nonrandom", link = "log")
+def <- defData(def, varname = "p", formula = "-2 + .3*xbase", dist="nonrandom", link = "logit")
+def <- defData(def, varname = "gammaMu", formula = ".5 + .2*xbase", dist="nonrandom", link = "log")
+def <- defData(def, varname = "gammaDis", formula = 1, dist="nonrandom")
+
+dt <- genData(10000, def)
+dt
##          cid    xbase   lambda         p  gammaMu gammaDis
 ##     1:     1 1.546326 1.924435 0.1771026 2.246257        1
 ##     2:     2 5.689908 2.912439 0.4272628 5.144775        1
@@ -574,121 +574,121 @@ 

Correlated data: additional distributions

##  9999:  9999 3.393940 2.314964 0.2725312 3.250432        1
## 10000: 10000 7.722561 3.568895 0.5785365 7.725390        1

The Poisson distribution has a single parameter, lambda:

-
dtX1 <- addCorGen(dtOld = dt, idvar = "cid", nvars = 3, rho = .1, corstr = "cs",
-                    dist = "poisson", param1 = "lambda", cnames = "a, b, c")
-dtX1
-
##          cid    xbase   lambda         p  gammaMu gammaDis a b c
-##     1:     1 1.546326 1.924435 0.1771026 2.246257        1 2 2 2
-##     2:     2 5.689908 2.912439 0.4272628 5.144775        1 1 4 5
-##     3:     3 5.059867 2.734604 0.3817705 4.535672        1 4 2 1
-##     4:     4 4.599528 2.611573 0.3497493 4.136730        1 2 2 1
-##     5:     5 2.402442 2.096447 0.2176749 2.665758        1 1 3 2
-##    ---                                                          
-##  9996:  9996 3.610769 2.365707 0.2856166 3.394491        1 2 0 1
-##  9997:  9997 4.984305 2.714019 0.3764348 4.467643        1 4 2 4
-##  9998:  9998 5.122724 2.751847 0.3862310 4.593052        1 3 2 4
-##  9999:  9999 3.393940 2.314964 0.2725312 3.250432        1 1 3 0
-## 10000: 10000 7.722561 3.568895 0.5785365 7.725390        1 1 5 6
+
dtX1 <- addCorGen(dtOld = dt, idvar = "cid", nvars = 3, rho = .1, corstr = "cs",
+                    dist = "poisson", param1 = "lambda", cnames = "a, b, c")
+dtX1
+
##          cid    xbase   lambda         p  gammaMu gammaDis  a b c
+##     1:     1 1.546326 1.924435 0.1771026 2.246257        1  2 2 2
+##     2:     2 5.689908 2.912439 0.4272628 5.144775        1  1 0 2
+##     3:     3 5.059867 2.734604 0.3817705 4.535672        1  4 0 2
+##     4:     4 4.599528 2.611573 0.3497493 4.136730        1  3 1 3
+##     5:     5 2.402442 2.096447 0.2176749 2.665758        1  5 4 1
+##    ---                                                           
+##  9996:  9996 3.610769 2.365707 0.2856166 3.394491        1  1 2 2
+##  9997:  9997 4.984305 2.714019 0.3764348 4.467643        1  3 2 4
+##  9998:  9998 5.122724 2.751847 0.3862310 4.593052        1  7 2 1
+##  9999:  9999 3.393940 2.314964 0.2725312 3.250432        1  6 0 4
+## 10000: 10000 7.722561 3.568895 0.5785365 7.725390        1 12 1 3

The Bernoulli (binary) distribution has a single parameter, p:

-
dtX2 <- addCorGen(dtOld = dt, idvar = "cid", nvars = 4, rho = .4, corstr = "ar1",
-                    dist = "binary", param1 = "p")
-dtX2
+
dtX2 <- addCorGen(dtOld = dt, idvar = "cid", nvars = 4, rho = .4, corstr = "ar1",
+                    dist = "binary", param1 = "p")
+dtX2
##          cid    xbase   lambda         p  gammaMu gammaDis V1 V2 V3 V4
-##     1:     1 1.546326 1.924435 0.1771026 2.246257        1  0  0  0  0
-##     2:     2 5.689908 2.912439 0.4272628 5.144775        1  1  1  0  0
-##     3:     3 5.059867 2.734604 0.3817705 4.535672        1  0  0  0  0
-##     4:     4 4.599528 2.611573 0.3497493 4.136730        1  1  0  0  0
-##     5:     5 2.402442 2.096447 0.2176749 2.665758        1  0  0  1  0
+##     1:     1 1.546326 1.924435 0.1771026 2.246257        1  0  0  1  0
+##     2:     2 5.689908 2.912439 0.4272628 5.144775        1  0  0  0  1
+##     3:     3 5.059867 2.734604 0.3817705 4.535672        1  1  0  1  1
+##     4:     4 4.599528 2.611573 0.3497493 4.136730        1  0  0  0  1
+##     5:     5 2.402442 2.096447 0.2176749 2.665758        1  0  1  0  0
 ##    ---                                                                
-##  9996:  9996 3.610769 2.365707 0.2856166 3.394491        1  0  0  0  1
-##  9997:  9997 4.984305 2.714019 0.3764348 4.467643        1  0  1  0  1
+##  9996:  9996 3.610769 2.365707 0.2856166 3.394491        1  0  0  0  0
+##  9997:  9997 4.984305 2.714019 0.3764348 4.467643        1  0  1  1  0
 ##  9998:  9998 5.122724 2.751847 0.3862310 4.593052        1  0  0  0  1
 ##  9999:  9999 3.393940 2.314964 0.2725312 3.250432        1  0  0  0  0
 ## 10000: 10000 7.722561 3.568895 0.5785365 7.725390        1  0  0  1  0

The Gamma distribution has two parameters - in simstudy the mean and dispersion are specified:

-
dtX3 <- addCorGen(dtOld = dt, idvar = "cid", nvars = 4, rho = .4, corstr = "cs",
-                  dist = "gamma", param1 = "gammaMu", param2 = "gammaDis")
-dtX3
-
##          cid    xbase   lambda         p  gammaMu gammaDis         V1
-##     1:     1 1.546326 1.924435 0.1771026 2.246257        1 2.58709530
-##     2:     2 5.689908 2.912439 0.4272628 5.144775        1 0.07934567
-##     3:     3 5.059867 2.734604 0.3817705 4.535672        1 2.04176932
-##     4:     4 4.599528 2.611573 0.3497493 4.136730        1 8.25436952
-##     5:     5 2.402442 2.096447 0.2176749 2.665758        1 6.28127618
-##    ---                                                               
-##  9996:  9996 3.610769 2.365707 0.2856166 3.394491        1 6.64095852
-##  9997:  9997 4.984305 2.714019 0.3764348 4.467643        1 0.97710914
-##  9998:  9998 5.122724 2.751847 0.3862310 4.593052        1 6.51628773
-##  9999:  9999 3.393940 2.314964 0.2725312 3.250432        1 0.75152645
-## 10000: 10000 7.722561 3.568895 0.5785365 7.725390        1 1.39958999
-##                V2         V3         V4
-##     1:  2.1207941 7.37182655 1.49892176
-##     2: 15.2490909 0.01732443 0.04768341
-##     3:  0.1542492 4.43078463 2.76400220
-##     4:  2.3934427 1.88930272 2.23746274
-##     5:  2.5867935 6.63968812 2.03710012
-##    ---                                 
-##  9996:  1.8097451 2.95765819 4.63708493
-##  9997:  2.3576130 4.16571168 3.10540611
-##  9998:  6.8157548 3.81045976 3.76213866
-##  9999:  2.8992568 1.93141748 1.05365544
-## 10000:  4.8134691 2.38985964 1.75207835
+
dtX3 <- addCorGen(dtOld = dt, idvar = "cid", nvars = 4, rho = .4, corstr = "cs",
+                  dist = "gamma", param1 = "gammaMu", param2 = "gammaDis")
+dtX3
+
##          cid    xbase   lambda         p  gammaMu gammaDis          V1
+##     1:     1 1.546326 1.924435 0.1771026 2.246257        1 4.231680194
+##     2:     2 5.689908 2.912439 0.4272628 5.144775        1 6.787710358
+##     3:     3 5.059867 2.734604 0.3817705 4.535672        1 3.707544002
+##     4:     4 4.599528 2.611573 0.3497493 4.136730        1 0.348766326
+##     5:     5 2.402442 2.096447 0.2176749 2.665758        1 2.493923583
+##    ---                                                                
+##  9996:  9996 3.610769 2.365707 0.2856166 3.394491        1 0.002486867
+##  9997:  9997 4.984305 2.714019 0.3764348 4.467643        1 3.210241942
+##  9998:  9998 5.122724 2.751847 0.3862310 4.593052        1 9.556894110
+##  9999:  9999 3.393940 2.314964 0.2725312 3.250432        1 1.349413306
+## 10000: 10000 7.722561 3.568895 0.5785365 7.725390        1 2.404109193
+##               V2         V3         V4
+##     1: 1.2176380  0.4222348  2.1937488
+##     2: 4.6349836  6.5642077  3.1765033
+##     3: 1.5053746  1.1395938  1.0282041
+##     4: 7.7065647 15.1843651  1.5413112
+##     5: 0.6724540  2.0454487  1.5117235
+##    ---                                
+##  9996: 0.4825269  0.6716621  0.6721581
+##  9997: 0.2103485  4.9869686  3.8444291
+##  9998: 3.5482244 10.5174231  9.9577729
+##  9999: 2.3406277  1.7019004  0.8970987
+## 10000: 2.7987448  3.6667012 12.1680528

If we have data in long form (e.g. longitudinal data), the function will recognize the structure:

-
def <- defData(varname = "xbase", formula = 5, variance = .4, dist = "gamma", id = "cid")
-def <- defData(def, "nperiods", formula = 3, dist = "noZeroPoisson")
-
-def2 <- defDataAdd(varname = "lambda", formula = ".5+.5*period + .1*xbase", dist="nonrandom", link = "log")
-
-dt <- genData(1000, def)
-
-dtLong <- addPeriods(dt, idvars = "cid", nPeriods = 3)
-dtLong <- addColumns(def2, dtLong)
-
-dtLong
-
##        cid period     xbase nperiods timeID    lambda
-##    1:    1      0 11.345477        1      1  5.127138
-##    2:    1      1 11.345477        1      2  8.453222
-##    3:    1      2 11.345477        1      3 13.937007
-##    4:    2      0  2.859828        5      4  2.194563
-##    5:    2      1  2.859828        5      5  3.618222
-##   ---                                                
-## 2996:  999      1  5.790138        3   2996  4.850170
-## 2997:  999      2  5.790138        3   2997  7.996579
-## 2998: 1000      0 10.958208        3   2998  4.932376
-## 2999: 1000      1 10.958208        3   2999  8.132113
-## 3000: 1000      2 10.958208        3   3000 13.407588
-
### Generate the data 
-
-dtX3 <- addCorGen(dtOld = dtLong, idvar = "cid", nvars = 3, rho = .6, corstr = "cs",
-                  dist = "poisson", param1 = "lambda", cnames = "NewPois")
-dtX3
-
##        cid period     xbase nperiods timeID    lambda NewPois
-##    1:    1      0 11.345477        1      1  5.127138       5
-##    2:    1      1 11.345477        1      2  8.453222       6
-##    3:    1      2 11.345477        1      3 13.937007      18
-##    4:    2      0  2.859828        5      4  2.194563       3
-##    5:    2      1  2.859828        5      5  3.618222       5
-##   ---                                                        
-## 2996:  999      1  5.790138        3   2996  4.850170       4
-## 2997:  999      2  5.790138        3   2997  7.996579       6
-## 2998: 1000      0 10.958208        3   2998  4.932376       2
-## 2999: 1000      1 10.958208        3   2999  8.132113       4
-## 3000: 1000      2 10.958208        3   3000 13.407588       6
+
def <- defData(varname = "xbase", formula = 5, variance = .4, dist = "gamma", id = "cid")
+def <- defData(def, "nperiods", formula = 3, dist = "noZeroPoisson")
+
+def2 <- defDataAdd(varname = "lambda", formula = ".5+.5*period + .1*xbase", dist="nonrandom", link = "log")
+
+dt <- genData(1000, def)
+
+dtLong <- addPeriods(dt, idvars = "cid", nPeriods = 3)
+dtLong <- addColumns(def2, dtLong)
+
+dtLong
+
##        cid period    xbase nperiods timeID    lambda
+##    1:    1      0 7.053471        2      1  3.337917
+##    2:    1      1 7.053471        2      2  5.503295
+##    3:    1      2 7.053471        2      3  9.073400
+##    4:    2      0 2.185136        3      4  2.051382
+##    5:    2      1 2.185136        3      5  3.382157
+##   ---                                               
+## 2996:  999      1 9.702454        1   2996  7.172436
+## 2997:  999      2 9.702454        1   2997 11.825348
+## 2998: 1000      0 3.044209        4   2998  2.235402
+## 2999: 1000      1 3.044209        4   2999  3.685554
+## 3000: 1000      2 3.044209        4   3000  6.076452
+
### Generate the data 
+
+dtX3 <- addCorGen(dtOld = dtLong, idvar = "cid", nvars = 3, rho = .6, corstr = "cs",
+                  dist = "poisson", param1 = "lambda", cnames = "NewPois")
+dtX3
+
##        cid period    xbase nperiods timeID    lambda NewPois
+##    1:    1      0 7.053471        2      1  3.337917       5
+##    2:    1      1 7.053471        2      2  5.503295       6
+##    3:    1      2 7.053471        2      3  9.073400      11
+##    4:    2      0 2.185136        3      4  2.051382       2
+##    5:    2      1 2.185136        3      5  3.382157       1
+##   ---                                                       
+## 2996:  999      1 9.702454        1   2996  7.172436      10
+## 2997:  999      2 9.702454        1   2997 11.825348      11
+## 2998: 1000      0 3.044209        4   2998  2.235402       4
+## 2999: 1000      1 3.044209        4   2999  3.685554       7
+## 3000: 1000      2 3.044209        4   3000  6.076452       9

We can fit a generalized estimating equation (GEE) model and examine the coefficients and the working correlation matrix. They closely match the data generating parameters:

-
geefit <- gee(NewPois ~ period + xbase, data = dtX3, id = cid, family = poisson, corstr = "exchangeable")
+
geefit <- gee(NewPois ~ period + xbase, data = dtX3, id = cid, family = poisson, corstr = "exchangeable")
## Beginning Cgee S-function, @(#) geeformula.q 4.13 98/01/27
## running glm to get initial regression estimate
## (Intercept)      period       xbase 
-##   0.4806614   0.4948942   0.1063911
-
round(summary(geefit)$working.correlation, 2)
+## 0.4915131 0.4929285 0.1030995
+
round(summary(geefit)$working.correlation, 2)
##      [,1] [,2] [,3]
-## [1,] 1.00 0.61 0.61
-## [2,] 0.61 1.00 0.61
-## [3,] 0.61 0.61 1.00
+## [1,] 1.00 0.56 0.56
+## [2,] 0.56 1.00 0.56
+## [3,] 0.56 0.56 1.00
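
As a further check (a sketch, assuming the geefit object above), the estimated coefficients can be printed alongside the generating values of 0.5, 0.5, and 0.1:

```r
# GEE estimates versus the data generating coefficients (0.5, 0.5, 0.1)
round(coef(geefit), 2)
```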
diff --git a/inst/doc/double_dot_extension.R b/inst/doc/double_dot_extension.R
index 4c7f763..8b8fcf7 100644
--- a/inst/doc/double_dot_extension.R
+++ b/inst/doc/double_dot_extension.R
@@ -1,4 +1,7 @@
-## ---- echo = FALSE, message = FALSE-------------------------------------------
+## ----chunkname, echo=-1-------------------------------------------------------
+data.table::setDTthreads(2)
+
+## ----echo = FALSE, message = FALSE--------------------------------------------
 options(digits = 3)
 
 library(simstudy)
@@ -81,7 +84,7 @@ fit <- summary(lm(y ~ x, data = dd))
 coef(fit)
 fit$sigma
 
-## ---- fig.width = 5-----------------------------------------------------------
+## ----fig.width = 5------------------------------------------------------------
 sigma2s <- c(1, 2, 6, 9)
 
 gen_data <- function(sigma2, d) {
@@ -138,20 +141,20 @@ d1 <- defData(d1, varname = "b", formula = ".5;.5", variance = "1;2", dist = "ca
 d1 <- defData(d1, varname = "outcome", formula = "..effect[a, b]", dist="nonrandom")
 
 ## -----------------------------------------------------------------------------
-dx <- genData(8, d1)
+dx <- genData(1000, d1)
 dx
 
 ## -----------------------------------------------------------------------------
 d1 <- updateDef(d1, "outcome", newvariance = 9, newdist = "normal")
 dx <- genData(1000, d1)
 
-## ---- echo=FALSE--------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
 dsum <- dx[, .(outcome=mean(outcome)), keyby = .(a, b)]
 
 ggplot(data = dx, aes(x = factor(a), y = outcome)) +
   geom_jitter(aes(color = factor(b)), width = .2, alpha = .4, size = .2) +
   geom_point(data = dsum, size = 2, aes(color = factor(b))) +
-  geom_line(data = dsum, size = 1, aes(color = factor(b), group = factor(b))) +
+  geom_line(data = dsum, linewidth = 1, aes(color = factor(b), group = factor(b))) +
   scale_color_manual(values = cbbPalette, name = " b") +
   theme(panel.grid = element_blank()) +
   xlab ("a")
diff --git a/inst/doc/double_dot_extension.Rmd b/inst/doc/double_dot_extension.Rmd
index 32a1c82..0ce7df5 100644
--- a/inst/doc/double_dot_extension.Rmd
+++ b/inst/doc/double_dot_extension.Rmd
@@ -7,6 +7,10 @@ vignette: >
   \usepackage[utf8]{inputenc}
 ---
 
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
+
 ```{r, echo = FALSE, message = FALSE}
 options(digits = 3)
 
@@ -196,7 +200,7 @@ d1 <- defData(d1, varname = "outcome", formula = "..effect[a, b]", dist="nonrand
 ```
 
 ```{r}
-dx <- genData(8, d1)
+dx <- genData(1000, d1)
 dx
 ```
 
@@ -215,7 +219,7 @@ dsum <- dx[, .(outcome=mean(outcome)), keyby = .(a, b)]
 ggplot(data = dx, aes(x = factor(a), y = outcome)) +
   geom_jitter(aes(color = factor(b)), width = .2, alpha = .4, size = .2) +
   geom_point(data = dsum, size = 2, aes(color = factor(b))) +
-  geom_line(data = dsum, size = 1, aes(color = factor(b), group = factor(b))) +
+  geom_line(data = dsum, linewidth = 1, aes(color = factor(b), group = factor(b))) +
   scale_color_manual(values = cbbPalette, name = " b") +
   theme(panel.grid = element_blank()) +
   xlab ("a")
diff --git a/inst/doc/double_dot_extension.html b/inst/doc/double_dot_extension.html
index f0c4ca6..6c4f362 100644
--- a/inst/doc/double_dot_extension.html
+++ b/inst/doc/double_dot_extension.html
@@ -360,19 +360,19 @@

Updating existing definition tables

The original data set definition includes three variables x, y, and z, all normally distributed:

-
defs <- defData(varname = "x", formula = 0, variance = 3, dist = "normal")
-defs <- defData(defs, varname = "y", formula = "2 + 3*x", variance = 1, dist = "normal")
-defs <- defData(defs, varname = "z", formula = "4 + 3*x - 2*y", variance = 1, dist = "normal")
-
-defs
+
defs <- defData(varname = "x", formula = 0, variance = 3, dist = "normal")
+defs <- defData(defs, varname = "y", formula = "2 + 3*x", variance = 1, dist = "normal")
+defs <- defData(defs, varname = "z", formula = "4 + 3*x - 2*y", variance = 1, dist = "normal")
+
+defs
##    varname       formula variance   dist     link
 ## 1:       x             0        3 normal identity
 ## 2:       y       2 + 3*x        1 normal identity
 ## 3:       z 4 + 3*x - 2*y        1 normal identity

In the first case, we are changing the relationship of y with x as well as the variance:

-
defs <- updateDef(dtDefs = defs, changevar = "y", newformula = "x + 5", newvariance = 2)
-defs
+
defs <- updateDef(dtDefs = defs, changevar = "y", newformula = "x + 5", newvariance = 2)
+defs
##    varname       formula variance   dist     link
 ## 1:       x             0        3 normal identity
 ## 2:       y         x + 5        2 normal identity
@@ -380,8 +380,8 @@ 

Updating existing definition tables

In this second case, we are changing the distribution of z to Poisson and updating the link function to log:

-
defs <- updateDef(dtDefs = defs, changevar = "z", newdist = "poisson", newlink = "log")
-defs
+
defs <- updateDef(dtDefs = defs, changevar = "z", newdist = "poisson", newlink = "log")
+defs
##    varname       formula variance    dist     link
 ## 1:       x             0        3  normal identity
 ## 2:       y         x + 5        2  normal identity
@@ -391,8 +391,8 @@ 

Updating existing definition tables

defData that it is not possible to remove a variable that is a predictor of a subsequent variable, such as x or y in this case.

-
defs <- updateDef(dtDefs = defs, changevar = "z", remove = TRUE)
-defs
+
defs <- updateDef(dtDefs = defs, changevar = "z", remove = TRUE)
+defs
##    varname formula variance   dist     link
 ## 1:       x       0        3 normal identity
 ## 2:       y   x + 5        2 normal identity
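
To see that constraint in action (a hypothetical illustration, not part of the vignette), an attempt to remove x should fail, since y is defined in terms of it:

```r
# removing x is expected to error because the definition of y depends on it
res <- tryCatch(
  updateDef(dtDefs = defs, changevar = "x", remove = TRUE),
  error = function(e) conditionMessage(e)
)
res  # an error message rather than an updated definition table
```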
@@ -405,63 +405,63 @@

Double-dot external variable reference

of hyperparameter of the data generation process. The reference is made directly in the formula itself, using a double-dot (“..”) notation before the variable name. Here is a simple example:

-
def <- defData(varname = "x", formula = 0, 
-  variance = 5, dist = "normal")
-def <- defData(def, varname = "y", formula = "..B0 + ..B1 * x", 
-  variance = "..sigma2", dist = "normal")
-
-def
+
def <- defData(varname = "x", formula = 0, 
+  variance = 5, dist = "normal")
+def <- defData(def, varname = "y", formula = "..B0 + ..B1 * x", 
+  variance = "..sigma2", dist = "normal")
+
+def
##    varname         formula variance   dist     link
 ## 1:       x               0        5 normal identity
 ## 2:       y ..B0 + ..B1 * x ..sigma2 normal identity
-
B0 <- 4;
-B1 <- 2;
-sigma2 <- 9
-
-set.seed(716251)
-
-dd <- genData(100, def)
-
-fit <- summary(lm(y ~ x, data = dd))
-
-coef(fit)
+
B0 <- 4;
+B1 <- 2;
+sigma2 <- 9
+
+set.seed(716251)
+
+dd <- genData(100, def)
+
+fit <- summary(lm(y ~ x, data = dd))
+
+coef(fit)
##             Estimate Std. Error t value Pr(>|t|)
 ## (Intercept)     4.00      0.284    14.1 2.56e-25
 ## x               2.01      0.130    15.4 5.90e-28
-
fit$sigma
+
fit$sigma
## [1] 2.83

It is easy to create a new data set on the fly with a different variance assumption without having to go to the trouble of updating the data definitions.

-
sigma2 <- 16
-
-dd <- genData(100, def)
-fit <- summary(lm(y ~ x, data = dd))
-
-coef(fit)
+
sigma2 <- 16
+
+dd <- genData(100, def)
+fit <- summary(lm(y ~ x, data = dd))
+
+coef(fit)
##             Estimate Std. Error t value Pr(>|t|)
 ## (Intercept)     4.35      0.427   10.19 4.57e-17
 ## x               2.12      0.218    9.75 4.32e-16
-
fit$sigma
+
fit$sigma
## [1] 4.21

The double-dot notation can be flexibly applied using lapply (or the parallel version mclapply) to create a range of data sets under different assumptions:

-
sigma2s <- c(1, 2, 6, 9)
-
-gen_data <- function(sigma2, d) {
-  dd <- genData(200, d)
-  dd$sigma2 <- sigma2
-  dd
-}
-
-dd_4 <- lapply(sigma2s, function(s) gen_data(s, def))
-dd_4 <- rbindlist(dd_4)
-
-ggplot(data = dd_4, aes(x = x, y = y)) +
-  geom_point(size = .5, color = "grey30") +
-  facet_wrap(sigma2 ~ .) +
-  theme(panel.grid = element_blank())
+
sigma2s <- c(1, 2, 6, 9)
+
+gen_data <- function(sigma2, d) {
+  dd <- genData(200, d)
+  dd$sigma2 <- sigma2
+  dd
+}
+
+dd_4 <- lapply(sigma2s, function(s) gen_data(s, def))
+dd_4 <- rbindlist(dd_4)
+
+ggplot(data = dd_4, aes(x = x, y = y)) +
+  geom_point(size = .5, color = "grey30") +
+  facet_wrap(sigma2 ~ .) +
+  theme(panel.grid = element_blank())
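
Since the text mentions mclapply, here is the same sweep run in parallel (a sketch; the mc.cores value is an arbitrary assumption, and forking is not available on Windows):

```r
# parallel version of the sweep above
library(parallel)
dd_4 <- rbindlist(mclapply(sigma2s, function(s) gen_data(s, def), mc.cores = 2))
```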

@@ -471,14 +471,14 @@

Using non-scalar double-dot variable reference

(which we can also do using a categorical distribution), we can define the mixture formula in terms of the vector. In this case we are generating permuted block sizes of 2 and 4:

-
defblk <- defData(varname = "blksize", 
-   formula = "..sizes[1] | .5 + ..sizes[2] | .5", dist = "mixture")
-
-defblk
+
defblk <- defData(varname = "blksize", 
+   formula = "..sizes[1] | .5 + ..sizes[2] | .5", dist = "mixture")
+
+defblk
##    varname                           formula variance    dist     link
 ## 1: blksize ..sizes[1] | .5 + ..sizes[2] | .5        0 mixture identity
-
sizes <- c(2, 4)
-genData(1000, defblk)
+
sizes <- c(2, 4)
+genData(1000, defblk)
##         id blksize
 ##    1:    1       4
 ##    2:    2       4
@@ -496,18 +496,18 @@ 

Using non-scalar double-dot variable reference

weighted average of three numbers using tau as the weights. We could use the following code to estimate a weighted average theta:

-
tau <- rgamma(3, 5, 2)
-tau <- tau / sum(tau)
-tau
+
tau <- rgamma(3, 5, 2)
+tau <- tau / sum(tau)
+tau
## [1] 0.319 0.550 0.132
-
d <- defData(varname = "a", formula = 3, variance = 4)
-d <- defData(d, varname = "b", formula = 8, variance = 2)
-d <- defData(d, varname = "c", formula = 11, variance = 6)
-d <- defData(d, varname = "theta", formula = "..tau[1]*a + ..tau[2]*b + ..tau[3]*c", 
-  dist = "nonrandom")
-
-set.seed(1)
-genData(4, d)
+
d <- defData(varname = "a", formula = 3, variance = 4)
+d <- defData(d, varname = "b", formula = 8, variance = 2)
+d <- defData(d, varname = "c", formula = 11, variance = 6)
+d <- defData(d, varname = "theta", formula = "..tau[1]*a + ..tau[2]*b + ..tau[3]*c", 
+  dist = "nonrandom")
+
+set.seed(1)
+genData(4, d)
##    id    a    b    c theta
 ## 1:  1 1.75 8.47 12.4  6.84
 ## 2:  2 3.37 6.84 10.3  6.18
@@ -515,10 +515,10 @@ 

Using non-scalar double-dot variable reference

## 4: 4 6.19 9.04 12.0 8.52
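
As a spot check (a sketch, reusing the tau drawn above), theta is just the tau-weighted average of a, b, and c, so weighted.mean reproduces it row by row:

```r
# recompute theta with weighted.mean; identical since sum(tau) == 1
set.seed(1)
dd <- genData(4, d)
dd[, .(theta, check = round(weighted.mean(c(a, b, c), w = tau), 2)), keyby = id]
```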

We can simplify the calculation of theta by using matrix multiplication:

-
d <- updateDef(d, changevar = "theta", newformula = "t(..tau) %*% c(a, b, c)")
-
-set.seed(1)
-genData(4, d)
+
d <- updateDef(d, changevar = "theta", newformula = "t(..tau) %*% c(a, b, c)")
+
+set.seed(1)
+genData(4, d)
##    id    a    b    c theta
 ## 1:  1 1.75 8.47 12.4  6.84
 ## 2:  2 3.37 6.84 10.3  6.18
@@ -530,39 +530,37 @@ 

Using non-scalar double-dot variable reference

interventions \(a\) and \(b\), we can use a simple matrix and draw the means directly from the matrix, which in this example is stored in the variable effect:

-
effect <- matrix(c(0, 4, 5, 7), nrow = 2)
-effect
+
effect <- matrix(c(0, 4, 5, 7), nrow = 2)
+effect
##      [,1] [,2]
 ## [1,]    0    5
 ## [2,]    4    7

Using double dot notation, it is possible to reference the matrix cell values directly:

-
d1 <- defData(varname = "a", formula = ".5;.5", variance = "1;2", dist = "categorical")
-d1 <- defData(d1, varname = "b", formula = ".5;.5", variance = "1;2", dist = "categorical")
-d1 <- defData(d1, varname = "outcome", formula = "..effect[a, b]", dist="nonrandom")
-
dx <- genData(8, d1)
-dx
-
##    id a b outcome
-## 1:  1 2 2       7
-## 2:  2 2 2       7
-## 3:  3 2 1       4
-## 4:  4 2 1       4
-## 5:  5 1 1       0
-## 6:  6 2 2       7
-## 7:  7 2 1       4
-## 8:  8 1 2       5
+
d1 <- defData(varname = "a", formula = ".5;.5", variance = "1;2", dist = "categorical")
+d1 <- defData(d1, varname = "b", formula = ".5;.5", variance = "1;2", dist = "categorical")
+d1 <- defData(d1, varname = "outcome", formula = "..effect[a, b]", dist="nonrandom")
+
dx <- genData(1000, d1)
+dx
+
##         id a b outcome
+##    1:    1 2 2       7
+##    2:    2 2 1       4
+##    3:    3 2 1       4
+##    4:    4 2 2       7
+##    5:    5 1 2       5
+##   ---                 
+##  996:  996 2 2       7
+##  997:  997 2 2       7
+##  998:  998 1 1       0
+##  999:  999 2 2       7
+## 1000: 1000 1 2       5

It is possible to generate normally distributed data based on these means:

-
d1 <- updateDef(d1, "outcome", newvariance = 9, newdist = "normal")
-dx <- genData(1000, d1)
+
d1 <- updateDef(d1, "outcome", newvariance = 9, newdist = "normal")
+dx <- genData(1000, d1)

The plot shows the individual values as well as the mean values by intervention arm:

-
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
-## ℹ Please use `linewidth` instead.
-## This warning is displayed once every 8 hours.
-## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
-## generated.
-

+
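
Since the figure itself does not survive in this text rendering, a numeric summary conveys the same comparison (a sketch, assuming the dx generated above):

```r
# mean outcome by the intervention arms a and b, mirroring the plotted means
dx[, .(mean_outcome = round(mean(outcome), 1)), keyby = .(a, b)]
```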

diff --git a/inst/doc/logisticCoefs.R b/inst/doc/logisticCoefs.R
new file mode 100644
index 0000000..b5da08f
--- /dev/null
+++ b/inst/doc/logisticCoefs.R
@@ -0,0 +1,140 @@
+## ----chunkname, echo=-1-------------------------------------------------------
+data.table::setDTthreads(2)
+
+## ----echo = FALSE, message = FALSE--------------------------------------------
+library(simstudy)
+library(ggplot2)
+library(scales)
+library(grid)
+library(gridExtra)
+library(data.table)
+
+options(digits = 2)
+
+## -----------------------------------------------------------------------------
+library(simstudy)
+library(ggplot2)
+library(data.table)
+
+coefs1 <- c(0.15, 0.25, 0.10, 0.30)
+
+d1 <- defData(varname = "x1", formula = 0, variance = 1)
+d1 <- defData(d1, varname = "x2", formula = 0, variance = 1)
+d1 <- defData(d1, varname = "b1", formula = 0.3, dist = "binary")
+d1 <- defData(d1, varname = "b2", formula = 0.7, dist = "binary")
+
+d1a <- defData(d1, varname = "y",
+  formula = "t(..coefs1) %*% c(x1, x2, b1, b2)",
+  dist = "binary", link = "logit")
+
+set.seed(48392)
+
+dd <- genData(500000, d1a)
+dd
+
+## -----------------------------------------------------------------------------
+dd[, mean(y)]
+
+## -----------------------------------------------------------------------------
+d1a <- defData(d1, varname = "y",
+  formula = "t(c(-0.66, ..coefs1)) %*% c(1, x1, x2, b1, b2)",
+  dist = "binary", link = "logit")
+
+genData(500000, d1a)[, mean(y)]
+
+## -----------------------------------------------------------------------------
+coefs2 <- c(0.20, 0.35, 0.20, 0.45)
+
+d2 <- defData(varname = "x1", formula = 1, variance = 1)
+d2 <- defData(d2, varname = "x2", formula = 3, variance = 1)
+d2 <- defData(d2, varname = "b1", formula = 0.5, dist = "binary")
+d2 <- defData(d2, varname = "b2", formula = 0.8, dist = "binary")
+
+d2a <- defData(d2, varname = "y",
+  formula = "t(..coefs2) %*% c(x1, x2, b1, b2)",
+  dist = "binary", link = "logit")
+
+genData(500000, d2a)[, mean(y)]
+
+## -----------------------------------------------------------------------------
+d2a <- defData(d2, varname = "y",
+  formula = "t(c(-2.13, ..coefs2)) %*% c(1, x1, x2, b1, b2)",
+  dist = "binary", link = "logit")
+
+genData(500000, d2a)[, mean(y)]
+
+## -----------------------------------------------------------------------------
+logisticCoefs(defCovar = d1, coefs = coefs1, popPrev = 0.40)
+logisticCoefs(defCovar = d2, coefs = coefs2, popPrev = 0.40)
+
+## -----------------------------------------------------------------------------
+d1a <- defData(d1, varname = "rx", formula = "1;1", dist = "trtAssign")
+d1a <- defData(d1a, varname = "y",
+  formula = "t(c(-0.40, ..coefs1)) %*% c(rx, x1, x2, b1, b2)",
+  dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+dd[rx==1, mean(y)]/dd[rx==0, mean(y)]
+
+## -----------------------------------------------------------------------------
+d2a <- defData(d2, varname = "rx", formula = "1;1", dist = "trtAssign")
+d2a <- defData(d2a, varname = "y",
+  formula = "t(c(-0.40, ..coefs2)) %*% c(rx, x1, x2, b1, b2)",
+  dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d2a)
+dd[rx==1, mean(y)]/dd[rx==0, mean(y)]
+
+## -----------------------------------------------------------------------------
+C1 <- logisticCoefs(d1, coefs1, popPrev = 0.40, rr = 0.85)
+C1
+
+## -----------------------------------------------------------------------------
+d1a <- defData(d1, varname = "rx", formula = "1;1", dist = "trtAssign")
+d1a <- defData(d1a, varname = "y",
+  formula = "t(..C1) %*% c(1, rx, x1, x2, b1, b2)",
x1, x2, b1, b2)", + dist = "binary", link = "logit" +) + +dd <- genData(500000, d1a) + +## ----------------------------------------------------------------------------- +dd[rx==0, mean(y)] +dd[rx==1, mean(y)]/dd[rx==0, mean(y)] + +## ----------------------------------------------------------------------------- +C1 <- logisticCoefs(d1, coefs1, popPrev = 0.40, rd = -0.15) +C1 + +## ----------------------------------------------------------------------------- +d1a <- defData(d1, varname = "rx", formula = "1;1", dist = "trtAssign") +d1a <- defData(d1a, varname = "y", + formula = "t(..C1) %*% c(1, rx, x1, x2, b1, b2)", + dist = "binary", link = "logit" +) + +dd <- genData(500000, d1a) + +dd[rx==0, mean(y)] +dd[rx==1, mean(y)] - dd[rx==0, mean(y)] + +## ----------------------------------------------------------------------------- +C1 <- logisticCoefs(d1, coefs1, popPrev = 0.40, auc = 0.85) +C1 + +## ----------------------------------------------------------------------------- +d1a <- defData(d1, varname = "y", + formula = "t(..C1) %*% c(1, x1, x2, b1, b2)", + dist = "binary", link = "logit" +) + +dd <- genData(500000, d1a) + +dd[, mean(y)] + +fit <- rms::lrm(y ~ x1 + x2 + b1 + b2, data = dd) +fit$stats["C"] + + diff --git a/inst/doc/logisticCoefs.Rmd b/inst/doc/logisticCoefs.Rmd new file mode 100644 index 0000000..7e8c843 --- /dev/null +++ b/inst/doc/logisticCoefs.Rmd @@ -0,0 +1,248 @@ +--- +title: "Targeted logistic model coefficients" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Logistic Model Simulation} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + +```{r chunkname, echo=-1} +data.table::setDTthreads(2) +``` + +```{r, echo = FALSE, message = FALSE} +library(simstudy) +library(ggplot2) +library(scales) +library(grid) +library(gridExtra) +library(data.table) + +options(digits = 2) +``` + +In `simstudy`, there are at least two ways to define a binary data generating process. The first is to operate on the scale of the proportion or probability using the *identity* link. This allows users to define a data generating process that reflects assumptions about risk ratios and risk differences when comparing two groups defined by an exposure or treatment. However, this process can become challenging when introducing other covariates, because it can be difficult to constrain the probabilities so that they fall between 0 and 1. + +The second approach works on the log-odds scale using a *logit* link, and is much more amenable to accommodating covariates. Unfortunately, this comes at the price of being able to easily generate specific risk ratios and risk differences, because all parameters are log-odds ratios. The overall (marginal) prevalence of an outcome in a population will vary depending on the distribution of covariates in that population, and the strengths (both absolute and relative) of the association of those covariates with the outcome. That is, the coefficients of a logistic model (including the intercept) determine the prevalence. The same is true regarding the risk ratio and risk difference (if there is one particular exposure or treatment of interest) and the AUC. + +Here we start with the simplest case where we have a target marginal proportion or prevalence, and then illustrate data generation with three other target statistics: **risk ratios**, **risk differences**, and **AUCs**. 
+
+### Prevalence
+
+In this first example, we start with one set of assumptions for four covariates $x_1, x_2 \sim N(0, 1)$, $b_1 \sim Bin(0.3)$, and $b_2 \sim Bin(0.7)$, and generate the outcome *y* with the following data generating process:
+
+$$ \text{logit}(y) = 0.15x_1 + 0.25x_2 + 0.10b_1 + 0.30b_2$$
+
+```{r}
+library(simstudy)
+library(ggplot2)
+library(data.table)
+
+coefs1 <- c(0.15, 0.25, 0.10, 0.30)
+
+d1 <- defData(varname = "x1", formula = 0, variance = 1)
+d1 <- defData(d1, varname = "x2", formula = 0, variance = 1)
+d1 <- defData(d1, varname = "b1", formula = 0.3, dist = "binary")
+d1 <- defData(d1, varname = "b2", formula = 0.7, dist = "binary")
+
+d1a <- defData(d1, varname = "y",
+  formula = "t(..coefs1) %*% c(x1, x2, b1, b2)",
+  dist = "binary", link = "logit")
+
+set.seed(48392)
+
+dd <- genData(500000, d1a)
+dd
+```
+
+The overall proportion of $y=1$ in this case is
+
+```{r}
+dd[, mean(y)]
+```
+
+If we have a desired marginal proportion of 0.40, then we can add an intercept of -0.66 to the data generating process:
+
+$$ \text{logit}(y) = -0.66 + 0.15x_1 + 0.25x_2 + 0.10b_1 + 0.30b_2$$
+
+The simulation now gives us the desired target:
+
+```{r}
+d1a <- defData(d1, varname = "y",
+  formula = "t(c(-0.66, ..coefs1)) %*% c(1, x1, x2, b1, b2)",
+  dist = "binary", link = "logit")
+
+genData(500000, d1a)[, mean(y)]
+```
+
+If we change the distribution of the covariates, so that $x_1 \sim N(1, 1)$, $x_2 \sim N(3, 1)$, $b_1 \sim Bin(0.5)$, and $b_2 \sim Bin(0.8)$, and the strength of the association of these covariates with the outcome so that
+
+$$ \text{logit}(y) = 0.20x_1 + 0.35x_2 + 0.20b_1 + 0.45b_2,$$
+
+the marginal proportion/prevalence (assuming no intercept term) also changes, going from 0.56 to 0.84:
+
+```{r}
+coefs2 <- c(0.20, 0.35, 0.20, 0.45)
+
+d2 <- defData(varname = "x1", formula = 1, variance = 1)
+d2 <- defData(d2, varname = "x2", formula = 3, variance = 1)
+d2 <- defData(d2, varname = "b1", formula = 0.5, dist = "binary")
+d2 <- defData(d2, varname = "b2", formula = 0.8, dist = "binary")
+
+d2a <- defData(d2, varname = "y",
+  formula = "t(..coefs2) %*% c(x1, x2, b1, b2)",
+  dist = "binary", link = "logit")
+
+genData(500000, d2a)[, mean(y)]
+```
+
+But under this new distribution, adding an intercept of -2.13 yields the desired target.
+
+$$ \text{logit}(y) = -2.13 + 0.20x_1 + 0.35x_2 + 0.20b_1 + 0.45b_2 $$
+
+```{r}
+d2a <- defData(d2, varname = "y",
+  formula = "t(c(-2.13, ..coefs2)) %*% c(1, x1, x2, b1, b2)",
+  dist = "binary", link = "logit")
+
+genData(500000, d2a)[, mean(y)]
+```
+
+#### Finding the intercept
+
+Where did those two intercepts come from? The [paper](https://bmcmedresmethodol.biomedcentral.com/articles/10.1186/s12874-023-01836-5){target="_blank"} by Peter Austin describes an iterative bisection procedure that takes a distribution of covariates and a set of coefficients to identify the intercept coefficient that yields the target marginal proportion or prevalence.
+
+The general idea of the algorithm is to try out a series of different intercepts in an intelligent way that ends up at the right spot. (If you want the details for the algorithm, take a look at the [paper](https://bmcmedresmethodol.biomedcentral.com/articles/10.1186/s12874-023-01836-5){target="_blank"}.) The starting search range is pre-defined (we've used -10 to 10 for the intercept), and we start with a value of 0 for the initial intercept and simulate a large data set (the paper uses 1 million observations, but 100,000 seems to work just fine) and record the population prevalence. If we've overshot the target prevalence, we turn our attention to the range between -10 and 0, taking the average, which is -5. Otherwise, we focus on the range between 0 and 10. We iterate this way, choosing the range we need to focus on and setting the intercept at the mid-point (hence the name *bisection*). The algorithm will converge pretty quickly on the value of the intercept that gives the target population prevalence for the underlying covariate distribution and coefficient assumptions.
+
+In the current implementation in `simstudy`, the intercept is provided by a simple call to `logisticCoefs`. Here are the calls for the two sets of definitions in definition tables *d1* and *d2*.
+
+```{r}
+logisticCoefs(defCovar = d1, coefs = coefs1, popPrev = 0.40)
+logisticCoefs(defCovar = d2, coefs = coefs2, popPrev = 0.40)
+```
+
+### Risk ratios
+
+Just as the prevalence depends on the distribution of covariates and their association with the outcome, risk ratios comparing the outcome probabilities for two groups also depend on the additional covariates. The marginal risk ratio comparing treatment ($A = 1$) to control ($A = 0$) (given the distribution of covariates) is
+
+$$RR = \frac{P(y=1 | A = 1)}{P(y=1 | A = 0)}$$
+
+In the data generation process we use a log-odds ratio of -0.40 (odds ratio of approximately 0.67) in both cases, but we get different risk ratios (0.82 vs. 0.93), depending on the covariates (defined in *d1* and *d2*).
+
+```{r}
+d1a <- defData(d1, varname = "rx", formula = "1;1", dist = "trtAssign")
+d1a <- defData(d1a, varname = "y",
+  formula = "t(c(-0.40, ..coefs1)) %*% c(rx, x1, x2, b1, b2)",
+  dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+dd[rx==1, mean(y)]/dd[rx==0, mean(y)]
+```
+
+```{r}
+d2a <- defData(d2, varname = "rx", formula = "1;1", dist = "trtAssign")
+d2a <- defData(d2a, varname = "y",
+  formula = "t(c(-0.40, ..coefs2)) %*% c(rx, x1, x2, b1, b2)",
+  dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d2a)
+dd[rx==1, mean(y)]/dd[rx==0, mean(y)]
+```
+
+By specifying both a population prevalence and a target risk ratio in the call to `logisticCoefs`, we can get the necessary parameters. When specifying the target risk ratio, it is required to be between 0 and 1/popPrev. A risk ratio cannot be negative, and the probability of the outcome under treatment cannot exceed 1 (which will happen if the risk ratio is greater than 1/popPrev).
+
+```{r}
+C1 <- logisticCoefs(d1, coefs1, popPrev = 0.40, rr = 0.85)
+C1
+```
+
+If we use $C_1$ in the data generation process, we will get a data set with the desired target prevalence and risk ratio:
+
+```{r}
+d1a <- defData(d1, varname = "rx", formula = "1;1", dist = "trtAssign")
+d1a <- defData(d1a, varname = "y",
+  formula = "t(..C1) %*% c(1, rx, x1, x2, b1, b2)",
+  dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+```
+
+Here are the prevalence and risk ratio:
+
+```{r}
+dd[rx==0, mean(y)]
+dd[rx==1, mean(y)]/dd[rx==0, mean(y)]
+```
+
+You can do the same for the second set of assumptions.
+
+### Risk differences
+
+Risk differences have the same set of issues, and are handled in the same way. The risk difference is defined as
+
+$$ RD = P(y=1 | A = 1) - P(y=1 | A = 0)$$
+
+To get the coefficients related to a population prevalence of 0.40 and risk difference of -0.15 (so that the proportion in the exposure arm is 0.25), we use the *rd* argument:
+
+```{r}
+C1 <- logisticCoefs(d1, coefs1, popPrev = 0.40, rd = -0.15)
+C1
+```
+
+Again, using $C_1$ in the data generation process, we will get a data set with the desired target prevalence and risk difference:
+
+```{r}
+d1a <- defData(d1, varname = "rx", formula = "1;1", dist = "trtAssign")
+d1a <- defData(d1a, varname = "y",
+  formula = "t(..C1) %*% c(1, rx, x1, x2, b1, b2)",
+  dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+
+dd[rx==0, mean(y)]
+dd[rx==1, mean(y)] - dd[rx==0, mean(y)]
+```
+
+### AUC
+
+The AUC is another commonly used statistic to evaluate a logistic model. We can use `logisticCoefs` to find the parameters that will allow us to generate data from a model with a specific AUC. To get the coefficients related to a population prevalence of 0.40 and an AUC of 0.85, we use the *auc* argument (which must be between 0.5 and 1):
+
+```{r}
+C1 <- logisticCoefs(d1, coefs1, popPrev = 0.40, auc = 0.85)
+C1
+```
+
+Again, using $C_1$ in the data generation process, we will get a data set with the desired target prevalence and the AUC (calculated here using the `lrm` function in the `rms` package):
+
+```{r}
+d1a <- defData(d1, varname = "y",
+  formula = "t(..C1) %*% c(1, x1, x2, b1, b2)",
+  dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+
+dd[, mean(y)]
+
+fit <- rms::lrm(y ~ x1 + x2 + b1 + b2, data = dd)
+fit$stats["C"]
+
+```
+
+
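
As the vignette notes, the same steps can be repeated for the second set of assumptions; a minimal sketch (hypothetical, mirroring the d1 calls above):

```r
# target prevalence of 0.40 and risk ratio of 0.85 under the d2 covariates
C2 <- logisticCoefs(d2, coefs2, popPrev = 0.40, rr = 0.85)
C2
```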

+References:
+
+Austin, Peter C. "The iterative bisection procedure: a useful
+tool for determining parameter values in data-generating processes in
+Monte Carlo simulations." BMC Medical Research Methodology 23,
+no. 1 (2023): 1-10.
+

\ No newline at end of file
diff --git a/inst/doc/logisticCoefs.html b/inst/doc/logisticCoefs.html
new file mode 100644
index 0000000..d964bd6
--- /dev/null
+++ b/inst/doc/logisticCoefs.html
@@ -0,0 +1,625 @@
+Targeted logistic model coefficients

Targeted logistic model coefficients

+ + + +

In simstudy, there are at least two ways to define a
+binary data generating process. The first is to operate on the scale of
+the proportion or probability using the identity link. This
+allows users to define a data generating process that reflects
+assumptions about risk ratios and risk differences when comparing two
+groups defined by an exposure or treatment. However, this process can
+become challenging when introducing other covariates, because it can be
+difficult to constrain the probabilities so that they fall between 0 and
+1.

+

The second approach works on the log-odds scale using a
+logit link, and is much more amenable to accommodating
+covariates. Unfortunately, this comes at the price of no longer being able to
+easily generate specific risk ratios and risk differences, because all
+parameters are log-odds ratios. The overall (marginal) prevalence of an
+outcome in a population will vary depending on the distribution of
+covariates in that population, and the strengths (both absolute and
+relative) of the association of those covariates with the outcome. That
+is, the coefficients of a logistic model (including the intercept)
+determine the prevalence. The same is true regarding the risk ratio and
+risk difference (if there is one particular exposure or treatment of
+interest) and the AUC.

+

Here we start with the simplest case where we have a target marginal
+proportion or prevalence, and then illustrate data generation with three
+other target statistics: risk ratios, risk
+differences, and AUCs.

+
+

Prevalence

+

In this first example, we start with one set of assumptions for four
+covariates \(x_1, x_2 \sim N(0, 1)\),
+\(b_1 \sim Bin(0.3)\), and \(b_2 \sim Bin(0.7)\), and generate the
+outcome y with the following data generating process:

+

\[ \text{logit}(y) = 0.15x_1 + 0.25x_2 +
+0.10b_1 + 0.30b_2\]

+
library(simstudy)
+library(ggplot2)
+library(data.table)
+
+coefs1 <- c(0.15, 0.25, 0.10, 0.30)
+
+d1 <- defData(varname = "x1", formula = 0, variance = 1)
+d1 <- defData(d1, varname = "x2", formula = 0, variance = 1)
+d1 <- defData(d1, varname = "b1", formula = 0.3, dist = "binary")
+d1 <- defData(d1, varname = "b2", formula = 0.7, dist = "binary")
+
+d1a <- defData(d1, varname = "y", 
+  formula = "t(..coefs1) %*% c(x1, x2, b1, b2)",
+  dist = "binary", link = "logit")
+
+set.seed(48392)
+
+dd <- genData(500000, d1a)
+dd
+
##             id    x1     x2 b1 b2 y
+##      1:      1  0.29  0.390  0  1 1
+##      2:      2  0.76 -0.925  0  0 0
+##      3:      3 -1.47  0.939  0  0 1
+##      4:      4  1.92  0.560  0  1 1
+##      5:      5  1.40 -0.238  0  1 0
+##     ---                            
+## 499996: 499996 -0.32  0.367  0  0 0
+## 499997: 499997 -1.08  2.152  0  0 0
+## 499998: 499998 -1.10  0.380  1  0 0
+## 499999: 499999  0.56 -1.042  0  1 0
+## 500000: 500000  0.52  0.076  0  1 1
+

The overall proportion of \(y=1\) in +this case is

+
dd[, mean(y)]
+
## [1] 0.56
+

If we have a desired marginal proportion of 0.40, then we can add an +intercept of -0.66 to the data generating process:

+

\[ \text{logit}(y) = -0.66 + 0.15x_1 + +0.25x_2 + 0.10b_1 + 0.30b_2\]

+

The simulation now gives us the desired target:

+
d1a <- defData(d1, varname = "y", 
+  formula = "t(c(-0.66, ..coefs1)) %*% c(1, x1, x2, b1, b2)",
+  dist = "binary", link = "logit")
+
+genData(500000, d1a)[, mean(y)]
+
## [1] 0.4
+

If we change the distribution of the covariates, so that \(x_1 \sim N(1, 1)\), \(x_2 \sim N(3, 1)\), \(b_1 \sim Bin(0.5)\), and \(b_2 \sim Bin(0.8)\), and strengthen
+the association of these covariates with the outcome so that

+

\[ \text{logit}(y) = 0.20x_1 + 0.35x_2 + +0.20b_1 + 0.45b_2,\]

+

the marginal proportion/prevalence (assuming no intercept term) also +changes, going from 0.56 to 0.84:

+
coefs2 <- c(0.20, 0.35, 0.20, 0.45)
+
+d2 <- defData(varname = "x1", formula = 1, variance = 1)
+d2 <- defData(d2, varname = "x2", formula = 3, variance = 1)
+d2 <- defData(d2, varname = "b1", formula = 0.5, dist = "binary")
+d2 <- defData(d2, varname = "b2", formula = 0.8, dist = "binary")
+
+d2a <- defData(d2, varname = "y", 
+  formula = "t(..coefs2) %*% c(x1, x2, b1, b2)",
+  dist = "binary", link = "logit")
+
+genData(500000, d2a)[, mean(y)]
+
## [1] 0.84
+

But under this new distribution, adding an intercept of -2.13 yields +the desired target.

+

\[ \text{logit}(y) = -2.13 + 0.20x_1 + +0.35x_2 + 0.20b_1 + 0.45b_2 \]

+


+
d2a <- defData(d2, varname = "y", 
+  formula = "t(c(-2.13, ..coefs2)) %*% c(1, x1, x2, b1, b2)",
+  dist = "binary", link = "logit")
+
+genData(500000, d2a)[, mean(y)]
+
## [1] 0.4
+
+

Finding the intercept

+

Where did those two intercepts come from? The paper by Peter Austin describes an iterative +bisection procedure that takes a distribution of covariates and a set of +coefficients to identify the intercept coefficient that yields the +target marginal proportion or prevalence.

+

The general idea of the algorithm is to try out a series of different
+intercepts in an intelligent way that ends up at the right spot. (If you
+want the details of the algorithm, take a look at the paper.) The starting search range is pre-defined
+(we’ve used -10 to 10 for the intercept), and we start with a value of
+0 for the initial intercept, simulate a large data set (the paper
+uses 1 million observations, but 100,000 seems to work just fine), and
+record the population prevalence. If we’ve overshot the target
+prevalence, we turn our attention to the range between -10 and 0 and try
+its midpoint, -5. Otherwise, we focus on the range between 0 and
+10. We iterate this way, choosing the range we need to focus on and
+setting the intercept at the midpoint (hence the name
+bisection). The algorithm will converge pretty quickly on the
+value of the intercept that gives the target population prevalence for
+the underlying covariate distribution and coefficient assumptions. A
+bare-bones version of the search is sketched below.

+
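The packaged routine is more careful than this, but a minimal sketch of the search, reusing the d1 and coefs1 objects defined above (the 20-iteration cap and 0.001 tolerance are arbitrary choices for illustration), might look like:

```{r}
# minimal bisection sketch (not the packaged implementation): find the
# intercept B0 that yields a marginal prevalence of 0.40

targetPrev <- 0.40
lower <- -10
upper <- 10
B0 <- 0

for (i in 1:20) {
  defY <- defData(d1, varname = "y",
    formula = "t(c(..B0, ..coefs1)) %*% c(1, x1, x2, b1, b2)",
    dist = "binary", link = "logit")

  prev <- genData(100000, defY)[, mean(y)] # prevalence at the current B0
  if (abs(prev - targetPrev) < 0.001) break

  if (prev > targetPrev) upper <- B0 else lower <- B0
  B0 <- (lower + upper) / 2                # bisect the remaining range
}

round(B0, 2)                               # converges near -0.66
```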

In the current implementation in simstudy, the intercept +is provided by a simple call to logisticCoefs. Here are the +calls for the two sets of definitions in definition tables d1 +and d2.

+
logisticCoefs(defCovar = d1, coefs = coefs1, popPrev = 0.40)
+
##    B0    x1    x2    b1    b2 
+## -0.66  0.15  0.25  0.10  0.30
+
logisticCoefs(defCovar = d2, coefs = coefs2, popPrev = 0.40)
+
##    B0    x1    x2    b1    b2 
+## -2.13  0.20  0.35  0.20  0.45
+
+
+
+

Risk ratios

+

Just as the prevalence depends on the distribution of covariates and
+their association with the outcome, risk ratios comparing the outcome
+probabilities for two groups also depend on the additional covariates.
+The marginal risk ratio comparing treatment (\(A = 1\)) to control
+(\(A = 0\)), given the distribution of covariates, is

+

\[RR = \frac{P(y=1 | A = 1)}{P(y=1 | A = 0)}\]
+
+In the data generation process we use a log-odds ratio of
+-0.40 (odds ratio of approximately 0.67) in both cases, but we get
+different risk ratios (0.82 vs. 0.93), depending on the covariates
+(defined in d1 and d2).

+
d1a <- defData(d1, varname = "rx", formula = "1;1", dist = "trtAssign")
+d1a <- defData(d1a, varname = "y",
+  formula = "t(c(-0.40, ..coefs1)) %*% c(rx, x1, x2, b1, b2)",
+  dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+dd[rx==1, mean(y)]/dd[rx==0, mean(y)]
+
## [1] 0.82
+
d2a <- defData(d2, varname = "rx", formula = "1;1", dist = "trtAssign")
+d2a <- defData(d2a, varname = "y",
+  formula = "t(c(-0.40, ..coefs2)) %*% c(rx, x1, x2, b1, b2)",
+  dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d2a)
+dd[rx==1, mean(y)]/dd[rx==0, mean(y)]
+
## [1] 0.93
+

By specifying both a population prevalence and a target risk ratio in
+the call to logisticCoefs, we can get the necessary
+parameters. When specifying the target risk ratio, it is required to be
+between 0 and 1/popPrev. A risk ratio cannot be negative, and the
+probability of the outcome under treatment cannot exceed 1 (which will
+happen if the risk ratio is greater than 1/popPrev). With popPrev = 0.40,
+for example, the target risk ratio must be less than 1/0.40 = 2.5.

+
C1 <- logisticCoefs(d1, coefs1, popPrev = 0.40, rr = 0.85)
+C1
+
##    B0     A    x1    x2    b1    b2 
+## -0.66 -0.26  0.15  0.25  0.10  0.30
+

If we use \(C_1\) in the data +generation process, we will get a data set with the desired target +prevalence and risk ratio:

+
d1a <- defData(d1, varname = "rx", formula = "1;1", dist = "trtAssign")
+d1a <- defData(d1a, varname = "y",
+  formula = "t(..C1) %*% c(1, rx, x1, x2, b1, b2)",
+  dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+

Here are the prevalence and risk ratio:

+
dd[rx==0, mean(y)]
+
## [1] 0.4
+
dd[rx==1, mean(y)]/dd[rx==0, mean(y)]
+
## [1] 0.86
+

You can do the same for the second set of assumptions.
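For example, the analogous call for the second set of definitions would be (output not shown):

```{r}
C2 <- logisticCoefs(d2, coefs2, popPrev = 0.40, rr = 0.85)
```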

+
+
+

Risk differences

+

Risk differences have the same set of issues, and are handled in the +same way. The risk difference is defined as

+

\[ RD = P(y=1 | A = 1) - P(y=1 | A = +0)\]

+

To get the coefficients related to a population prevalence of 0.40 +and risk difference of -0.15 (so that the proportion in the exposure arm +is 0.25), we use the rd argument:

+
C1 <- logisticCoefs(d1, coefs1, popPrev = 0.40, rd = -0.15)
+C1
+
##    B0     A    x1    x2    b1    b2 
+## -0.66 -0.71  0.15  0.25  0.10  0.30
+

Again, using \(C_1\) in the data +generation process, we will get a data set with the desired target +prevalence and risk difference:

+
d1a <- defData(d1, varname = "rx", formula = "1;1", dist = "trtAssign")
+d1a <- defData(d1a, varname = "y",
+  formula = "t(..C1) %*% c(1, rx, x1, x2, b1, b2)",
+  dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+
+dd[rx==0, mean(y)]
+
## [1] 0.4
+
dd[rx==1, mean(y)] - dd[rx==0, mean(y)]
+
## [1] -0.15
+
+
+

AUC

+

The AUC is another commonly used statistic to evaluate a logistic +model. We can use logisticCoefs to find the parameters that +will allow us to generate data from a model with a specific AUC. To get +the coefficients related to a population prevalence of 0.40 and an AUC +of 0.85, we use the auc argument (which must be between 0.5 and +1):

+
C1 <- logisticCoefs(d1, coefs1, popPrev = 0.40, auc = 0.85)
+C1
+
##    B0    x1    x2    b1    b2 
+## -1.99  0.85  1.41  0.56  1.69
+

Again, using \(C_1\) in the data
+generation process, we will get a data set with the desired target
+prevalence and the AUC (calculated here using the lrm
+function in the rms package):

+
d1a <- defData(d1, varname = "y",
+  formula = "t(..C1) %*% c(1, x1, x2, b1, b2)",
+  dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+
+dd[, mean(y)]
+
## [1] 0.4
+
fit <- rms::lrm(y ~ x1 + x2 + b1 + b2, data = dd)
+fit$stats["C"]
+
##    C 
+## 0.85
+
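As a rough cross-check (a sketch, not part of the original output), the AUC can also be estimated from its concordance interpretation: the probability that a randomly sampled case has a larger linear predictor than a randomly sampled control.

```{r}
# concordance-based spot check of the AUC
xb <- as.matrix(dd[, .(1, x1, x2, b1, b2)]) %*% C1

cases <- sample(xb[dd$y == 1], 10000, replace = TRUE)
ctrls <- sample(xb[dd$y == 0], 10000, replace = TRUE)

mean(cases > ctrls) # should also be close to 0.85
```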

+

References:

+

Austin, Peter C. “The iterative bisection procedure: a useful tool +for determining parameter values in data-generating processes in Monte +Carlo simulations.” BMC Medical Research Methodology 23, no. 1 (2023): +1-10.

+
+

+
+ + + + + + + + + + + diff --git a/inst/doc/longitudinal.R b/inst/doc/longitudinal.R index 73862a0..c862785 100644 --- a/inst/doc/longitudinal.R +++ b/inst/doc/longitudinal.R @@ -1,4 +1,7 @@ -## ---- echo = FALSE, message = FALSE------------------------------------------- +## ----chunkname, echo=-1------------------------------------------------------- +data.table::setDTthreads(2) + +## ----echo = FALSE, message = FALSE-------------------------------------------- library(simstudy) library(ggplot2) library(scales) @@ -28,7 +31,7 @@ ggtheme <- function(panelback = "white") { } -## ---- tidy = TRUE------------------------------------------------------------- +## ----tidy = TRUE-------------------------------------------------------------- tdef <- defData(varname = "T", dist="binary", formula = 0.5) tdef <- defData(tdef, varname = "Y0", dist = "normal", formula = 10, variance = 1) tdef <- defData(tdef, varname = "Y1", dist = "normal", formula = "Y0 + 5 + 5 * T", variance = 1) @@ -39,11 +42,11 @@ set.seed (483726) dtTrial <- genData( 500, tdef) dtTrial -## ---- tidy = TRUE------------------------------------------------------------- +## ----tidy = TRUE-------------------------------------------------------------- dtTime <- addPeriods(dtTrial, nPeriods = 3, idvars = "id", timevars = c("Y0", "Y1", "Y2"), timevarName = "Y") dtTime -## ---- tidy = TRUE, echo = FALSE, fig.width = 6, fig.height = 3---------------- +## ----tidy = TRUE, echo = FALSE, fig.width = 6, fig.height = 3----------------- avg <- dtTime[,.(Y=mean(Y)), keyby = .(T, period)] @@ -57,7 +60,7 @@ ggplot(data = dtTime, aes(x = factor(period), y = Y)) + ggtheme("grey90") + theme(legend.key=element_rect(fill=NA)) -## ---- tidy = TRUE------------------------------------------------------------- +## ----tidy = TRUE-------------------------------------------------------------- def <- defData(varname = "xbase", dist = "normal", formula = 20, variance = 3) def <- defData(def,varname = "nCount", dist = "noZeroPoisson", formula = 6) def <- defData(def, varname = "mInterval", dist = "gamma", formula = 30, variance = .01) @@ -66,15 +69,15 @@ def <- defData(def, varname = "vInterval", dist = "nonrandom", formula = .07) dt <- genData(200, def) dt[id %in% c(8,121)] # View individuals 8 and 121 -## ---- tidy = TRUE------------------------------------------------------------- +## ----tidy = TRUE-------------------------------------------------------------- dtPeriod <- addPeriods(dt) dtPeriod[id %in% c(8,121)] # View individuals 8 and 121 only -## ---- tidy = TRUE------------------------------------------------------------- +## ----tidy = TRUE-------------------------------------------------------------- def2 <- defDataAdd(varname = "Y", dist = "normal", formula = "15 + .1 * time", variance = 5) dtPeriod <- addColumns(def2, dtPeriod) -## ---- tidy = TRUE, echo = FALSE, fig.width = 6, fig.height = 3---------------- +## ----tidy = TRUE, echo = FALSE, fig.width = 6, fig.height = 3----------------- sampledID <- sample(1:nrow(dt), 5) dtSample <- dtPeriod[id %in% sampledID] diff --git a/inst/doc/longitudinal.Rmd b/inst/doc/longitudinal.Rmd index 084c2f5..7032300 100644 --- a/inst/doc/longitudinal.Rmd +++ b/inst/doc/longitudinal.Rmd @@ -7,6 +7,10 @@ vignette: > \usepackage[utf8]{inputenc} --- +```{r chunkname, echo=-1} +data.table::setDTthreads(2) +``` + ```{r, echo = FALSE, message = FALSE} library(simstudy) library(ggplot2) diff --git a/inst/doc/longitudinal.html b/inst/doc/longitudinal.html index f3ff8a1..752dc90 100644 --- a/inst/doc/longitudinal.html +++ 
b/inst/doc/longitudinal.html @@ -346,29 +346,29 @@

Longitudinal Data

the columns. In the next example, we measure outcome Y once before and twice after intervention T in a randomized trial:

-
tdef <- defData(varname = "T", dist = "binary", formula = 0.5)
-tdef <- defData(tdef, varname = "Y0", dist = "normal", formula = 10, variance = 1)
-tdef <- defData(tdef, varname = "Y1", dist = "normal", formula = "Y0 + 5 + 5 * T",
-    variance = 1)
-tdef <- defData(tdef, varname = "Y2", dist = "normal", formula = "Y0 + 10 + 5 * T",
-    variance = 1)
-
-set.seed(483726)
-
-dtTrial <- genData(500, tdef)
-dtTrial
-
##       id T    Y0   Y1   Y2
-##   1:   1 0 10.42 14.2 20.9
-##   2:   2 1  9.36 21.9 25.4
-##   3:   3 1  8.25 19.1 23.9
-##   4:   4 0 10.86 16.5 20.4
-##   5:   5 1 11.29 21.7 25.9
-##  ---                      
-## 496: 496 1 11.87 22.6 26.6
-## 497: 497 1 10.62 19.5 24.9
-## 498: 498 1 10.13 22.2 26.1
-## 499: 499 0 11.88 17.9 23.0
-## 500: 500 0 11.92 18.4 22.5
+
tdef <- defData(varname = "T", dist = "binary", formula = 0.5)
+tdef <- defData(tdef, varname = "Y0", dist = "normal", formula = 10, variance = 1)
+tdef <- defData(tdef, varname = "Y1", dist = "normal", formula = "Y0 + 5 + 5 * T",
+    variance = 1)
+tdef <- defData(tdef, varname = "Y2", dist = "normal", formula = "Y0 + 10 + 5 * T",
+    variance = 1)
+
+set.seed(483726)
+
+dtTrial <- genData(500, tdef)
+dtTrial
+
##       id T   Y0 Y1 Y2
+##   1:   1 0 10.4 14 21
+##   2:   2 1  9.4 22 25
+##   3:   3 1  8.3 19 24
+##   4:   4 0 10.9 16 20
+##   5:   5 1 11.3 22 26
+##  ---                 
+## 496: 496 1 11.9 23 27
+## 497: 497 1 10.6 20 25
+## 498: 498 1 10.1 22 26
+## 499: 499 0 11.9 18 23
+## 500: 500 0 11.9 18 23

Longitudinal data are created with a call to addPeriods. If the cross-sectional data includes time-dependent data, then the number of periods @@ -379,23 +379,28 @@

Longitudinal Data

time-dependent variable. (Note: if there are two time-dependent variables, it is best to create two data sets and merge them. This will be shown later in the vignette).

-
dtTime <- addPeriods(dtTrial, nPeriods = 3, idvars = "id", timevars = c("Y0", "Y1",
-    "Y2"), timevarName = "Y")
-dtTime
-
##        id period T     Y timeID
-##    1:   1      0 0 10.42      1
-##    2:   1      1 0 14.15      2
-##    3:   1      2 0 20.93      3
-##    4:   2      0 1  9.36      4
-##    5:   2      1 1 21.90      5
-##   ---                          
-## 1496: 499      1 0 17.94   1496
-## 1497: 499      2 0 23.05   1497
-## 1498: 500      0 0 11.92   1498
-## 1499: 500      1 0 18.39   1499
-## 1500: 500      2 0 22.50   1500
+
dtTime <- addPeriods(dtTrial, nPeriods = 3, idvars = "id", timevars = c("Y0", "Y1",
+    "Y2"), timevarName = "Y")
+dtTime
+
##        id period T    Y timeID
+##    1:   1      0 0 10.4      1
+##    2:   1      1 0 14.2      2
+##    3:   1      2 0 20.9      3
+##    4:   2      0 1  9.4      4
+##    5:   2      1 1 21.9      5
+##   ---                         
+## 1496: 499      1 0 17.9   1496
+## 1497: 499      2 0 23.0   1497
+## 1498: 500      0 0 11.9   1498
+## 1499: 500      1 0 18.4   1499
+## 1500: 500      2 0 22.5   1500

This is what the longitudinal data look like:

-

+
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
+## ℹ Please use `linewidth` instead.
+## This warning is displayed once every 8 hours.
+## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
+## generated.
+

Longitudinal data with varying observation and interval times

It is also possible to generate longitudinal data with varying @@ -415,36 +420,36 @@

Longitudinal data with varying observation and interval times

individuals with a different number of measurement observations and different times between each observation. Data for two of these individuals is printed:

-
def <- defData(varname = "xbase", dist = "normal", formula = 20, variance = 3)
-def <- defData(def, varname = "nCount", dist = "noZeroPoisson", formula = 6)
-def <- defData(def, varname = "mInterval", dist = "gamma", formula = 30, variance = 0.01)
-def <- defData(def, varname = "vInterval", dist = "nonrandom", formula = 0.07)
-
-dt <- genData(200, def)
-dt[id %in% c(8, 121)]  # View individuals 8 and 121
+
def <- defData(varname = "xbase", dist = "normal", formula = 20, variance = 3)
+def <- defData(def, varname = "nCount", dist = "noZeroPoisson", formula = 6)
+def <- defData(def, varname = "mInterval", dist = "gamma", formula = 30, variance = 0.01)
+def <- defData(def, varname = "vInterval", dist = "nonrandom", formula = 0.07)
+
+dt <- genData(200, def)
+dt[id %in% c(8, 121)]  # View individuals 8 and 121
##     id xbase nCount mInterval vInterval
-## 1:   8  18.0      4      28.1      0.07
-## 2: 121  22.7      6      33.2      0.07
+## 1:   8    18      4        28      0.07
+## 2: 121    23      6        33      0.07

The resulting longitudinal data for these two subjects can be inspected after a call to addPeriods. Notice that no parameters need to be set since all information resides in the data set itself:

-
dtPeriod <- addPeriods(dt)
-dtPeriod[id %in% c(8, 121)]  # View individuals 8 and 121 only
+
dtPeriod <- addPeriods(dt)
+dtPeriod[id %in% c(8, 121)]  # View individuals 8 and 121 only
##      id period xbase time timeID
-##  1:   8      0  18.0    0     41
-##  2:   8      1  18.0   29     42
-##  3:   8      2  18.0   51     43
-##  4:   8      3  18.0  104     44
-##  5: 121      0  22.7    0    691
-##  6: 121      1  22.7   46    692
-##  7: 121      2  22.7   81    693
-##  8: 121      3  22.7  117    694
-##  9: 121      4  22.7  154    695
-## 10: 121      5  22.7  180    696
+##  1:   8      0    18    0     41
+##  2:   8      1    18   29     42
+##  3:   8      2    18   51     43
+##  4:   8      3    18  104     44
+##  5: 121      0    23    0    691
+##  6: 121      1    23   46    692
+##  7: 121      2    23   81    693
+##  8: 121      3    23  117    694
+##  9: 121      4    23  154    695
+## 10: 121      5    23  180    696

If a time-sensitive measurement is added to the data set …

-
def2 <- defDataAdd(varname = "Y", dist = "normal", formula = "15 + .1 * time", variance = 5)
-dtPeriod <- addColumns(def2, dtPeriod)
+
def2 <- defDataAdd(varname = "Y", dist = "normal", formula = "15 + .1 * time", variance = 5)
+dtPeriod <- addColumns(def2, dtPeriod)

… a plot of five randomly selected individuals looks like this:

diff --git a/inst/doc/missing.R b/inst/doc/missing.R index 21feeb0..dd2f4f2 100644 --- a/inst/doc/missing.R +++ b/inst/doc/missing.R @@ -1,4 +1,7 @@ -## ---- echo = FALSE, message = FALSE------------------------------------------- +## ----chunkname, echo=-1------------------------------------------------------- +data.table::setDTthreads(2) + +## ----echo = FALSE, message = FALSE-------------------------------------------- library(simstudy) library(ggplot2) library(scales) @@ -55,7 +58,7 @@ ggtheme <- function(panelback = "white") { } -## ---- tidy = TRUE------------------------------------------------------------- +## ----tidy = TRUE-------------------------------------------------------------- def1 <- defData(varname = "m", dist = "binary", formula = .5) def1 <- defData(def1, "u", dist = "binary", formula = .5) def1 <- defData(def1, "x1", dist="normal", formula = "20*m + 20*u", variance = 2) @@ -64,7 +67,7 @@ def1 <- defData(def1, "x3", dist="normal", formula = "20*m + 20*u", variance = 2 dtAct <- genData(1000, def1) -## ---- tidy = TRUE------------------------------------------------------------- +## ----tidy = TRUE-------------------------------------------------------------- defM <- defMiss(varname = "x1", formula = .15, logit.link = FALSE) defM <- defMiss(defM, varname = "x2", formula = ".05 + m * 0.25", logit.link = FALSE) defM <- defMiss(defM, varname = "x3", formula = ".05 + u * 0.25", logit.link = FALSE) @@ -75,7 +78,7 @@ set.seed(283726) missMat <- genMiss(dtAct, defM, idvars = "id") dtObs <- genObs(dtAct, missMat, idvars = "id") -## ---- tidy = TRUE------------------------------------------------------------- +## ----tidy = TRUE-------------------------------------------------------------- missMat dtObs @@ -103,7 +106,7 @@ showDif(meanAct, meanObs) meanActm <- dtAct[,.(x1 = rmean(x1), x2 = rmean(x2), x3 = rmean(x3)), keyby = m] meanObsm <- dtObs[,.(x1 = rmean(x1), x2 = rmean(x2), x3 = rmean(x3)), keyby = m] -## ---- tidy = TRUE------------------------------------------------------------- +## ----tidy = TRUE-------------------------------------------------------------- # compare observed and actual when m = 0 showDif(meanActm[m==0, .(x1, x2, x3)], meanObsm[m==0, .(x1, x2, x3)]) @@ -112,7 +115,7 @@ showDif(meanActm[m==0, .(x1, x2, x3)], meanObsm[m==0, .(x1, x2, x3)]) showDif(meanActm[m==1, .(x1, x2, x3)], meanObsm[m==1, .(x1, x2, x3)]) -## ---- tidy = TRUE------------------------------------------------------------- +## ----tidy = TRUE-------------------------------------------------------------- # use baseline definitions from the previous example @@ -126,7 +129,7 @@ defLong <- defDataAdd(varname = "y", dist = "normal", formula = "10 + period*2 + dtTime <- addPeriods(dtAct, nPeriods = 4) dtTime <- addColumns(defLong, dtTime) -## ---- tidy = TRUE------------------------------------------------------------- +## ----tidy = TRUE-------------------------------------------------------------- # missingness for y is not monotonic diff --git a/inst/doc/missing.Rmd b/inst/doc/missing.Rmd index 4aa2f74..ce4f5d8 100644 --- a/inst/doc/missing.Rmd +++ b/inst/doc/missing.Rmd @@ -7,6 +7,10 @@ vignette: > \usepackage[utf8]{inputenc} --- +```{r chunkname, echo=-1} +data.table::setDTthreads(2) +``` + ```{r, echo = FALSE, message = FALSE} library(simstudy) library(ggplot2) diff --git a/inst/doc/missing.html b/inst/doc/missing.html index 28c4ea8..819dde2 100644 --- a/inst/doc/missing.html +++ b/inst/doc/missing.html @@ -363,13 +363,13 @@

Missing Data

includes two independent predictors, m and u, that largely determine the value of each outcome (subject to random noise).

-
def1 <- defData(varname = "m", dist = "binary", formula = 0.5)
-def1 <- defData(def1, "u", dist = "binary", formula = 0.5)
-def1 <- defData(def1, "x1", dist = "normal", formula = "20*m + 20*u", variance = 2)
-def1 <- defData(def1, "x2", dist = "normal", formula = "20*m + 20*u", variance = 2)
-def1 <- defData(def1, "x3", dist = "normal", formula = "20*m + 20*u", variance = 2)
-
-dtAct <- genData(1000, def1)
+
def1 <- defData(varname = "m", dist = "binary", formula = 0.5)
+def1 <- defData(def1, "u", dist = "binary", formula = 0.5)
+def1 <- defData(def1, "x1", dist = "normal", formula = "20*m + 20*u", variance = 2)
+def1 <- defData(def1, "x2", dist = "normal", formula = "20*m + 20*u", variance = 2)
+def1 <- defData(def1, "x3", dist = "normal", formula = "20*m + 20*u", variance = 2)
+
+dtAct <- genData(1000, def1)

In this example, the missing data mechanism is different for each outcome. As defined below, missingness for x1 is MCAR, since the probability of missing is fixed. Missingness for @@ -378,16 +378,16 @@

Missing Data

for x3 is NMAR, since the probability of missing is dependent on u, an unmeasured predictor of x3:

-
defM <- defMiss(varname = "x1", formula = 0.15, logit.link = FALSE)
-defM <- defMiss(defM, varname = "x2", formula = ".05 + m * 0.25", logit.link = FALSE)
-defM <- defMiss(defM, varname = "x3", formula = ".05 + u * 0.25", logit.link = FALSE)
-defM <- defMiss(defM, varname = "u", formula = 1, logit.link = FALSE)  # not observed
-
-set.seed(283726)
-
-missMat <- genMiss(dtAct, defM, idvars = "id")
-dtObs <- genObs(dtAct, missMat, idvars = "id")
-
missMat
+
defM <- defMiss(varname = "x1", formula = 0.15, logit.link = FALSE)
+defM <- defMiss(defM, varname = "x2", formula = ".05 + m * 0.25", logit.link = FALSE)
+defM <- defMiss(defM, varname = "x3", formula = ".05 + u * 0.25", logit.link = FALSE)
+defM <- defMiss(defM, varname = "u", formula = 1, logit.link = FALSE)  # not observed
+
+set.seed(283726)
+
+missMat <- genMiss(dtAct, defM, idvars = "id")
+dtObs <- genObs(dtAct, missMat, idvars = "id")
+
missMat
##         id x1 x2 x3 u m
 ##    1:    1  0  0  0 1 0
 ##    2:    2  0  0  0 1 0
@@ -400,19 +400,19 @@ 

Missing Data

## 998: 998 0 0 1 1 0 ## 999: 999 0 0 0 1 0 ## 1000: 1000 0 0 0 1 0
-
dtObs
-
##         id m  u    x1      x2    x3
-##    1:    1 0 NA  2.37 -0.0938  1.79
-##    2:    2 0 NA 21.00 19.0643 21.76
-##    3:    3 0 NA    NA -2.1260 -2.04
-##    4:    4 0 NA    NA  0.6345  2.91
-##    5:    5 0 NA    NA      NA 17.11
-##   ---                              
-##  996:  996 0 NA 17.71 19.6705 18.74
-##  997:  997 1 NA    NA 40.8860    NA
-##  998:  998 0 NA 19.83 20.6475    NA
-##  999:  999 1 NA 20.29 23.6448 18.81
-## 1000: 1000 0 NA 20.99 20.3019 19.97
+
dtObs
+
##         id m  u   x1     x2   x3
+##    1:    1 0 NA  2.4 -0.094  1.8
+##    2:    2 0 NA 21.0 19.064 21.8
+##    3:    3 0 NA   NA -2.126 -2.0
+##    4:    4 0 NA   NA  0.635  2.9
+##    5:    5 0 NA   NA     NA 17.1
+##   ---                           
+##  996:  996 0 NA 17.7 19.671 18.7
+##  997:  997 1 NA   NA 40.886   NA
+##  998:  998 0 NA 19.8 20.647   NA
+##  999:  999 1 NA 20.3 23.645 18.8
+## 1000: 1000 0 NA 21.0 20.302 20.0

The impacts of the various data mechanisms on estimation can be seen with a simple calculation of means using both the “true” data set without missing data as a comparison for the “observed” data set. Since @@ -420,24 +420,24 @@

Missing Data

equivalent. However, we can see below that estimates for x2 and x3 are biased, as the difference between observed and actual is not close to 0:

-
# Two functions to calculate means and compare them
-
-rmean <- function(var, digits = 1) {
-    round(mean(var, na.rm = TRUE), digits)
-}
-
-showDif <- function(dt1, dt2, rowName = c("Actual", "Observed", "Difference")) {
-    dt <- data.frame(rbind(dt1, dt2, dt1 - dt2))
-    rownames(dt) <- rowName
-    return(dt)
-}
-
-# data.table functionality to estimate means for each data set
-
-meanAct <- dtAct[, .(x1 = rmean(x1), x2 = rmean(x2), x3 = rmean(x3))]
-meanObs <- dtObs[, .(x1 = rmean(x1), x2 = rmean(x2), x3 = rmean(x3))]
-
-showDif(meanAct, meanObs)
+
# Two functions to calculate means and compare them
+
+rmean <- function(var, digits = 1) {
+    round(mean(var, na.rm = TRUE), digits)
+}
+
+showDif <- function(dt1, dt2, rowName = c("Actual", "Observed", "Difference")) {
+    dt <- data.frame(rbind(dt1, dt2, dt1 - dt2))
+    rownames(dt) <- rowName
+    return(dt)
+}
+
+# data.table functionality to estimate means for each data set
+
+meanAct <- dtAct[, .(x1 = rmean(x1), x2 = rmean(x2), x3 = rmean(x3))]
+meanObs <- dtObs[, .(x1 = rmean(x1), x2 = rmean(x2), x3 = rmean(x3))]
+
+showDif(meanAct, meanObs)
##              x1   x2   x3
 ## Actual     19.8 19.8 19.8
 ## Observed   19.9 18.4 18.0
@@ -445,18 +445,18 @@ 

Missing Data

After adjusting for the measured covariate m, the bias for the estimate of the mean of x2 is mitigated, but not for x3, since u is not observed:

-
meanActm <- dtAct[, .(x1 = rmean(x1), x2 = rmean(x2), x3 = rmean(x3)), keyby = m]
-meanObsm <- dtObs[, .(x1 = rmean(x1), x2 = rmean(x2), x3 = rmean(x3)), keyby = m]
-
# compare observed and actual when m = 0
-
-showDif(meanActm[m == 0, .(x1, x2, x3)], meanObsm[m == 0, .(x1, x2, x3)])
+
meanActm <- dtAct[, .(x1 = rmean(x1), x2 = rmean(x2), x3 = rmean(x3)), keyby = m]
+meanObsm <- dtObs[, .(x1 = rmean(x1), x2 = rmean(x2), x3 = rmean(x3)), keyby = m]
+
# compare observed and actual when m = 0
+
+showDif(meanActm[m == 0, .(x1, x2, x3)], meanObsm[m == 0, .(x1, x2, x3)])
##              x1   x2  x3
 ## Actual      9.7  9.8 9.7
 ## Observed    9.8  9.9 8.4
 ## Difference -0.1 -0.1 1.3
-
# compare observed and actual when m = 1
-
-showDif(meanActm[m == 1, .(x1, x2, x3)], meanObsm[m == 1, .(x1, x2, x3)])
+
# compare observed and actual when m = 1
+
+showDif(meanActm[m == 1, .(x1, x2, x3)], meanObsm[m == 1, .(x1, x2, x3)])
##              x1   x2   x3
 ## Actual     30.4 30.4 30.4
 ## Observed   30.7 31.0 28.6
@@ -477,28 +477,28 @@ 

Longitudinal data with missingness

The following two examples describe an outcome variable y that is measured over time, whose value is a function of time and an observed exposure:

-
# use baseline definitions from the previous example
-
-dtAct <- genData(120, def1)
-dtAct <- trtObserve(dtAct, formulas = 0.5, logit.link = FALSE, grpName = "rx")
-
-# add longitudinal data
-
-defLong <- defDataAdd(varname = "y", dist = "normal", formula = "10 + period*2 + 2 * rx",
-    variance = 2)
-
-dtTime <- addPeriods(dtAct, nPeriods = 4)
-dtTime <- addColumns(defLong, dtTime)
+
# use baseline definitions from the previous example
+
+dtAct <- genData(120, def1)
+dtAct <- trtObserve(dtAct, formulas = 0.5, logit.link = FALSE, grpName = "rx")
+
+# add longitudinal data
+
+defLong <- defDataAdd(varname = "y", dist = "normal", formula = "10 + period*2 + 2 * rx",
+    variance = 2)
+
+dtTime <- addPeriods(dtAct, nPeriods = 4)
+dtTime <- addColumns(defLong, dtTime)

In the first case, missingness is not monotonic; a subject might miss a measurement but return for subsequent measurements:

-
# missingness for y is not monotonic
-
-defMlong <- defMiss(varname = "x1", formula = 0.2, baseline = TRUE)
-defMlong <- defMiss(defMlong, varname = "y", formula = "-1.5 - 1.5 * rx + .25*period",
-    logit.link = TRUE, baseline = FALSE, monotonic = FALSE)
-
-missMatLong <- genMiss(dtTime, defMlong, idvars = c("id", "rx"), repeated = TRUE,
-    periodvar = "period")
+
# missingness for y is not monotonic
+
+defMlong <- defMiss(varname = "x1", formula = 0.2, baseline = TRUE)
+defMlong <- defMiss(defMlong, varname = "y", formula = "-1.5 - 1.5 * rx + .25*period",
+    logit.link = TRUE, baseline = FALSE, monotonic = FALSE)
+
+missMatLong <- genMiss(dtTime, defMlong, idvars = c("id", "rx"), repeated = TRUE,
+    periodvar = "period")

Here is a conceptual plot that shows the pattern of missingness. Each row represents an individual, and each box represents a time period. A box that is colored reflects missing data; a box colored grey reflects @@ -508,14 +508,14 @@

Longitudinal data with missingness

In the second case, missingness is monotonic; once a subject misses a measurement for y, there are no subsequent measurements:

-
# missingness for y is monotonic
-
-defMlong <- defMiss(varname = "x1", formula = 0.2, baseline = TRUE)
-defMlong <- defMiss(defMlong, varname = "y", formula = "-1.8 - 1.5 * rx + .25*period",
-    logit.link = TRUE, baseline = FALSE, monotonic = TRUE)
-
-missMatLong <- genMiss(dtTime, defMlong, idvars = c("id", "rx"), repeated = TRUE,
-    periodvar = "period")
+
# missingness for y is monotonic
+
+defMlong <- defMiss(varname = "x1", formula = 0.2, baseline = TRUE)
+defMlong <- defMiss(defMlong, varname = "y", formula = "-1.8 - 1.5 * rx + .25*period",
+    logit.link = TRUE, baseline = FALSE, monotonic = TRUE)
+
+missMatLong <- genMiss(dtTime, defMlong, idvars = c("id", "rx"), repeated = TRUE,
+    periodvar = "period")

diff --git a/inst/doc/ordinal.R b/inst/doc/ordinal.R index b73b0d3..1fac4c4 100644 --- a/inst/doc/ordinal.R +++ b/inst/doc/ordinal.R @@ -1,4 +1,7 @@ -## ---- echo = FALSE, message = FALSE------------------------------------------- +## ----chunkname, echo=-1------------------------------------------------------- +data.table::setDTthreads(2) + +## ----echo = FALSE, message = FALSE-------------------------------------------- library(simstudy) library(ggplot2) library(scales) @@ -159,7 +162,7 @@ summary(clmFit) ## ----------------------------------------------------------------------------- logOdds.expos - logOdds.unexp -## ---- echo=FALSE, fig.width=6, fig.height=3.5--------------------------------- +## ----echo=FALSE, fig.width=6, fig.height=3.5---------------------------------- getCumProbs <- function(coefs) { cumprob0 <- data.table( @@ -264,7 +267,7 @@ dX <- genOrdCat(dT_1_cat, baseprobs = baseprobs, adjVar = "z", ## ----------------------------------------------------------------------------- logOdds.expos - logOdds.unexp -## ---- echo=FALSE, fig.width=6, fig.height=3.5, warning=FALSE------------------ +## ----echo=FALSE, fig.width=6, fig.height=3.5, warning=FALSE------------------- fitPlot(dX) ## ----------------------------------------------------------------------------- diff --git a/inst/doc/ordinal.Rmd b/inst/doc/ordinal.Rmd index 162d5f6..ba365ff 100644 --- a/inst/doc/ordinal.Rmd +++ b/inst/doc/ordinal.Rmd @@ -7,6 +7,10 @@ vignette: > \usepackage[utf8]{inputenc} --- +```{r chunkname, echo=-1} +data.table::setDTthreads(2) +``` + ```{r, echo = FALSE, message = FALSE} library(simstudy) library(ggplot2) diff --git a/inst/doc/ordinal.html b/inst/doc/ordinal.html index 041067c..e754852 100644 --- a/inst/doc/ordinal.html +++ b/inst/doc/ordinal.html @@ -414,21 +414,21 @@

The cumulative proportional odds model

Simulation

To generate ordered categorical data using simstudy, there is a function genOrdCat.

-
baseprobs <- c(0.31, 0.29, .20, 0.20)
-
-defA <- defData(varname = "exposed", formula = "1;1", dist = "trtAssign")
-defA <- defData(defA, varname = "z", formula = "-0.7*exposed", dist = "nonrandom")
-
-set.seed(130)
-
-dT_1_cat <- genData(25000, defA)
-
-dX <- genOrdCat(dT_1_cat, adjVar = "z", baseprobs, catVar = "r")
+
baseprobs <- c(0.31, 0.29, .20, 0.20)
+
+defA <- defData(varname = "exposed", formula = "1;1", dist = "trtAssign")
+defA <- defData(defA, varname = "z", formula = "-0.7*exposed", dist = "nonrandom")
+
+set.seed(130)
+
+dT_1_cat <- genData(25000, defA)
+
+dX <- genOrdCat(dT_1_cat, adjVar = "z", baseprobs, catVar = "r")

Estimating the parameters of the model using function clm, we can recover the original parameters quite well.

-
library(ordinal)
-clmFit <- clm(r ~ exposed, data = dX)
-summary(clmFit)
+
library(ordinal)
+clmFit <- clm(r ~ exposed, data = dX)
+summary(clmFit)
## formula: r ~ exposed
 ## data:    dX
 ## 
@@ -453,16 +453,16 @@ 

Simulation

match the thresholds for the unexposed group.

The log of the cumulative odds for groups 1 to 4 from the data without exposure are

-
(logOdds.unexp <- log(odds(cumsum(dX[exposed == 0, prop.table(table(r))])))[1:3])
+
(logOdds.unexp <- log(odds(cumsum(dX[exposed == 0, prop.table(table(r))])))[1:3])
##     1     2     3 
 ## -0.83  0.38  1.36

And under exposure:

-
(logOdds.expos <- log(odds(cumsum(dX[exposed == 1, prop.table(table(r))])))[1:3])
+
(logOdds.expos <- log(odds(cumsum(dX[exposed == 1, prop.table(table(r))])))[1:3])
##      1      2      3 
 ## -0.084  1.135  2.147

The log of the cumulative odds ratios for each of the four groups is

-
logOdds.expos - logOdds.unexp
+
logOdds.expos - logOdds.unexp
##    1    2    3 
 ## 0.75 0.76 0.79

A plot of the modeled cumulative probabilities (the lines) shows that @@ -488,22 +488,22 @@

Non-proportional odds

violated, and npAdj indicates the various shifts of the intervals. (Note that the last cut point is at Inf, so there is no impact of a shift related to that threshold.)

-
baseprobs <- c(0.31, 0.29, .20, 0.20)
-npAdj <- c(-0.4, 0.0, -1.7, 0)
-
-dX <- genOrdCat(dT_1_cat, baseprobs = baseprobs, adjVar = "z",
-                catVar = "r", npVar = "exposed", npAdj = npAdj)
+
baseprobs <- c(0.31, 0.29, .20, 0.20)
+npAdj <- c(-0.4, 0.0, -1.7, 0)
+
+dX <- genOrdCat(dT_1_cat, baseprobs = baseprobs, adjVar = "z",
+                catVar = "r", npVar = "exposed", npAdj = npAdj)

The calculation of the log cumulative odds follows as before:

-
(logOdds.unexp <- log(odds(cumsum(dX[exposed == 0, prop.table(table(r))])))[1:3])
+
(logOdds.unexp <- log(odds(cumsum(dX[exposed == 0, prop.table(table(r))])))[1:3])
##     1     2     3 
 ## -0.80  0.42  1.40

And under exposure:

-
(logOdds.expos <- log(odds(cumsum(dX[exposed == 1, prop.table(table(r))])))[1:3])
+
(logOdds.expos <- log(odds(cumsum(dX[exposed == 1, prop.table(table(r))])))[1:3])
##    1    2    3 
 ## 0.29 1.10 3.72

But, now, the log of the cumulative odds ratios for each of the four groups varies across the different levels.

-
logOdds.expos - logOdds.unexp
+
logOdds.expos - logOdds.unexp
##    1    2    3 
 ## 1.09 0.69 2.32

This is confirmed by a plot of the model fit with a proportional odds @@ -532,23 +532,23 @@

Correlated multivariate ordinal data

possible responses: “none”, “some”, “a lot”. The probabilities of response are specified in a \(5 \times 3\) matrix, and the rows sum to 1:

-
baseprobs <- matrix(c(0.2, 0.1, 0.7,
-                      0.7, 0.2, 0.1,
-                      0.5, 0.2, 0.3,
-                      0.4, 0.2, 0.4,
-                      0.6, 0.2, 0.2), 
-                    nrow = 5, byrow = TRUE)
-
-# generate the data
-
-set.seed(333)                     
-dT_5_cat <- genData(10000)
-
-dX <- genOrdCat(dT_5_cat, adjVar = NULL, baseprobs = baseprobs, 
-                   prefix = "q", rho = 0.15, corstr = "cs", asFactor = FALSE)
+
baseprobs <- matrix(c(0.2, 0.1, 0.7,
+                      0.7, 0.2, 0.1,
+                      0.5, 0.2, 0.3,
+                      0.4, 0.2, 0.4,
+                      0.6, 0.2, 0.2), 
+                    nrow = 5, byrow = TRUE)
+
+# generate the data
+
+set.seed(333)                     
+dT_5_cat <- genData(10000)
+
+dX <- genOrdCat(dT_5_cat, adjVar = NULL, baseprobs = baseprobs, 
+                   prefix = "q", rho = 0.15, corstr = "cs", asFactor = FALSE)

The observed correlation of the items is slightly less than the specified correlations as expected:

-
round(dX[, cor(cbind(q1, q2, q3, q4, q5))], 2)
+
round(dX[, cor(cbind(q1, q2, q3, q4, q5))], 2)
##      q1   q2   q3   q4   q5
 ## q1 1.00 0.08 0.10 0.10 0.08
 ## q2 0.08 1.00 0.09 0.09 0.09
@@ -557,20 +557,20 @@ 

Correlated multivariate ordinal data

## q5 0.08 0.09 0.11 0.10 1.00

However, the marginal probability distributions of each item match quite closely with the specified probabilities:

-
dM <- melt(dX, id.vars = "id")
-dProp <- dM[ , prop.table(table(value)), by = variable]
-dProp[, response := rep(seq(3), 5)]
-
-# observed probabilities
-dcast(dProp, variable ~ response, value.var = "V1", fill = 0)
+
dM <- melt(dX, id.vars = "id")
+dProp <- dM[ , prop.table(table(value)), by = variable]
+dProp[, response := rep(seq(3), 5)]
+
+# observed probabilities
+dcast(dProp, variable ~ response, value.var = "V1", fill = 0)
##    variable    1    2    3
 ## 1:       q1 0.20 0.10 0.70
 ## 2:       q2 0.69 0.21 0.10
 ## 3:       q3 0.50 0.20 0.30
 ## 4:       q4 0.40 0.20 0.40
 ## 5:       q5 0.60 0.20 0.21
-
# specified probabilities
-baseprobs
+
# specified probabilities
+baseprobs
##      [,1] [,2] [,3]
 ## [1,]  0.2  0.1  0.7
 ## [2,]  0.7  0.2  0.1
@@ -581,23 +581,23 @@ 

Correlated multivariate ordinal data

AR-1, so the correlation between questions closer to each other is higher than for questions farther apart. But the probability distributions are unaffected:

-
dX <- genOrdCat(dT_5_cat, adjVar = NULL, baseprobs = baseprobs, 
-                   prefix = "q", rho = 0.40, corstr = "ar1", asFactor = FALSE)
-
-# correlation
-round(dX[, cor(cbind(q1, q2, q3, q4, q5))], 2)
+
dX <- genOrdCat(dT_5_cat, adjVar = NULL, baseprobs = baseprobs, 
+                   prefix = "q", rho = 0.40, corstr = "ar1", asFactor = FALSE)
+
+# correlation
+round(dX[, cor(cbind(q1, q2, q3, q4, q5))], 2)
##      q1   q2   q3   q4   q5
 ## q1 1.00 0.22 0.10 0.05 0.02
 ## q2 0.22 1.00 0.29 0.10 0.03
 ## q3 0.10 0.29 1.00 0.31 0.11
 ## q4 0.05 0.10 0.31 1.00 0.29
 ## q5 0.02 0.03 0.11 0.29 1.00
-
dM <- melt(dX, id.vars = "id")
-dProp <- dM[ , prop.table(table(value)), by = variable]
-dProp[, response := rep(seq(3), 5)]
-
-# probabilities
-dcast(dProp, variable ~ response, value.var = "V1", fill = 0)
+
dM <- melt(dX, id.vars = "id")
+dProp <- dM[ , prop.table(table(value)), by = variable]
+dProp[, response := rep(seq(3), 5)]
+
+# probabilities
+dcast(dProp, variable ~ response, value.var = "V1", fill = 0)
##    variable    1    2     3
 ## 1:       q1 0.20 0.10 0.692
 ## 2:       q2 0.70 0.20 0.099
diff --git a/inst/doc/simstudy.R b/inst/doc/simstudy.R
index 4cdbdc0..f3934d0 100644
--- a/inst/doc/simstudy.R
+++ b/inst/doc/simstudy.R
@@ -1,4 +1,7 @@
-## ---- echo = FALSE, message = FALSE-------------------------------------------
+## ----chunkname, echo=-1-------------------------------------------------------
+data.table::setDTthreads(2)
+
+## ----echo = FALSE, message = FALSE--------------------------------------------
 
 library(simstudy)
 library(data.table)
@@ -30,7 +33,7 @@ ggtheme <- function(panelback = "white") {
   
 }
 
-## ----  echo=FALSE-------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
 def <- defData(varname="age", dist="normal", formula=10, variance = 2)
 def <- defData(def, varname="female", dist="binary", 
     formula="-2 + age * 0.1", link = "logit")
@@ -107,7 +110,7 @@ for (i in seq_along(age_effects)) {
 
 list_of_data
 
-## ----  echo=FALSE-------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
 d <- list()
 d[[1]] <- data.table("beta", "mean", "both", "-", "dispersion", "X", "-", "X") 
 d[[2]] <- data.table("binary", "probability", "both", "-", "-", "X", "-", "X") 
@@ -169,7 +172,7 @@ dc <- defCondition(dc, condition = "x > 2", formula = "-5 + 4*x",
 dd <- genData(1000, d)
 dd <- addCondition(dc, dd, newvar = "y")
 
-## ---- fig.width = 5, fig.height = 3, echo=FALSE, message=FALSE----------------
+## ----fig.width = 5, fig.height = 3, echo=FALSE, message=FALSE-----------------
 ggplot(data = dd, aes(y = y, x = x)) +
   geom_point(color = " grey60", size = .5) +
   geom_smooth(se = FALSE, size = .5) +
diff --git a/inst/doc/simstudy.Rmd b/inst/doc/simstudy.Rmd
index 654a157..a97541b 100644
--- a/inst/doc/simstudy.Rmd
+++ b/inst/doc/simstudy.Rmd
@@ -7,6 +7,10 @@ vignette: >
   \usepackage[utf8]{inputenc}
 ---
 
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
+
 ```{r, echo = FALSE, message = FALSE}
 
 library(simstudy)
diff --git a/inst/doc/simstudy.html b/inst/doc/simstudy.html
index 5ca6b89..fe38f3c 100644
--- a/inst/doc/simstudy.html
+++ b/inst/doc/simstudy.html
@@ -425,12 +425,12 @@ 

Defining the Data

can be read in with a call to defRead. An alternative is to make repeated calls to the function defData. This script builds a definition table internally:

-
def <- defData(varname = "age", dist = "normal", formula = 10,
-    variance = 2)
-def <- defData(def, varname = "female", dist = "binary",
-    formula = "-2 + age * 0.1", link = "logit")
-def <- defData(def, varname = "visits", dist = "poisson",
-    formula = "1.5 - 0.2 * age + 0.5 * female", link = "log")
+
def <- defData(varname = "age", dist = "normal", formula = 10,
+    variance = 2)
+def <- defData(def, varname = "female", dist = "binary",
+    formula = "-2 + age * 0.1", link = "logit")
+def <- defData(def, varname = "visits", dist = "poisson",
+    formula = "1.5 - 0.2 * age + 0.5 * female", link = "log")

The data definition table includes a row for each variable that is to be generated, and has the following fields: varname*, formula, variance, dist, and @@ -462,10 +462,10 @@

Generating the data

example, 1,000 observations are generated using the data set definitions in def, and then stored in the object dd:

-
set.seed(87261)
-
-dd <- genData(1000, def)
-dd
+
set.seed(87261)
+
+dd <- genData(1000, def)
+dd
##         id   age female visits
 ##    1:    1  9.78      0      0
 ##    2:    2 10.81      0      0
@@ -480,7 +480,7 @@ 

Generating the data

## 1000: 1000 10.80 1 2

If no data definition is provided, a simple data set is created with just id’s.

-
genData(1000)
+
genData(1000)
##         id
 ##    1:    1
 ##    2:    2
@@ -503,9 +503,9 @@ 

Assigning treatment/exposure

generate more involved types of study designs. In particular, with trtAssign, balanced and stratified designs are possible.

-
study1 <- trtAssign(dd, n = 3, balanced = TRUE, strata = c("female"),
-    grpName = "rx")
-study1
+
study1 <- trtAssign(dd, n = 3, balanced = TRUE, strata = c("female"),
+    grpName = "rx")
+study1
##         id   age female visits rx
 ##    1:    1  9.78      0      0  3
 ##    2:    2 10.81      0      0  1
@@ -518,7 +518,7 @@ 

Assigning treatment/exposure

## 998: 998 6.84 0 1 1 ## 999: 999 9.28 0 2 1 ## 1000: 1000 10.80 1 2 3
-
study1[, .N, keyby = .(female, rx)]
+
study1[, .N, keyby = .(female, rx)]
##    female rx   N
 ## 1:      0  1 249
 ## 2:      0  2 248
@@ -548,26 +548,26 @@ 

Formulas

function of both age and female. Since age is the first row in the table, we had to use a scalar to define the mean.

-
def <- defData(varname = "age", dist = "normal", formula = 10,
-    variance = 2)
-def <- defData(def, varname = "female", dist = "binary",
-    formula = "-2 + age * 0.1", link = "logit")
-def <- defData(def, varname = "visits", dist = "poisson",
-    formula = "1.5 - 0.2 * age + 0.5 * female", link = "log")
+
def <- defData(varname = "age", dist = "normal", formula = 10,
+    variance = 2)
+def <- defData(def, varname = "female", dist = "binary",
+    formula = "-2 + age * 0.1", link = "logit")
+def <- defData(def, varname = "visits", dist = "poisson",
+    formula = "1.5 - 0.2 * age + 0.5 * female", link = "log")

Formulas can include R functions or user-defined functions. Here is an example with a user-defined function myinv and the log function from base R:

-
myinv <- function(x) {
-    1/x
-}
-
-def <- defData(varname = "age", formula = 10, variance = 2,
-    dist = "normal")
-def <- defData(def, varname = "loginvage", formula = "log(myinv(age))",
-    variance = 0.1, dist = "normal")
-
-genData(5, def)
+
myinv <- function(x) {
+    1/x
+}
+
+def <- defData(varname = "age", formula = 10, variance = 2,
+    dist = "normal")
+def <- defData(def, varname = "loginvage", formula = "log(myinv(age))",
+    variance = 0.1, dist = "normal")
+
+genData(5, def)
##    id   age loginvage
 ## 1:  1 10.31     -2.58
 ## 2:  2  7.90     -1.94
@@ -581,12 +581,12 @@ 

Formulas

of a data definition table. In this case, we are changing the formula of loginvage in def so that it uses the function log10 instead of log:

-
def10 <- updateDef(def, changevar = "loginvage", newformula = "log10(myinv(age))")
-def10
+
def10 <- updateDef(def, changevar = "loginvage", newformula = "log10(myinv(age))")
+def10
##      varname           formula variance   dist     link
 ## 1:       age                10        2 normal identity
 ## 2: loginvage log10(myinv(age))      0.1 normal identity
-
genData(5, def10)
+
genData(5, def10)
##    id   age loginvage
 ## 1:  1  9.82    -0.338
 ## 2:  2 10.97    -0.633
@@ -597,32 +597,32 @@ 

Formulas

dynamic definition tables is the external reference capability through the double-dot notation. Any variable reference in a formula that is preceded by “..” refers to an externally defined variable:

-
age_effect <- 3
-
-def <- defData(varname = "age", formula = 10, variance = 2,
-    dist = "normal")
-def <- defData(def, varname = "agemult", formula = "age * ..age_effect",
-    dist = "nonrandom")
-
-def
+
age_effect <- 3
+
+def <- defData(varname = "age", formula = 10, variance = 2,
+    dist = "normal")
+def <- defData(def, varname = "agemult", formula = "age * ..age_effect",
+    dist = "nonrandom")
+
+def
##    varname            formula variance      dist     link
 ## 1:     age                 10        2    normal identity
 ## 2: agemult age * ..age_effect        0 nonrandom identity
-
genData(2, def)
+
genData(2, def)
##    id  age agemult
 ## 1:  1 9.69    29.1
 ## 2:  2 9.63    28.9

But the real power of dynamic definition is seen in the context of iteration over multiple values:

-
age_effects <- c(0, 5, 10)
-list_of_data <- list()
-
-for (i in seq_along(age_effects)) {
-    age_effect <- age_effects[i]
-    list_of_data[[i]] <- genData(2, def)
-}
-
-list_of_data
+
age_effects <- c(0, 5, 10)
+list_of_data <- list()
+
+for (i in seq_along(age_effects)) {
+    age_effect <- age_effects[i]
+    list_of_data[[i]] <- genData(2, def)
+}
+
+list_of_data
## [[1]]
 ##    id  age agemult
 ## 1:  1 11.4       0
@@ -1026,14 +1026,14 @@ 

Generating multiple variables with a single definition

specified as they are in the defData function. Calls to defRepeat can be integrated with calls to defData.

-
def <- defRepeat(nVars = 4, prefix = "g", formula = "1/3;1/3;1/3",
-    variance = 0, dist = "categorical")
-def <- defData(def, varname = "a", formula = "1;1", dist = "trtAssign")
-def <- defRepeat(def, 3, "b", formula = "5 + a", variance = 3,
-    dist = "normal")
-def <- defData(def, "y", formula = "0.10", dist = "binary")
-
-def
+
def <- defRepeat(nVars = 4, prefix = "g", formula = "1/3;1/3;1/3",
+    variance = 0, dist = "categorical")
+def <- defData(def, varname = "a", formula = "1;1", dist = "trtAssign")
+def <- defRepeat(def, 3, "b", formula = "5 + a", variance = 3,
+    dist = "normal")
+def <- defData(def, "y", formula = "0.10", dist = "binary")
+
+def
##    varname     formula variance        dist     link
 ## 1:      g1 1/3;1/3;1/3        0 categorical identity
 ## 2:      g2 1/3;1/3;1/3        0 categorical identity
@@ -1067,26 +1067,26 @@ 

defDataAdd/defRepeatAdd/readDataAdd and addColumns

in these “add-ing” functions are permitted to refer to fields that exist in the data set to be augmented, so all variables need not be defined in the current definition table.

-
d1 <- defData(varname = "x1", formula = 0, variance = 1,
-    dist = "normal")
-d1 <- defData(d1, varname = "x2", formula = 0.5, dist = "binary")
-
-d2 <- defRepeatAdd(nVars = 2, prefix = "q", formula = "5 + 3*rx",
-    variance = 4, dist = "normal")
-d2 <- defDataAdd(d2, varname = "y", formula = "-2 + 0.5*x1 + 0.5*x2 + 1*rx",
-    dist = "binary", link = "logit")
-
-dd <- genData(5, d1)
-dd <- trtAssign(dd, nTrt = 2, grpName = "rx")
-dd
+
d1 <- defData(varname = "x1", formula = 0, variance = 1,
+    dist = "normal")
+d1 <- defData(d1, varname = "x2", formula = 0.5, dist = "binary")
+
+d2 <- defRepeatAdd(nVars = 2, prefix = "q", formula = "5 + 3*rx",
+    variance = 4, dist = "normal")
+d2 <- defDataAdd(d2, varname = "y", formula = "-2 + 0.5*x1 + 0.5*x2 + 1*rx",
+    dist = "binary", link = "logit")
+
+dd <- genData(5, d1)
+dd <- trtAssign(dd, nTrt = 2, grpName = "rx")
+dd
##    id      x1 x2 rx
 ## 1:  1 -1.3230  1  0
 ## 2:  2 -0.0494  0  1
 ## 3:  3 -0.4064  1  0
 ## 4:  4 -0.5562  1  0
 ## 5:  5 -0.0941  0  1
-
dd <- addColumns(d2, dd)
-dd
+
dd <- addColumns(d2, dd)
+dd
##    id      x1 x2 rx    q1    q2 y
 ## 1:  1 -1.3230  1  0 4.589  5.70 0
 ## 2:  2 -0.0494  0  1 9.829 11.74 1
@@ -1108,17 +1108,17 @@ 

defCondition and addCondition

variance, and link arguments.

In this example, the slope of a regression line of \(y\) on \(x\) varies depending on the value of the predictor \(x\):

-
d <- defData(varname = "x", formula = 0, variance = 9, dist = "normal")
-
-dc <- defCondition(condition = "x <= -2", formula = "4 + 3*x",
-    variance = 2, dist = "normal")
-dc <- defCondition(dc, condition = "x > -2 & x <= 2", formula = "0 + 1*x",
-    variance = 4, dist = "normal")
-dc <- defCondition(dc, condition = "x > 2", formula = "-5 + 4*x",
-    variance = 3, dist = "normal")
-
-dd <- genData(1000, d)
-dd <- addCondition(dc, dd, newvar = "y")
+
d <- defData(varname = "x", formula = 0, variance = 9, dist = "normal")
+
+dc <- defCondition(condition = "x <= -2", formula = "4 + 3*x",
+    variance = 2, dist = "normal")
+dc <- defCondition(dc, condition = "x > -2 & x <= 2", formula = "0 + 1*x",
+    variance = 4, dist = "normal")
+dc <- defCondition(dc, condition = "x > 2", formula = "-5 + 4*x",
+    variance = 3, dist = "normal")
+
+dd <- genData(1000, d)
+dd <- addCondition(dc, dd, newvar = "y")

diff --git a/inst/doc/spline.R b/inst/doc/spline.R index dd446ee..15c6adf 100644 --- a/inst/doc/spline.R +++ b/inst/doc/spline.R @@ -1,4 +1,7 @@ -## ---- echo = FALSE, message = FALSE------------------------------------------- +## ----chunkname, echo=-1------------------------------------------------------- +data.table::setDTthreads(2) + +## ----echo = FALSE, message = FALSE-------------------------------------------- library(simstudy) library(ggplot2) library(scales) @@ -29,7 +32,7 @@ ggtheme <- function(panelback = "white") { } -## ---- fig.width = 6, fig.height = 2.5----------------------------------------- +## ----fig.width = 6, fig.height = 2.5------------------------------------------ knots <- c(0.25, 0.50, 0.75) viewBasis(knots, degree = 2) @@ -37,7 +40,7 @@ knots <- c(0.20, 0.40, 0.60, 0.80) viewBasis(knots, degree = 3) -## ---- fig.width = 6, fig.height = 2.5----------------------------------------- +## ----fig.width = 6, fig.height = 2.5------------------------------------------ knots <- c(0.25, 0.5, 0.75) # number of elements in theta: length(knots) + degree + 1 @@ -61,7 +64,7 @@ ddef <- defData(varname = "age", formula = "20;60", dist = "uniform") theta1 = c(0.1, 0.8, 0.6, 0.4, 0.6, 0.9, 0.9) knots <- c(0.25, 0.5, 0.75) -## ---- fig.width = 6, fig.height = 2.5----------------------------------------- +## ----fig.width = 6, fig.height = 2.5------------------------------------------ viewSplines(knots = knots, theta = theta1, degree = 3) ## ----------------------------------------------------------------------------- @@ -74,14 +77,14 @@ dt <- genSpline(dt = dt, newvar = "weight", newrange = "90;160", noise.var = 64) -## ---- fig.width = 6, fig.height = 3, message = FALSE-------------------------- +## ----fig.width = 6, fig.height = 3, message = FALSE--------------------------- ggplot(data = dt, aes(x=age, y=weight)) + geom_point(color = "grey65", size = 0.75) + geom_smooth(se=FALSE, color="red", size = 1, method = "auto") + geom_vline(xintercept = quantile(dt$age, knots)) + theme(panel.grid.minor = element_blank()) -## ---- fig.width = 6, fig.height = 3------------------------------------------- +## ----fig.width = 6, fig.height = 3-------------------------------------------- # normalize age for best basis functions dt[, nage := (age - min(age))/(max(age) - min(age))] diff --git a/inst/doc/spline.Rmd b/inst/doc/spline.Rmd index 5a18e33..8d07ae4 100644 --- a/inst/doc/spline.Rmd +++ b/inst/doc/spline.Rmd @@ -7,6 +7,10 @@ vignette: > \usepackage[utf8]{inputenc} --- +```{r chunkname, echo=-1} +data.table::setDTthreads(2) +``` + ```{r, echo = FALSE, message = FALSE} library(simstudy) library(ggplot2) diff --git a/inst/doc/spline.html b/inst/doc/spline.html index f5a050a..f16375a 100644 --- a/inst/doc/spline.html +++ b/inst/doc/spline.html @@ -358,11 +358,11 @@

Spline Data

First, we can look at the basis functions, which depend only on the knots and degree. The knots are specified as quantiles, between 0 and 1:

-
knots <- c(0.25, 0.50, 0.75)
-viewBasis(knots, degree = 2)
+
knots <- c(0.25, 0.50, 0.75)
+viewBasis(knots, degree = 2)

-
knots <- c(0.20, 0.40, 0.60, 0.80)
-viewBasis(knots, degree = 3)
+
knots <- c(0.20, 0.40, 0.60, 0.80)
+viewBasis(knots, degree = 3)

The splines themselves are specified as linear combinations of each of the basis functions. The coefficients of those combinations are @@ -371,19 +371,19 @@

Spline Data

exploring, we can look at a single curve or multiple curves, depending on whether or not we specify theta as a vector (single) or matrix (multiple).

-
knots <- c(0.25, 0.5, 0.75)
-
-# number of elements in theta: length(knots) + degree + 1
-theta1 = c(0.1, 0.8, 0.4, 0.9, 0.2, 1.0) 
-
-viewSplines(knots, degree = 2, theta1)
+
knots <- c(0.25, 0.5, 0.75)
+
+# number of elements in theta: length(knots) + degree + 1
+theta1 = c(0.1, 0.8, 0.4, 0.9, 0.2, 1.0) 
+
+viewSplines(knots, degree = 2, theta1)

-
theta2 = matrix(c(0.1, 0.2, 0.4, 0.9, 0.2, 0.3, 0.6, 
-                  0.1, 0.3, 0.3, 0.8, 1.0, 0.9, 0.4, 
-                  0.1, 0.9, 0.8, 0.2, 0.1, 0.6, 0.1),
-               ncol = 3)
-
-theta2
+
theta2 = matrix(c(0.1, 0.2, 0.4, 0.9, 0.2, 0.3, 0.6, 
+                  0.1, 0.3, 0.3, 0.8, 1.0, 0.9, 0.4, 
+                  0.1, 0.9, 0.8, 0.2, 0.1, 0.6, 0.1),
+               ncol = 3)
+
+theta2
##      [,1] [,2] [,3]
 ## [1,]  0.1  0.1  0.1
 ## [2,]  0.2  0.3  0.9
@@ -392,63 +392,63 @@ 

Spline Data

## [5,] 0.2 1.0 0.1 ## [6,] 0.3 0.9 0.6 ## [7,] 0.6 0.4 0.1
-
viewSplines(knots, degree = 3, theta2)
+
viewSplines(knots, degree = 3, theta2)

We can generate data using a predictor in an existing data set by specifying the knots (in terms of quantiles), a vector of coefficients in theta, the degree of the polynomial, as well as a range for the newly generated variable:

-
ddef <- defData(varname = "age", formula = "20;60", dist = "uniform")
-
-theta1 = c(0.1, 0.8, 0.6, 0.4, 0.6, 0.9, 0.9)
-knots <- c(0.25, 0.5, 0.75)
+
ddef <- defData(varname = "age", formula = "20;60", dist = "uniform")
+
+theta1 = c(0.1, 0.8, 0.6, 0.4, 0.6, 0.9, 0.9)
+knots <- c(0.25, 0.5, 0.75)

Here is the shape of the curve that we want to generate data from:

-
viewSplines(knots = knots, theta = theta1, degree = 3)
+
viewSplines(knots = knots, theta = theta1, degree = 3)

Now we specify the variables in the data set and generate the data:

-
set.seed(234)
-
-dt <- genData(1000, ddef)
-dt <- genSpline(dt = dt, newvar = "weight",
-                predictor = "age", theta = theta1,
-                knots = knots, degree = 3,
-                newrange = "90;160",
-                noise.var = 64)
+
set.seed(234)
+
+dt <- genData(1000, ddef)
+dt <- genSpline(dt = dt, newvar = "weight",
+                predictor = "age", theta = theta1,
+                knots = knots, degree = 3,
+                newrange = "90;160",
+                noise.var = 64)

Here’s a plot of the data with a smoothed line fit to the data:

-
ggplot(data = dt, aes(x=age, y=weight)) +
-  geom_point(color = "grey65", size = 0.75) +
-  geom_smooth(se=FALSE, color="red", size = 1, method = "auto") +
-  geom_vline(xintercept = quantile(dt$age, knots)) +
-  theme(panel.grid.minor = element_blank())
+
ggplot(data = dt, aes(x=age, y=weight)) +
+  geom_point(color = "grey65", size = 0.75) +
+  geom_smooth(se=FALSE, color="red", size = 1, method = "auto") +
+  geom_vline(xintercept = quantile(dt$age, knots)) +
+  theme(panel.grid.minor = element_blank())

Finally, we will fit three different spline models to the data - a linear, a quadratic, and a cubic - and plot the predicted values:

-
# normalize age for best basis functions
-dt[, nage := (age - min(age))/(max(age) - min(age))] 
-
-# fit a cubic spline
-lmfit3 <- lm(weight ~ bs(x = nage, knots = knots, degree = 3, intercept = TRUE) - 1, data = dt)
-
-# fit a quadtratic spline
-lmfit2 <- lm(weight ~ bs(x = nage, knots = knots, degree = 2), data = dt)
-
-# fit a  linear spline
-lmfit1 <- lm(weight ~ bs(x = nage, knots = knots, degree = 1), data = dt)
-
-# add predicted values for plotting
-dt[, pred.3deg := predict(lmfit3)]
-dt[, pred.2deg := predict(lmfit2)]
-dt[, pred.1deg := predict(lmfit1)]
-
-ggplot(data = dt, aes(x=age, y=weight)) +
-  geom_point(color = "grey65", size = 0.75) +
-  geom_line(aes(x=age, y = pred.3deg), color = "#1B9E77", size = 1) +
-  geom_line(aes(x=age, y = pred.2deg), color = "#D95F02", size = 1) +
-  geom_line(aes(x=age, y = pred.1deg), color = "#7570B3", size = 1) +
-  geom_vline(xintercept = quantile(dt$age, knots)) +
-  theme(panel.grid.minor = element_blank())
+
# normalize age for best basis functions
+dt[, nage := (age - min(age))/(max(age) - min(age))] 
+
+# fit a cubic spline
+lmfit3 <- lm(weight ~ bs(x = nage, knots = knots, degree = 3, intercept = TRUE) - 1, data = dt)
+
+# fit a quadratic spline
+lmfit2 <- lm(weight ~ bs(x = nage, knots = knots, degree = 2), data = dt)
+
+# fit a linear spline
+lmfit1 <- lm(weight ~ bs(x = nage, knots = knots, degree = 1), data = dt)
+
+# add predicted values for plotting
+dt[, pred.3deg := predict(lmfit3)]
+dt[, pred.2deg := predict(lmfit2)]
+dt[, pred.1deg := predict(lmfit1)]
+
+ggplot(data = dt, aes(x=age, y=weight)) +
+  geom_point(color = "grey65", size = 0.75) +
+  geom_line(aes(x=age, y = pred.3deg), color = "#1B9E77", size = 1) +
+  geom_line(aes(x=age, y = pred.2deg), color = "#D95F02", size = 1) +
+  geom_line(aes(x=age, y = pred.1deg), color = "#7570B3", size = 1) +
+  geom_vline(xintercept = quantile(dt$age, knots)) +
+  theme(panel.grid.minor = element_blank())

diff --git a/inst/doc/survival.R b/inst/doc/survival.R index c5b455a..233350e 100644 --- a/inst/doc/survival.R +++ b/inst/doc/survival.R @@ -1,4 +1,7 @@ -## ---- echo = FALSE, message = FALSE------------------------------------------- +## ----chunkname, echo=-1------------------------------------------------------- +data.table::setDTthreads(2) + +## ----echo = FALSE, message = FALSE-------------------------------------------- library(simstudy) # library(ggplot2) library(scales) @@ -8,7 +11,7 @@ library(survival) library(gee) library(data.table) -## ---- echo = FALSE------------------------------------------------------------ +## ----echo = FALSE------------------------------------------------------------- plotcolors <- c("#B84226", "#1B8445", "#1C5974") cbbPalette <- c("#B84226","#B88F26", "#A5B435", "#1B8446", @@ -29,7 +32,7 @@ ggtheme <- function(panelback = "white") { } -## ---- tidy = TRUE------------------------------------------------------------- +## ----tidy = TRUE-------------------------------------------------------------- # Baseline data definitions @@ -46,7 +49,7 @@ sdef <- defSurv(sdef, varname = "censorTime", scale = 80, shape = 1) sdef -## ---- tidy = TRUE------------------------------------------------------------- +## ----tidy = TRUE-------------------------------------------------------------- # Baseline data definitions @@ -60,7 +63,7 @@ head(dtSurv) dtSurv[,round(mean(survTime),1), keyby = .(grp,x1)] -## ---- tidy = TRUE------------------------------------------------------------- +## ----tidy = TRUE-------------------------------------------------------------- dtSurv <- genData(300, def) dtSurv <- genSurv(dtSurv, sdef, timeName = "obsTime", censorName = "censorTime", eventName = "status", @@ -72,7 +75,7 @@ head(dtSurv) dtSurv[,round(1-mean(status),2), keyby = .(grp,x1)] -## ---- tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 3.5, warning=FALSE---- +## ----tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 3.5, warning=FALSE---- fit <- survfit(Surv(obsTime, status) ~ x1+grp, data=dtSurv) survminer::ggsurvplot(fit, data = dtSurv, @@ -83,7 +86,7 @@ survminer::ggsurvplot(fit, data = dtSurv, # panel.grid = ggplot2::element_blank()) ) -## ---- tidy = TRUE------------------------------------------------------------- +## ----tidy = TRUE-------------------------------------------------------------- # Baseline data definitions @@ -101,7 +104,7 @@ dtSurv <- genSurv(dtSurv, sdef, timeName = "obsTime", coxfit <- survival::coxph(Surv(obsTime, status) ~ x1 + x2, data = dtSurv) -## ---- echo=FALSE-------------------------------------------------------------- +## ----echo=FALSE--------------------------------------------------------------- gtsummary::tbl_regression(coxfit) ## ----------------------------------------------------------------------------- @@ -122,7 +125,7 @@ dtSurv <- addCompRisk(dtSurv, events = c("event_1", "event_2", "censor"), timeName = "time", censorName = "censor") dtSurv -## ---- tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 3.5, warning=FALSE---- +## ----tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 3.5, warning=FALSE---- fit <- survfit(Surv(time, event, type="mstate") ~ 1, data=dtSurv) survminer::ggcompetingrisks(fit, ggtheme = ggtheme("grey94")) + ggplot2::scale_fill_manual(values = cbbPalette) @@ -132,7 +135,7 @@ dtSurv <- genData(101, d1) dtSurv <- genSurv(dtSurv, dS, timeName = "time", censorName = "censor") dtSurv -## ---- tidy = TRUE------------------------------------------------------------- +## ----tidy = 
TRUE-------------------------------------------------------------- def <- defData(varname = "x", formula = .4, dist="binary") defS <- defSurv(varname = "death", formula = "-14.6 - 0.7*x", shape = .35) @@ -143,7 +146,7 @@ dd <- genSurv(dd, defS, digits = 2, timeName = "time", censorName = "censor") fit <- survfit( Surv(time, event) ~ x, data = dd ) -## ---- tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 3.5, warning=FALSE---- +## ----tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 3.5, warning=FALSE---- survminer::ggsurvplot(fit, data = dd, ggtheme = ggtheme("grey94"), palette = cbbPalette @@ -152,13 +155,13 @@ survminer::ggsurvplot(fit, data = dd, ## ----------------------------------------------------------------------------- coxfit <- coxph(formula = Surv(time, event) ~ x, data = dd) -## ---- echo=FALSE-------------------------------------------------------------- +## ----echo=FALSE--------------------------------------------------------------- gtsummary::tbl_regression(coxfit) ## ----------------------------------------------------------------------------- cox.zph(coxfit) -## ---- tidy = TRUE------------------------------------------------------------- +## ----tidy = TRUE-------------------------------------------------------------- def <- defData(varname = "x", formula = .4, dist="binary") defS <- defSurv(varname = "death", formula = "-14.6 - 1.3*x", shape = .35, transition = 0) @@ -170,7 +173,7 @@ dd <- genSurv(dd, defS, digits = 2, timeName = "time", censorName = "censor") fit <- survfit( Surv(time, event) ~ x, data = dd ) -## ---- tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 3.5, warning=FALSE---- +## ----tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 3.5, warning=FALSE---- survminer::ggsurvplot(fit, data = dd, ggtheme = ggtheme("grey94"), palette = cbbPalette @@ -179,7 +182,7 @@ survminer::ggsurvplot(fit, data = dd, ## ----------------------------------------------------------------------------- coxfit <- survival::coxph(formula = Surv(time, event) ~ x, data = dd) -## ---- echo=FALSE-------------------------------------------------------------- +## ----echo=FALSE--------------------------------------------------------------- gtsummary::tbl_regression(coxfit) ## ----------------------------------------------------------------------------- @@ -191,7 +194,7 @@ dd2 <- survSplit(Surv(time, event) ~ ., data= dd, cut=c(150), coxfit2 <- survival::coxph(Surv(tstart, time, event) ~ x:strata(tgroup), data=dd2) -## ---- echo=FALSE-------------------------------------------------------------- +## ----echo=FALSE--------------------------------------------------------------- gtsummary::tbl_regression(coxfit2) ## ----------------------------------------------------------------------------- @@ -202,7 +205,7 @@ points <- list(c(100, 0.80), c(200, 0.10)) r <- survGetParams(points) r -## ---- tidy = TRUE, fig.width = 6.5, fig.height = 3.5, warning=FALSE----------- +## ----tidy = TRUE, fig.width = 6.5, fig.height = 3.5, warning=FALSE------------ survParamPlot(f = r[1], shape = r[2], points) ## ----------------------------------------------------------------------------- @@ -213,7 +216,7 @@ defS <- defSurv(defS, varname = "censor", formula = 0, scale = exp(18.5), shape dd <- genData(500) dd <- genSurv(dd, defS, timeName = "time", censorName = "censor") -## ---- tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 3.5, warning=FALSE---- +## ----tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 3.5, warning=FALSE---- fit <- survfit( Surv(time, event) ~ 
1, data = dd ) survminer::ggsurvplot(fit, data = dd, diff --git a/inst/doc/survival.Rmd b/inst/doc/survival.Rmd index c59f1b6..25b48ab 100644 --- a/inst/doc/survival.Rmd +++ b/inst/doc/survival.Rmd @@ -7,6 +7,9 @@ vignette: > \usepackage[utf8]{inputenc} --- +```{r chunkname, echo=-1} +data.table::setDTthreads(2) +``` ```{r, echo = FALSE, message = FALSE} diff --git a/inst/doc/survival.html b/inst/doc/survival.html index 43d4e27..01b8caa 100644 --- a/inst/doc/survival.html +++ b/inst/doc/survival.html @@ -382,31 +382,31 @@

Generating standard survival data with censoring

Here is an example showing how to generate data with covariates. In this case the scale and shape parameters will vary by group membership.

-
# Baseline data definitions
-
-def <- defData(varname = "x1", formula = 0.5, dist = "binary")
-def <- defData(def, varname = "grp", formula = 0.5, dist = "binary")
-
-# Survival data definitions
-
-set.seed(282716)
-
-sdef <- defSurv(varname = "survTime", formula = "1.5*x1", scale = "grp*50 + (1-grp)*25",
-    shape = "grp*1 + (1-grp)*1.5")
-sdef <- defSurv(sdef, varname = "censorTime", scale = 80, shape = 1)
-
-sdef
+
# Baseline data definitions
+
+def <- defData(varname = "x1", formula = 0.5, dist = "binary")
+def <- defData(def, varname = "grp", formula = 0.5, dist = "binary")
+
+# Survival data definitions
+
+set.seed(282716)
+
+sdef <- defSurv(varname = "survTime", formula = "1.5*x1", scale = "grp*50 + (1-grp)*25",
+    shape = "grp*1 + (1-grp)*1.5")
+sdef <- defSurv(sdef, varname = "censorTime", scale = 80, shape = 1)
+
+sdef
##       varname formula               scale               shape transition
 ## 1: censorTime       0                  80                   1          0
 ## 2:   survTime  1.5*x1 grp*50 + (1-grp)*25 grp*1 + (1-grp)*1.5          0

The data are generated with calls to genData and genSurv:

-
# Baseline data definitions
-
-dtSurv <- genData(300, def)
-dtSurv <- genSurv(dtSurv, sdef)
-
-head(dtSurv)
+
# Baseline data definitions
+
+dtSurv <- genData(300, def)
+dtSurv <- genSurv(dtSurv, sdef)
+
+head(dtSurv)
##    id x1 grp censorTime survTime
 ## 1:  1  0   0       42.9     9.88
 ## 2:  2  0   1       77.2    17.34
@@ -414,9 +414,9 @@ 

Generating standard survival data with censoring

## 4: 4 1 1 181.9 16.47 ## 5: 5 1 0 210.9 108.28 ## 6: 6 0 1 34.1 8.12
-
# A comparison of survival by group and x1
-
-dtSurv[, round(mean(survTime), 1), keyby = .(grp, x1)]
+
# A comparison of survival by group and x1
+
+dtSurv[, round(mean(survTime), 1), keyby = .(grp, x1)]
##    grp x1    V1
 ## 1:   0  0 149.2
 ## 2:   0  1  23.4
@@ -425,11 +425,11 @@ 

Generating standard survival data with censoring

Observed survival times and censoring indicators can be generated using the competing risk functionality and specifying a censoring variable:

-
dtSurv <- genData(300, def)
-dtSurv <- genSurv(dtSurv, sdef, timeName = "obsTime", censorName = "censorTime",
-    eventName = "status", keepEvents = TRUE)
-
-head(dtSurv)
+
dtSurv <- genData(300, def)
+dtSurv <- genSurv(dtSurv, sdef, timeName = "obsTime", censorName = "censorTime",
+    eventName = "status", keepEvents = TRUE)
+
+head(dtSurv)
##    id x1 grp censorTime survTime obsTime status       type
 ## 1:  1  0   1       92.4   49.071  49.071      1   survTime
 ## 2:  2  1   0       45.8   25.639  25.639      1   survTime
@@ -437,9 +437,9 @@ 

Generating standard survival data with censoring

## 4: 4 0 0 12.7 13.325 12.663 0 censorTime ## 5: 5 0 0 26.5 323.764 26.531 0 censorTime ## 6: 6 1 0 23.5 0.157 0.157 1 survTime
-
# estimate proportion of censoring by x1 and group
-
-dtSurv[, round(1 - mean(status), 2), keyby = .(grp, x1)]
+
# estimate proportion of censoring by x1 and group
+
+dtSurv[, round(1 - mean(status), 2), keyby = .(grp, x1)]
##    grp x1   V1
 ## 1:   0  0 0.71
 ## 2:   0  1 0.10
@@ -449,21 +449,21 @@ 

Generating standard survival data with censoring

Here is a survival analysis (using a Cox proportional hazard model) of a slightly simplified data set with two baseline covariates only:

-
# Baseline data definitions
-
-def <- defData(varname = "x1", formula = 0.5, dist = "binary")
-def <- defData(def, varname = "x2", formula = 0.5, dist = "binary")
-
-# Survival data definitions
-
-sdef <- defSurv(varname = "survTime", formula = "1.5*x1 - .8*x2", scale = 50, shape = 1/2)
-sdef <- defSurv(sdef, varname = "censorTime", scale = 80, shape = 1)
-
-dtSurv <- genData(300, def)
-dtSurv <- genSurv(dtSurv, sdef, timeName = "obsTime", censorName = "censorTime",
-    eventName = "status")
-
-coxfit <- survival::coxph(Surv(obsTime, status) ~ x1 + x2, data = dtSurv)
+
# Baseline data definitions
+
+def <- defData(varname = "x1", formula = 0.5, dist = "binary")
+def <- defData(def, varname = "x2", formula = 0.5, dist = "binary")
+
+# Survival data definitions
+
+sdef <- defSurv(varname = "survTime", formula = "1.5*x1 - .8*x2", scale = 50, shape = 1/2)
+sdef <- defSurv(sdef, varname = "censorTime", scale = 80, shape = 1)
+
+dtSurv <- genData(300, def)
+dtSurv <- genSurv(dtSurv, sdef, timeName = "obsTime", censorName = "censorTime",
+    eventName = "status")
+
+coxfit <- survival::coxph(Surv(obsTime, status) ~ x1 + x2, data = dtSurv)

The 95% confidence intervals of the parameter estimates include the values used to generate the data:
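(The table below is rendered by an echo=FALSE call to gtsummary::tbl_regression(coxfit), as shown in the extracted survival.R above; the same check can be made directly at the console, a sketch:)

confint(coxfit)  # log hazard ratio CIs; the generating values were 1.5 and -0.8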

@@ -899,64 +899,64 @@

Competing risks

addCompRisk that will generate the competing risk information using an existing data set. The example here will take that approach.

-
d1 <- defData(varname = "x1", formula = .5, dist = "binary")
-d1 <- defData(d1, "x2", .5, dist = "binary")
-
-dS <- defSurv(varname = "event_1", formula = "-10 - 0.6*x1 + 0.4*x2", shape = 0.3)
-dS <- defSurv(dS, "event_2", "-6.5 + 0.3*x1 - 0.5*x2", shape = 0.5)
-dS <- defSurv(dS, "censor", "-7", shape = 0.55)
-
-dtSurv <- genData(1001, d1)
-dtSurv <- genSurv(dtSurv, dS)
-
-dtSurv
+
d1 <- defData(varname = "x1", formula = .5, dist = "binary")
+d1 <- defData(d1, "x2", .5, dist = "binary")
+
+dS <- defSurv(varname = "event_1", formula = "-10 - 0.6*x1 + 0.4*x2", shape = 0.3)
+dS <- defSurv(dS, "event_2", "-6.5 + 0.3*x1 - 0.5*x2", shape = 0.5)
+dS <- defSurv(dS, "censor", "-7", shape = 0.55)
+
+dtSurv <- genData(1001, d1)
+dtSurv <- genSurv(dtSurv, dS)
+
+dtSurv
##         id x1 x2 censor event_1 event_2
-##    1:    1  0  0  81.38   32.80   21.85
-##    2:    2  0  0  44.71   11.22   28.56
-##    3:    3  0  1   5.61   22.91   52.99
-##    4:    4  1  1  14.85   22.18    4.72
-##    5:    5  0  0  43.83   13.58   36.07
+##    1:    1  0  1  39.66    17.8   23.62
+##    2:    2  1  0 101.14    24.1   32.41
+##    3:    3  1  0  13.18    40.5   36.50
+##    4:    4  0  0  26.97    18.1   45.12
+##    5:    5  0  1  15.32    14.2   37.96
 ##   ---                                  
-##  997:  997  1  1  14.61   14.76   26.14
-##  998:  998  1  1  30.19    6.42   16.13
-##  999:  999  1  0   7.54   26.09   20.33
-## 1000: 1000  1  0  19.50   27.08    6.43
-## 1001: 1001  1  0  31.74   11.95   22.59
-
dtSurv <- addCompRisk(dtSurv, events = c("event_1", "event_2", "censor"), 
-            timeName = "time", censorName = "censor")
-dtSurv
+## 997: 997 0 0 7.04 16.5 28.58
+## 998: 998 0 0 25.29 19.8 51.29
+## 999: 999 0 0 45.01 14.9 29.27
+## 1000: 1000 1 0 11.11 15.4 7.78
+## 1001: 1001 1 0 69.18 24.3 18.32
+
dtSurv <- addCompRisk(dtSurv, events = c("event_1", "event_2", "censor"), 
+            timeName = "time", censorName = "censor")
+dtSurv
##         id x1 x2  time event    type
-##    1:    1  0  0 21.85     2 event_2
-##    2:    2  0  0 11.22     1 event_1
-##    3:    3  0  1  5.61     0  censor
-##    4:    4  1  1  4.72     2 event_2
-##    5:    5  0  0 13.58     1 event_1
+##    1:    1  0  1 17.78     1 event_1
+##    2:    2  1  0 24.09     1 event_1
+##    3:    3  1  0 13.18     0  censor
+##    4:    4  0  0 18.08     1 event_1
+##    5:    5  0  1 14.17     1 event_1
 ##   ---                               
-##  997:  997  1  1 14.61     0  censor
-##  998:  998  1  1  6.42     1 event_1
-##  999:  999  1  0  7.54     0  censor
-## 1000: 1000  1  0  6.43     2 event_2
-## 1001: 1001  1  0 11.95     1 event_1
+## 997: 997 0 0 7.04 0 censor
+## 998: 998 0 0 19.84 1 event_1
+## 999: 999 0 0 14.89 1 event_1
+## 1000: 1000 1 0 7.78 2 event_2
+## 1001: 1001 1 0 18.32 2 event_2
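(In this output, event is coded 0 for a censored observation and 1 or 2 for the competing events, with type carrying the matching label.)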

The competing risk data can be plotted using the cumulative incidence functions (rather than the survival curves):

-

+

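(The figure is produced by an echo=FALSE chunk; per the extracted survival.R above, the underlying code is:)

fit <- survfit(Surv(time, event, type = "mstate") ~ 1, data = dtSurv)
survminer::ggcompetingrisks(fit, ggtheme = ggtheme("grey94")) +
  ggplot2::scale_fill_manual(values = cbbPalette)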
The data generation can all be done in two (instead of three) steps:

-
dtSurv <- genData(101, d1)
-dtSurv <- genSurv(dtSurv, dS, timeName = "time", censorName = "censor")
-dtSurv
-
##       id x1 x2  time event    type
-##   1:   1  0  1  4.83     1 event_1
-##   2:   2  0  1 22.42     1 event_1
-##   3:   3  1  0  9.16     1 event_1
-##   4:   4  1  0 25.13     1 event_1
-##   5:   5  1  0 10.51     1 event_1
-##  ---                              
-##  97:  97  0  0 13.88     2 event_2
-##  98:  98  0  1 19.53     1 event_1
-##  99:  99  0  0 15.10     0  censor
-## 100: 100  0  1 10.99     2 event_2
-## 101: 101  1  1 13.08     1 event_1
+
dtSurv <- genData(101, d1)
+dtSurv <- genSurv(dtSurv, dS, timeName = "time", censorName = "censor")
+dtSurv
+
##       id x1 x2 time event    type
+##   1:   1  0  1 11.9     1 event_1
+##   2:   2  0  0 20.4     1 event_1
+##   3:   3  1  1 14.5     2 event_2
+##   4:   4  1  0 12.2     2 event_2
+##   5:   5  1  1 16.6     1 event_1
+##  ---                             
+##  97:  97  1  1 17.9     2 event_2
+##  98:  98  1  0 15.8     2 event_2
+##  99:  99  1  1 20.2     1 event_1
+## 100: 100  0  0 20.2     1 event_1
+## 101: 101  1  0 14.9     1 event_1

Introducing non-proportional hazards

@@ -985,33 +985,33 @@

Introducing non-proportional hazards

Constant/proportional hazard ratio

To start, here is an example assuming a constant log hazard ratio of -0.7:

-
def <- defData(varname = "x", formula = 0.4, dist = "binary")
-
-defS <- defSurv(varname = "death", formula = "-14.6 - 0.7*x", shape = 0.35)
-defS <- defSurv(defS, varname = "censor", scale = exp(13), shape = 0.5)
-
-dd <- genData(500, def)
-dd <- genSurv(dd, defS, digits = 2, timeName = "time", censorName = "censor")
-
-fit <- survfit(Surv(time, event) ~ x, data = dd)
-

+
def <- defData(varname = "x", formula = 0.4, dist = "binary")
+
+defS <- defSurv(varname = "death", formula = "-14.6 - 0.7*x", shape = 0.35)
+defS <- defSurv(defS, varname = "censor", scale = exp(13), shape = 0.5)
+
+dd <- genData(500, def)
+dd <- genSurv(dd, defS, digits = 2, timeName = "time", censorName = "censor")
+
+fit <- survfit(Surv(time, event) ~ x, data = dd)
+

The Cox proportional hazards model recovers the correct log hazard ratio:

-
coxfit <- coxph(formula = Surv(time, event) ~ x, data = dd)
-
-
@@ -1399,8 +1399,8 @@

Introducing non-proportional hazards

x
--0.72
--0.92, -0.52
+-0.69
+-0.88, -0.50
<0.001
@@ -1416,49 +1416,49 @@

Introducing non-proportional hazards

0.05\), then we would conclude that the assumption of proportional hazards is not warranted. In this case \(p = 0.11\), so the model is apparently reasonable:

-
cox.zph(coxfit)
+
cox.zph(coxfit)
##        chisq df    p
-## x       1.51  1 0.22
-## GLOBAL  1.51  1 0.22
+## x 2.61 1 0.11
+## GLOBAL 2.61 1 0.11


Non-constant/non-proportional hazard ratio

In this next case, the risk of death when \(x=1\) is lower at all time points compared to when \(x=0\), but the relative risk (or hazard ratio) changes at 150 days:

-
def <- defData(varname = "x", formula = 0.4, dist = "binary")
-
-defS <- defSurv(varname = "death", formula = "-14.6 - 1.3*x", shape = 0.35, transition = 0)
-defS <- defSurv(defS, varname = "death", formula = "-14.6 - 0.4*x", shape = 0.35,
-    transition = 150)
-defS <- defSurv(defS, varname = "censor", scale = exp(13), shape = 0.5)
-
-dd <- genData(500, def)
-dd <- genSurv(dd, defS, digits = 2, timeName = "time", censorName = "censor")
-
-fit <- survfit(Surv(time, event) ~ x, data = dd)
+
def <- defData(varname = "x", formula = 0.4, dist = "binary")
+
+defS <- defSurv(varname = "death", formula = "-14.6 - 1.3*x", shape = 0.35, transition = 0)
+defS <- defSurv(defS, varname = "death", formula = "-14.6 - 0.4*x", shape = 0.35,
+    transition = 150)
+defS <- defSurv(defS, varname = "censor", scale = exp(13), shape = 0.5)
+
+dd <- genData(500, def)
+dd <- genSurv(dd, defS, digits = 2, timeName = "time", censorName = "censor")
+
+fit <- survfit(Surv(time, event) ~ x, data = dd)

The survival curve for the sample with \(x=1\) has a slightly different shape under this data generation process compared to the previous example under a constant hazard ratio assumption; there is more separation early on (prior to day 150), and then the gap is closed at a quicker rate.

-

+

If we ignore the possibility that there might be a different relationship over time, the Cox proportional hazards model gives an estimate of the log hazard ratio quite close to -0.70:

-
coxfit <- survival::coxph(formula = Surv(time, event) ~ x, data = dd)
-
-
@@ -1846,8 +1846,8 @@

Introducing non-proportional hazards

x
--0.71
--0.90, -0.52
+-0.72
+-0.91, -0.52
<0.001
@@ -1861,33 +1861,33 @@

Introducing non-proportional hazards

However, further inspection of the proportionality assumption should make us question the appropriateness of the model. Since \(p<0.05\), we would be wise to see if we can improve on the model.

-
cox.zph(coxfit)
+
cox.zph(coxfit)
##        chisq df      p
-## x        7.4  1 0.0065
-## GLOBAL   7.4  1 0.0065
+## x 9.26 1 0.0023
+## GLOBAL 9.26 1 0.0023

We might be able to see from the plot where proportionality diverges, in which case we can split the data set into two parts at the identified time point. (In many cases, the transition point or points won’t be so obvious, in which case the investigation might be more involved.) By splitting the data at day 150, we get the desired estimates:

-
dd2 <- survSplit(Surv(time, event) ~ ., data= dd, cut=c(150),
-                 episode= "tgroup", id="newid")
-
-coxfit2 <- survival::coxph(Surv(tstart, time, event) ~ x:strata(tgroup), data=dd2)
-
-
@@ -2275,17 +2275,17 @@

Introducing non-proportional hazards

x * strata(tgroup)
-
-
-
+
+
+
    x * tgroup=1
-1.3
--1.6, -0.93
+-1.7, -0.94
<0.001
    x * tgroup=2
--0.38
--0.62, -0.13
-0.003
+-0.40
+-0.65, -0.16
+0.001
@@ -2297,10 +2297,10 @@

Introducing non-proportional hazards

And the diagnostic test of proportionality confirms the appropriateness of the model:

-
cox.zph(coxfit2)
-
##                  chisq df   p
-## x:strata(tgroup)  2.44  2 0.3
-## GLOBAL            2.44  2 0.3
+
cox.zph(coxfit2)
+
##                  chisq df    p
+## x:strata(tgroup)  0.57  2 0.75
+## GLOBAL            0.57  2 0.75

Generating parameters for survival distribution

@@ -2318,20 +2318,20 @@

Generating parameters for survival distribution

example, if we would like to find the parameters for a distribution where 80% survive until day 100, and 10% survive until day 200 (any number of points may be provided):

-
points <- list(c(100, 0.80), c(200, 0.10))
-r <- survGetParams(points)
-r
+
points <- list(c(100, 0.80), c(200, 0.10))
+r <- survGetParams(points)
+r
## [1] -17.007   0.297
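(These parameters appear consistent with a Weibull survival function of the form S(t) = exp(-exp(f) * t^(1/shape)); a quick check, a sketch not in the vignette:)

f <- r[1]; shape <- r[2]
exp(-exp(f) * c(100, 200)^(1 / shape))  # approximately 0.80 and 0.10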

We can visualize the curve that is defined by these parameters:

-
survParamPlot(f = r[1], shape = r[2], points)
+
survParamPlot(f = r[1], shape = r[2], points)

And we can generate data based on these parameters:

-
defS <- defSurv(varname = "death", formula = -17, scale = 1, shape = 0.3)
-defS <- defSurv(defS, varname = "censor", formula = 0, scale = exp(18.5), shape = 0.3)
-
-dd <- genData(500)
-dd <- genSurv(dd, defS, timeName = "time", censorName = "censor")
-

+
defS <- defSurv(varname = "death", formula = -17, scale = 1, shape = 0.3)
+defS <- defSurv(defS, varname = "censor", formula = 0, scale = exp(18.5), shape = 0.3)
+
+dd <- genData(500)
+dd <- genSurv(dd, defS, timeName = "time", censorName = "censor")
+

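(A quick check that the generated data hit the targets, a sketch not in the vignette:)

fit <- survfit(Surv(time, event) ~ 1, data = dd)
summary(fit, times = c(100, 200))  # survival near 0.80 at day 100 and 0.10 at day 200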
diff --git a/inst/doc/treat_and_exposure.R b/inst/doc/treat_and_exposure.R index b22ba46..db7ee6e 100644 --- a/inst/doc/treat_and_exposure.R +++ b/inst/doc/treat_and_exposure.R @@ -1,4 +1,7 @@ -## ---- echo = FALSE, message=FALSE--------------------------------------------- +## ----chunkname, echo=-1------------------------------------------------------- +data.table::setDTthreads(2) + +## ----echo = FALSE, message=FALSE---------------------------------------------- library(simstudy) library(ggplot2) @@ -66,18 +69,18 @@ def <- defData(def, varname = "baseDBP", dist = "normal", dtstudy <- genData(330, def) -## ---- tidy = TRUE------------------------------------------------------------- +## ----tidy = TRUE-------------------------------------------------------------- study1 <- trtAssign(dtstudy , n=3, balanced = TRUE, strata = c("male","over65"), grpName = "rxGrp") study1 -## ---- tidy = TRUE------------------------------------------------------------- +## ----tidy = TRUE-------------------------------------------------------------- study2 <- trtAssign(dtstudy , n=3, balanced = TRUE, grpName = "rxGrp") -## ---- tidy = TRUE------------------------------------------------------------- +## ----tidy = TRUE-------------------------------------------------------------- study3 <- trtAssign(dtstudy , n=3, balanced = FALSE, grpName = "rxGrp") -## ---- tidy = TRUE, echo = FALSE, fig.width = 4, fig.height = 6---------------- +## ----tidy = TRUE, echo = FALSE, fig.width = 4, fig.height = 6----------------- p1 <- splotfunc(study1, "Balanced within strata") p1a <- aplotfunc(study1, "") @@ -105,11 +108,11 @@ dtstudy ## ----------------------------------------------------------------------------- dtstudy[, .(n = .N, avg = round(mean(y), 1)), keyby = .(male, over65, rx)] -## ---- tidy = TRUE------------------------------------------------------------- +## ----tidy = TRUE-------------------------------------------------------------- formula1 <- c("-2 + 2*male - .5*over65", "-1 + 2*male + .5*over65") dtExp <- trtObserve(dtstudy, formulas = formula1, logit.link = TRUE, grpName = "exposure") -## ---- tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 2.5------------ +## ----tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 2.5------------- dtplot1 <- dtExp[,.N,keyby=.(male,exposure)] p1 <- ggplot(data = dtplot1, aes(x=factor(male), y=N)) + geom_bar(aes(fill=factor(exposure)), alpha = .8, stat="identity", position = "dodge") + @@ -137,12 +140,12 @@ p2 <- ggplot(data = dtplot2, aes(x=factor(over65), y=N)) + grid.arrange(p1,p2,nrow=1) -## ---- tidy = TRUE------------------------------------------------------------- +## ----tidy = TRUE-------------------------------------------------------------- formula2 <- c(.35, .45) dtExp2 <- trtObserve(dtstudy, formulas = formula2, logit.link = FALSE, grpName = "exposure") -## ---- tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 2.5------------ +## ----tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 2.5------------- dtplot1a <- dtExp2[,.N,keyby=.(male,exposure)] p1a <- ggplot(data = dtplot1a, aes(x=factor(male), y=N)) + geom_bar(aes(fill=factor(exposure)), alpha = .8, stat="identity", position = "dodge") + diff --git a/inst/doc/treat_and_exposure.Rmd b/inst/doc/treat_and_exposure.Rmd index 3420bf9..f19b3e0 100644 --- a/inst/doc/treat_and_exposure.Rmd +++ b/inst/doc/treat_and_exposure.Rmd @@ -7,6 +7,10 @@ vignette: > \usepackage[utf8]{inputenc} --- +```{r chunkname, echo=-1} +data.table::setDTthreads(2) +``` + ```{r, echo = FALSE, message=FALSE} 
library(simstudy) diff --git a/inst/doc/treat_and_exposure.html b/inst/doc/treat_and_exposure.html index 3b7efb6..0fb95e1 100644 --- a/inst/doc/treat_and_exposure.html +++ b/inst/doc/treat_and_exposure.html @@ -355,37 +355,37 @@

Assigned treatment

across treatment groups is not guaranteed, particularly with small sample sizes.

First, create the data definition:

-
def <- defData(varname = "male", dist = "binary", 
-               formula = .5 , id="cid")
-def <- defData(def, varname = "over65", dist = "binary", 
-               formula = "-1.7 + .8*male", link="logit")
-def <- defData(def, varname = "baseDBP", dist = "normal", 
-               formula = 70, variance = 40)
-
-dtstudy <- genData(330, def)
+
def <- defData(varname = "male", dist = "binary", 
+               formula = .5 , id="cid")
+def <- defData(def, varname = "over65", dist = "binary", 
+               formula = "-1.7 + .8*male", link="logit")
+def <- defData(def, varname = "baseDBP", dist = "normal", 
+               formula = 70, variance = 40)
+
+dtstudy <- genData(330, def)

Balanced treatment assignment, stratified by gender and age category (not blood pressure)

-
study1 <- trtAssign(dtstudy, n = 3, balanced = TRUE, strata = c("male", "over65"),
-    grpName = "rxGrp")
-
-study1
+
study1 <- trtAssign(dtstudy, n = 3, balanced = TRUE, strata = c("male", "over65"),
+    grpName = "rxGrp")
+
+study1
##      cid male over65 baseDBP rxGrp
-##   1:   1    1      0    74.6     1
-##   2:   2    0      0    66.9     2
-##   3:   3    1      1    64.8     2
-##   4:   4    0      0    63.8     2
-##   5:   5    1      0    55.2     3
+##   1:   1    1      1    64.9     2
+##   2:   2    1      0    67.0     2
+##   3:   3    0      1    75.0     3
+##   4:   4    0      0    70.9     2
+##   5:   5    1      0    68.6     1
 ##  ---                              
-## 326: 326    0      0    68.5     3
-## 327: 327    0      0    57.5     3
-## 328: 328    0      0    70.8     3
-## 329: 329    1      0    69.4     2
-## 330: 330    0      0    67.9     2
+## 326: 326 1 0 71.0 3
+## 327: 327 0 0 64.1 2
+## 328: 328 1 0 65.4 3
+## 329: 329 1 0 69.1 3
+## 330: 330 0 0 75.9 2

Balanced treatment assignment (without stratification)

-
study2 <- trtAssign(dtstudy, n = 3, balanced = TRUE, grpName = "rxGrp")
+
study2 <- trtAssign(dtstudy, n = 3, balanced = TRUE, grpName = "rxGrp")

Random (unbalanced) treatment assignment

-
study3 <- trtAssign(dtstudy, n = 3, balanced = FALSE, grpName = "rxGrp")
-

Comparison of three treatment assignment mechanisms

+
study3 <- trtAssign(dtstudy, n = 3, balanced = FALSE, grpName = "rxGrp")
+

Comparison of three treatment assignment mechanisms
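(The comparison figure is drawn by an echo=FALSE chunk; a quick text version of the same idea, a sketch, tabulates arm sizes under balanced and unbalanced assignment:)

study1[, .N, keyby = rxGrp]  # arms of essentially equal size by design
study3[, .N, keyby = rxGrp]  # arm sizes vary by chance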

Assigned treatment using trtAssign distribution in
@@ -395,41 +395,41 @@

Assigned treatment using trtAssign distribution in
randomization is stratified by gender and age, and the outcome y is affected by both of these factors as well as the treatment assignment variable rx.

-
def <- defData(varname = "male", dist = "binary", 
-               formula = .5 , id="cid")
-def <- defData(def, varname = "over65", dist = "binary",  
-               formula = "-1.7 + .8*male", link="logit")
-def <- defData(def, varname = "rx", dist = "trtAssign",
-               formula = "1;1", variance = "male;over65")
-def <- defData(def, varname = "y", dist = "normal", 
-               formula = "20 + 5*male + 10*over65 + 10*rx", variance = 40)
-
-dtstudy <- genData(330, def)
-dtstudy
+
def <- defData(varname = "male", dist = "binary", 
+               formula = .5 , id="cid")
+def <- defData(def, varname = "over65", dist = "binary",  
+               formula = "-1.7 + .8*male", link="logit")
+def <- defData(def, varname = "rx", dist = "trtAssign",
+               formula = "1;1", variance = "male;over65")
+def <- defData(def, varname = "y", dist = "normal", 
+               formula = "20 + 5*male + 10*over65 + 10*rx", variance = 40)
+
+dtstudy <- genData(330, def)
+dtstudy
##      cid male over65 rx    y
-##   1:   1    1      0  0 38.0
-##   2:   2    1      1  0 32.9
-##   3:   3    0      1  0 38.7
-##   4:   4    0      0  1 14.4
-##   5:   5    0      0  1 30.7
+##   1:   1    1      0  0 26.9
+##   2:   2    1      0  1 36.6
+##   3:   3    1      0  1 44.6
+##   4:   4    1      0  1 37.2
+##   5:   5    1      0  0 31.2
 ##  ---                        
-## 326: 326    1      1  0 31.2
-## 327: 327    1      1  0 36.8
-## 328: 328    1      0  1 38.6
-## 329: 329    1      0  1 33.1
-## 330: 330    0      1  0 21.5
+## 326: 326 0 0 0 28.4
+## 327: 327 0 0 0 25.1
+## 328: 328 1 0 0 29.8
+## 329: 329 0 0 1 28.2
+## 330: 330 1 0 0 22.3

Here are the counts and average outcomes for each gender, age, and treatment combination:

-
dtstudy[, .(n = .N, avg = round(mean(y), 1)), keyby = .(male, over65, rx)]
+
dtstudy[, .(n = .N, avg = round(mean(y), 1)), keyby = .(male, over65, rx)]
##    male over65 rx  n  avg
-## 1:    0      0  0 65 21.0
-## 2:    0      0  1 66 29.0
-## 3:    0      1  0 11 28.9
-## 4:    0      1  1 12 39.1
-## 5:    1      0  0 59 23.8
-## 6:    1      0  1 59 34.9
-## 7:    1      1  0 29 34.1
-## 8:    1      1  1 29 46.6
+## 1: 0 0 0 70 19.6
+## 2: 0 0 1 70 30.2
+## 3: 0 1 0 12 29.2
+## 4: 0 1 1 11 42.4
+## 5: 1 0 0 59 24.2
+## 6: 1 0 1 58 35.1
+## 7: 1 1 0 25 35.3
+## 8: 1 1 1 25 46.0
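(As a check, the cell means track the generating formula 20 + 5*male + 10*over65 + 10*rx: for example, the male, over-65, treated cell has expectation 20 + 5 + 10 + 10 = 45, close to the observed 46.0.)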

Observed treatment

@@ -439,19 +439,19 @@

Observed treatment

exposure to a specific level can depend on covariates already in the data set. In this case, there are three exposure groups that vary by gender and age:

-
formula1 <- c("-2 + 2*male - .5*over65", "-1 + 2*male + .5*over65")
-dtExp <- trtObserve(dtstudy, formulas = formula1, logit.link = TRUE, grpName = "exposure")
+
formula1 <- c("-2 + 2*male - .5*over65", "-1 + 2*male + .5*over65")
+dtExp <- trtObserve(dtstudy, formulas = formula1, logit.link = TRUE, grpName = "exposure")

Here are the exposure distributions by gender and age:

-

+

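(The bar charts are drawn by an echo=FALSE chunk; the same distributions can be checked in text form, a sketch:)

dtExp[, .N, keyby = .(male, over65, exposure)]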
Here is a second case of three exposures where the exposure is independent of any covariates. Note that specifying the formula as c(.35, .45) is the same as specifying it as c(.35, .45, .20). Also, when referring to probabilities, the identity link is used:

-
formula2 <- c(0.35, 0.45)
-
-dtExp2 <- trtObserve(dtstudy, formulas = formula2, logit.link = FALSE, grpName = "exposure")
-

+
formula2 <- c(0.35, 0.45)
+
+dtExp2 <- trtObserve(dtstudy, formulas = formula2, logit.link = FALSE, grpName = "exposure")
+

Stepped-wedge design

@@ -474,16 +474,16 @@

Stepped-wedge design

We need to define the cluster-level variables (i.e. the cluster effect and the cluster size) as well as the individual-specific outcome. In this case, each cluster will have 15 individuals per period, and \(\sigma^2_b = 0.20\). In addition, \(\sigma^2_e = 1.75\).

-
library(simstudy)
-library(ggplot2)
-
-defc <- defData(varname = "ceffect", formula = 0, variance = 0.20, 
-                dist = "normal", id = "cluster")
-defc <- defData(defc, "m", formula = 15, dist = "nonrandom")
-
-defa <- defDataAdd(varname = "Y", 
-                   formula = "0 + ceffect + 0.1*period + trt*1.5", 
-                   variance = 1.75, dist = "normal")
+
library(simstudy)
+library(ggplot2)
+
+defc <- defData(varname = "ceffect", formula = 0, variance = 0.20, 
+                dist = "normal", id = "cluster")
+defc <- defData(defc, "m", formula = 15, dist = "nonrandom")
+
+defa <- defDataAdd(varname = "Y", 
+                   formula = "0 + ceffect + 0.1*period + trt*1.5", 
+                   variance = 1.75, dist = "normal")

In this case, there will be 30 clusters and 24 time periods. With 15 individuals per cluster per period, there will be 360 observations for each cluster, and 10,800 in total. (There is no reason the cluster sizes
@@ -504,17 +504,17 @@

Stepped-wedge design

8, 12, 16, and 20.

Once the treatment assignments are made, the individual records are created and the outcome data are generated in the last step.

-
set.seed(608477)
-
-dc <- genData(30, defc)
-dp <- addPeriods(dc, 24, "cluster", timevarName = "t")
-dp <- trtStepWedge(dp, "cluster", nWaves = 5, lenWaves = 4, 
-          startPer = 4, grpName = "trt")
-
-dd <- genCluster(dp, cLevelVar = "timeID", "m", "id")
-dd <- addColumns(defa, dd)
-
-dd
+
set.seed(608477)
+
+dc <- genData(30, defc)
+dp <- addPeriods(dc, 24, "cluster", timevarName = "t")
+dp <- trtStepWedge(dp, "cluster", nWaves = 5, lenWaves = 4, 
+          startPer = 4, grpName = "trt")
+
+dd <- genCluster(dp, cLevelVar = "timeID", "m", "id")
+dd <- addColumns(defa, dd)
+
+dd
##        cluster period ceffect  m timeID startTrt trt    id      Y
 ##     1:       1      0  0.6278 15      1        4   0     1  1.524
 ##     2:       1      0  0.6278 15      1        4   0     2  0.986
@@ -527,16 +527,16 @@ 

Stepped-wedge design

## 10798: 30 23 -0.0983 15 720 20 1 10798 4.118 ## 10799: 30 23 -0.0983 15 720 20 1 10799 4.569 ## 10800: 30 23 -0.0983 15 720 20 1 10800 3.656
-
dSum <- dd[, .(Y = mean(Y)), keyby = .(cluster, period, trt, startTrt)]
-
-ggplot(data = dSum, 
-    aes(x = period, y = Y, group = interaction(cluster, trt))) +
-  geom_line(aes(color = factor(trt))) +
-  facet_grid(factor(startTrt, labels = c(1 : 5)) ~ .) +
-  scale_x_continuous(breaks = seq(0, 23, by = 4), name = "week") +
-  scale_color_manual(values = c("#b8cce4", "#4e81ba")) +
-  theme(panel.grid = element_blank(),
-        legend.position = "none") 
+
dSum <- dd[, .(Y = mean(Y)), keyby = .(cluster, period, trt, startTrt)]
+
+ggplot(data = dSum, 
+    aes(x = period, y = Y, group = interaction(cluster, trt))) +
+  geom_line(aes(color = factor(trt))) +
+  facet_grid(factor(startTrt, labels = c(1 : 5)) ~ .) +
+  scale_x_continuous(breaks = seq(0, 23, by = 4), name = "week") +
+  scale_color_manual(values = c("#b8cce4", "#4e81ba")) +
+  theme(panel.grid = element_blank(),
+        legend.position = "none") 

diff --git a/man/addCompRisk.Rd b/man/addCompRisk.Rd index 5c7f69a..219672c 100644 --- a/man/addCompRisk.Rd +++ b/man/addCompRisk.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/utility.R \name{addCompRisk} \alias{addCompRisk} -\title{Generating single competing risk survival varible} +\title{Generating single competing risk survival variable} \usage{ addCompRisk( dtName, @@ -46,7 +46,7 @@ to FALSE.} An updated data table } \description{ -Generating single competing risk survival varible +Generating single competing risk survival variable } \examples{ d1 <- defData(varname = "x1", formula = .5, dist = "binary") diff --git a/man/addCorGen.Rd b/man/addCorGen.Rd index 4b6d557..512823b 100644 --- a/man/addCorGen.Rd +++ b/man/addCorGen.Rd @@ -86,7 +86,6 @@ addCorGen( # Long example def <- defData(varname = "xbase", formula = 5, variance = .4, dist = "gamma", id = "cid") -def <- defData(def, "nperiods", formula = 3, dist = "noZeroPoisson") def2 <- defDataAdd( varname = "p", formula = "-3+.2*period + .3*xbase", @@ -98,6 +97,11 @@ dt <- genData(100, def) dtLong <- addPeriods(dt, idvars = "cid", nPeriods = 3) dtLong <- addColumns(def2, dtLong) +addCorGen( + dtOld = dtLong, idvar = "cid", nvars = NULL, rho = .7, corstr = "cs", + dist = "binary", param1 = "p" +) + } \references{ Emrich LJ, Piedmonte MR. A Method for Generating High-Dimensional diff --git a/man/genCorFlex.Rd b/man/genCorFlex.Rd index 0684b1d..6f04606 100644 --- a/man/genCorFlex.Rd +++ b/man/genCorFlex.Rd @@ -35,6 +35,7 @@ data.table with added column(s) of correlated data Create multivariate (correlated) data - for general distributions } \examples{ +\dontrun{ def <- defData(varname = "xNorm", formula = 0, variance = 4, dist = "normal") def <- defData(def, varname = "xGamma1", formula = 15, variance = 2, dist = "gamma") def <- defData(def, varname = "xBin", formula = 0.5, dist = "binary") @@ -52,4 +53,5 @@ cor(dt[, -"id"], method = "kendall") var(dt[, -"id"]) apply(dt[, -"id"], 2, mean) } +} \concept{correlated} diff --git a/man/logisticCoefs.Rd b/man/logisticCoefs.Rd new file mode 100644 index 0000000..ef031da --- /dev/null +++ b/man/logisticCoefs.Rd @@ -0,0 +1,86 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utility.R +\name{logisticCoefs} +\alias{logisticCoefs} +\title{Determine intercept, treatment/exposure and covariate coefficients that can +be used for binary data generation with a logit link and a set of covariates} +\usage{ +logisticCoefs( + defCovar, + coefs, + popPrev, + rr = NULL, + rd = NULL, + auc = NULL, + tolerance = 0.001, + sampleSize = 1e+05, + trtName = "A" +) +} +\arguments{ +\item{defCovar}{A definition table for the covariates in the underlying +population. This tables specifies the distribution of the covariates.} + +\item{coefs}{A vector of coefficients that reflect the relationship between +each of the covariates and the log-odds of the outcome.} + +\item{popPrev}{The target population prevalence of the outcome. +A value between 0 and 1.} + +\item{rr}{The target risk ratio, which must be a value between 0 and +1/popPrev. Defaults to NULL.} + +\item{rd}{The target risk difference, which must be between +-(popPrev) and (1 - popPrev). Defaults to NULL} + +\item{auc}{The target AUC, which must be a value between 0.5 and 1.0 . +Defaults to NULL.} + +\item{tolerance}{The minimum stopping distance between the adjusted low and high +endpoints. Defaults to 0.001.} + +\item{sampleSize}{The number of units to generate for the bisection algorithm. +The default is 1e+05. 
To get a reliable estimate, the value +should be no smaller than the default, though larger values can be used, though +computing time will increase.} + +\item{trtName}{If either a risk ratio or risk difference is the target statistic, +a treatment/exposure variable name can be provided. Defaults to "A".} +} +\value{ +A vector of parameters including the intercept and covariate +coefficients for the logistic model data generating process. +} +\description{ +This is an implementation of an iterative bisection procedure +that can be used to determine coefficient values for a target population +prevalence as well as a target risk ratio, risk difference, or AUC. These +coefficients can be used in a subsequent data generation process to simulate +data with these desire characteristics. +} +\details{ +If no specific target statistic is specified, then only the intercept +is returned along with the original coefficients. Only one target statistic (risk ratio, risk +difference or AUC) can be specified with a single function call; in all three cases, a target +prevalence is still required. +} +\examples{ +\dontrun{ +d1 <- defData(varname = "x1", formula = 0, variance = 1) +d1 <- defData(d1, varname = "b1", formula = 0.5, dist = "binary") + +coefs <- log(c(1.2, 0.8)) + +logisticCoefs(d1, coefs, popPrev = 0.20) +logisticCoefs(d1, coefs, popPrev = 0.20, rr = 1.50, trtName = "rx") +logisticCoefs(d1, coefs, popPrev = 0.20, rd = 0.30, trtName = "rx") +logisticCoefs(d1, coefs, popPrev = 0.20, auc = 0.80) +} +} +\references{ +Austin, Peter C. "The iterative bisection procedure: a useful +tool for determining parameter values in data-generating processes in +Monte Carlo simulations." BMC Medical Research Methodology 23, +no. 1 (2023): 1-10. +} +\concept{utility} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 9d0e480..945ca99 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -97,6 +97,46 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// getBeta0 +double getBeta0(NumericVector lvec, double popPrev, double tolerance); +RcppExport SEXP _simstudy_getBeta0(SEXP lvecSEXP, SEXP popPrevSEXP, SEXP toleranceSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< NumericVector >::type lvec(lvecSEXP); + Rcpp::traits::input_parameter< double >::type popPrev(popPrevSEXP); + Rcpp::traits::input_parameter< double >::type tolerance(toleranceSEXP); + rcpp_result_gen = Rcpp::wrap(getBeta0(lvec, popPrev, tolerance)); + return rcpp_result_gen; +END_RCPP +} +// estAUC +double estAUC(NumericMatrix dmatrix, NumericVector y); +RcppExport SEXP _simstudy_estAUC(SEXP dmatrixSEXP, SEXP ySEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< NumericMatrix >::type dmatrix(dmatrixSEXP); + Rcpp::traits::input_parameter< NumericVector >::type y(ySEXP); + rcpp_result_gen = Rcpp::wrap(estAUC(dmatrix, y)); + return rcpp_result_gen; +END_RCPP +} +// getBeta_auc +NumericVector getBeta_auc(NumericMatrix covmat, NumericVector coefs, double auc, double popPrev, double tolerance); +RcppExport SEXP _simstudy_getBeta_auc(SEXP covmatSEXP, SEXP coefsSEXP, SEXP aucSEXP, SEXP popPrevSEXP, SEXP toleranceSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< NumericMatrix >::type covmat(covmatSEXP); + Rcpp::traits::input_parameter< NumericVector >::type coefs(coefsSEXP); + Rcpp::traits::input_parameter< double >::type auc(aucSEXP); + 
Rcpp::traits::input_parameter< double >::type popPrev(popPrevSEXP); + Rcpp::traits::input_parameter< double >::type tolerance(toleranceSEXP); + rcpp_result_gen = Rcpp::wrap(getBeta_auc(covmat, coefs, auc, popPrev, tolerance)); + return rcpp_result_gen; +END_RCPP +} static const R_CallMethodDef CallEntries[] = { {"_simstudy_matMultinom", (DL_FUNC) &_simstudy_matMultinom, 1}, @@ -106,6 +146,9 @@ static const R_CallMethodDef CallEntries[] = { {"_simstudy_checkBoundsBin", (DL_FUNC) &_simstudy_checkBoundsBin, 3}, {"_simstudy_findRhoBin", (DL_FUNC) &_simstudy_findRhoBin, 3}, {"_simstudy_getRhoMat", (DL_FUNC) &_simstudy_getRhoMat, 3}, + {"_simstudy_getBeta0", (DL_FUNC) &_simstudy_getBeta0, 3}, + {"_simstudy_estAUC", (DL_FUNC) &_simstudy_estAUC, 2}, + {"_simstudy_getBeta_auc", (DL_FUNC) &_simstudy_getBeta_auc, 5}, {NULL, NULL, 0} }; diff --git a/src/srcRcpp.cpp b/src/srcRcpp.cpp index 44073c6..d234179 100644 --- a/src/srcRcpp.cpp +++ b/src/srcRcpp.cpp @@ -1,9 +1,9 @@ #include +#include +#include // [[Rcpp::depends(pbv)]] -#include -#include using namespace std; using namespace Rcpp; @@ -206,3 +206,169 @@ NumericMatrix getRhoMat(int N, NumericVector P, NumericMatrix TCORR) { } +// [[Rcpp::export]] +double getBeta0(NumericVector lvec, double popPrev, double tolerance) { + + double intLow = -10; + double intHigh = 10; + double B0; + double PREV; + NumericVector ps(lvec.length()); + NumericVector nvec(lvec.length()); + + while(abs(intHigh - intLow) > tolerance){ + + B0 = (intLow + intHigh)/2; + for (int i = 0; i < lvec.length(); i++) { + nvec(i) = lvec(i) + B0; + } + ps = Rcpp::plogis(nvec); + PREV = mean(ps); + + if (PREV < popPrev) { + intLow = B0; + } else { + intHigh = B0; + } + + } + + B0 = (intLow + intHigh)/2; + + return(B0); + +} + + + +// [[Rcpp::export]] +double estAUC(NumericMatrix dmatrix, NumericVector y) { + + int aucN = 1000000; + + int N0 = y.length() - sum(y); + int N1 = sum(y); + + NumericVector P1(N1); + NumericVector P0(N0); + + // Obtaining namespace of fastglm package + Environment pkg = Environment::namespace_env("fastglm"); + Function f = pkg["fastglm"]; + Function p = pkg["predict.fastglm"]; + + // model should be include "Named("family", "binomial")", but occasionally + // throws off warning. 
Using Gaussian works very slightly less well, but + // there are no warnings + + List fit = f(Named("x", dmatrix), Named("y", y)); + NumericVector pred = p(Named("object", fit), Named("newdata", dmatrix)); + + int i1 = 0; + int i0 = 0; + + for (int i = 0; i < y.length(); i++) { + if ( y(i) == 1) { + P1[i1] = pred(i); + i1++; + } else { + P0[i0] = pred(i); + i0++; + } + } + + NumericVector s1 = P1[floor(Rcpp::runif(aucN, 0, N1))]; + NumericVector s0 = P0[floor(Rcpp::runif(aucN, 0, N0))]; + + double n_greater = 0; + + for (int i = 0; i < aucN; i++) { + if (s1(i) > s0(i)) n_greater++; + } + + double AUC = n_greater/aucN; + + return(AUC); + +} + +// [[Rcpp::export]] +NumericVector getBeta_auc(NumericMatrix covmat, NumericVector coefs, double auc, + double popPrev, double tolerance) { + + Environment base("package:base"); + Function mat_Mult = base["%*%"]; + + int N = covmat.nrow(); + + double intLow = 0; + double intHigh = 40; + + double alpha; + double B0; + double aStat; + NumericVector ps(N); + NumericVector lvec(N); + NumericVector avec(coefs.length()); + NumericVector avecint(coefs.length() + 1); + + NumericVector onevec(N); + NumericVector y(N); + + NumericVector results(2); + + for (int i = 0; i < N; i++) { + onevec(i) = 1; + } + + NumericMatrix dmatrix = cbind(onevec, covmat); + + lvec = mat_Mult(covmat, coefs); + B0 = getBeta0(lvec, popPrev, tolerance); + avecint(0) = B0; + + aStat = 0; + + while(abs(aStat - auc) > tolerance) { + + alpha = (intLow + intHigh)/2; + + for (int i = 0; i < coefs.length(); i++) { + avecint(i+1) = alpha * coefs(i); + } + + lvec = mat_Mult(dmatrix, avecint); + ps = Rcpp::plogis(lvec); + + for (int i=0; i(rbinom(1, 1, ps(i))); + } + + aStat = estAUC(dmatrix, y); + + // Rcout << auc << " " << alpha << " " << aStat << "\n"; + + if (aStat < auc) { + intLow = alpha; + } else { + intHigh = alpha; + } + + } + + alpha = (intLow + intHigh)/2; + + for (int i = 0; i < coefs.length(); i++) { + avec(i) = alpha * coefs(i); + } + + lvec = mat_Mult(covmat, avec); + B0 = getBeta0(lvec, popPrev, tolerance); + + results(0) = B0; + results(1) = alpha; + + return(results); + +} + diff --git a/tests/testthat.R b/tests/testthat.R index db8aa28..eb06433 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -2,5 +2,5 @@ library(testthat) library(hedgehog) library(simstudy) - +data.table::setDTthreads(2) # added to solve CRAN issue test_check("simstudy") diff --git a/tests/testthat/test-add_data.R b/tests/testthat/test-add_data.R index d25af7d..c96b58a 100644 --- a/tests/testthat/test-add_data.R +++ b/tests/testthat/test-add_data.R @@ -1,11 +1,13 @@ # addCondition ---- test_that("addCondition throws errors.", { + skip_on_cran() expect_error(addCondition(), class = "simstudy::missingArgument") expect_error(addCondition("a"), class = "simstudy::missingArgument") expect_error(addCondition(data.frame(), data.frame(), "a"), class = "simstudy::wrongClass") }) test_that("addCondition works.", { + skip_on_cran() def <- defData(varname = "x", formula = "1;10", dist = "uniformInt") defC <- defCondition(condition = "x >= 5", formula = "x + 5", dist = "nonrandom") defC <- defCondition(defC, condition = "x < 5", formula = "10", dist = "nonrandom") @@ -20,12 +22,14 @@ test_that("addCondition works.", { # addColumns ---- test_that("addColumns throws errors.", { + skip_on_cran() expect_error(addColumns(), class = "simstudy::missingArgument") expect_error(addColumns("a"), class = "simstudy::missingArgument") expect_error(addColumns(data.frame(), data.frame()), class = "simstudy::wrongClass") }) 
test_that("addColumns works.", { + skip_on_cran() def <- defData(varname = "x", formula = "1;10", dist = "uniformInt") dt <- genData(100, def) def2 <- defDataAdd(varname = "y", formula = "2.3 * (1/x)", dist = "normal") @@ -34,6 +38,7 @@ test_that("addColumns works.", { }) test_that("defRepeatAdd works", { + skip_on_cran() expect_silent( defRepeatAdd(nVars = 4, prefix = "g", formula = "1/3;1/3;1/3", variance = 0, dist = "categorical") ) @@ -47,6 +52,7 @@ test_that("defRepeatAdd works", { }) test_that("defRepeatAdd throws errors correctly.", { + skip_on_cran() expect_error(defRepeatAdd(prefix = "b", formula = 5, variance = 3, dist = "normal"), class = "simstudy::missingArgument" ) @@ -60,6 +66,7 @@ test_that("defRepeatAdd throws errors correctly.", { # addMarkov ---- test_that("addMarkov throws errors.", { + skip_on_cran() d0 <- defData(varname = "xx", formula = 2) d0 <- defData(d0, varname = "xy", formula = 5) dd <- genData(n = 10, dt = d0) @@ -94,6 +101,7 @@ test_that("addMarkov throws errors.", { # addSynthetic ---- test_that("addSynthetic throws errors.", { + skip_on_cran() ### Create fake "real" data set @@ -125,6 +133,7 @@ test_that("addSynthetic throws errors.", { }) test_that("addSynthetic works.", { + skip_on_cran() ### Create fake 'external' data set 'A' diff --git a/tests/testthat/test-conditions.R b/tests/testthat/test-conditions.R index d969f76..91c1ca9 100644 --- a/tests/testthat/test-conditions.R +++ b/tests/testthat/test-conditions.R @@ -1,4 +1,5 @@ test_that("conditions have correct class.", { + skip_on_cran() expect_error(stop(condition(c("error", "custom_Error"), "This is a custom error")), class = c("error", "custom_Error") ) @@ -11,6 +12,7 @@ test_that("conditions have correct class.", { }) test_that("pluralization works.", { + skip_on_cran() expect_error(argMissingError("arg1"), "argument is missing", class = "simstudy::missingArgument" ) diff --git a/tests/testthat/test-define_data.R b/tests/testthat/test-define_data.R index 088dc71..717c819 100644 --- a/tests/testthat/test-define_data.R +++ b/tests/testthat/test-define_data.R @@ -5,6 +5,8 @@ freeze_eval <- names(.GlobalEnv) # defData ---- test_that("defData throws errors", { + skip_on_cran() + expect_error(defData(dist = "nonrandom", formula = 7, id = "idnum"), class = "simstudy::missingArgument") expect_error(defData(varname = "xnr", dist = "nonrandom", id = "idnum"), class = "simstudy::missingArgument") }) @@ -13,30 +15,32 @@ test_that("defData throws errors", { # .evalDef ---- -test_that("checks combine in .evalDef correctly", { - skip_on_cran() - - # this generates 20 previously defined varnames. - gen_defVars <- gen.and_then(gen.int(20), gen_varnames) - - gen_evalDef_call <- - gen.and_then(gen_defVars, function(defVars) { - generate(for (i in gen_dist) { - list( - newvar = defVars[1], - newform = get(reg[name == i]$formula)(defVars[-1]), - newdist = i, - variance = get(reg[name == i]$variance)(defVars[-1]), - link = get(reg[name == i]$link), - defVars = defVars[-1] - ) - }) - }) - - forall(gen_evalDef_call, function(args) expect_silent(do.call(.evalDef, args))) -}) +# test_that("checks combine in .evalDef correctly", { +# skip_on_cran() +# +# # this generates 20 previously defined varnames. 
+# gen_defVars <- gen.and_then(gen.int(20), gen_varnames) +# +# gen_evalDef_call <- +# gen.and_then(gen_defVars, function(defVars) { +# generate(for (i in gen_dist) { +# list( +# newvar = defVars[1], +# newform = get(reg[name == i]$formula)(defVars[-1]), +# newdist = i, +# variance = get(reg[name == i]$variance)(defVars[-1]), +# link = get(reg[name == i]$link), +# defVars = defVars[-1] +# ) +# }) +# }) +# +# forall(gen_evalDef_call, function(args) expect_silent(do.call(.evalDef, args))) +# }) test_that(".evalDef throws errors correctly.", { + skip_on_cran() + expect_error(.evalDef(newvar = 1, "1 + 2", "normal", 0, "identiy", ""), class = "simstudy::wrongType") expect_error(.evalDef(newvar = c("a", "b"), "1 + 2", "normal", 0, "identiy", ""), class = "simstudy::lengthMismatch") expect_error(.evalDef(newvar = "varname", "1 + 2", "not valid", 0, "identiy", ""), class = "simstudy::optionInvalid") @@ -47,23 +51,26 @@ test_that(".evalDef throws errors correctly.", { }) # .isValidArithmeticFormula ---- -test_that("g.a.e. formula checked correctly.", { - skip_on_cran() - gen_gae <- - gen.and_then(gen_varnames(8), function(ns) { - gen.map(function(y) { - list( - defVars = ns, formula = y - ) - }, gen_formula(ns)) - }) - - forall(gen_gae, function(x) { - expect_silent(.isValidArithmeticFormula(x$formula, x$defVars)) - }) -}) +# test_that("g.a.e. formula checked correctly.", { +# skip_on_cran() +# +# gen_gae <- +# gen.and_then(gen_varnames(8), function(ns) { +# gen.map(function(y) { +# list( +# defVars = ns, formula = y +# ) +# }, gen_formula(ns)) +# }) +# +# forall(gen_gae, function(x) { +# expect_silent(.isValidArithmeticFormula(x$formula, x$defVars)) +# }) +# }) test_that(".isValidArithmeticFormula throws errors correctly.", { + skip_on_cran() + expect_error(.isValidArithmeticFormula(""), class = "simstudy::noValue") expect_error(.isValidArithmeticFormula("a;3"), class = "simstudy::valueError") expect_error(.isValidArithmeticFormula("1+3-"), class = "simstudy::valueError") @@ -74,6 +81,7 @@ test_that(".isValidArithmeticFormula throws errors correctly.", { # .checkMixture ---- test_that("'mixture' formula checked correctly", { skip_on_cran() + gen_mix_vars <- gen.choice(gen_dotdot_num, gen_varname, gen.element(-100:100)) gen_mix_vForm <- gen.sized(function(n) { gen.and_then(gen.c(gen_mix_vars, of = n), function(p) { @@ -89,6 +97,8 @@ test_that("'mixture' formula checked correctly", { }) test_that(".checkMixture throws errors.", { + skip_on_cran() + expect_error(.checkMixture("nr | .5 + a "), "same amount") expect_error(.checkMixture("nr | be"), "Probabilities can only be") }) @@ -96,34 +106,40 @@ test_that(".checkMixture throws errors.", { # .checkCategorical ---- test_that("'categorical' formula checked correctly", { skip_on_cran() + forall(gen_cat_probs, function(f) { expect_silent(.checkCategorical(genCatFormula(f))) }) }) test_that(".checkCategorical throws errors.", { + skip_on_cran() + expect_error(.checkCategorical("1"), "two numeric") }) # .checkUniform ---- -test_that("'uniform' formula checked correctly", { - skip_on_cran() - forall( - gen.and_then(gen_varnames(10), function(names) { - generate(for (x in list( - min = gen_formula(names), - max = gen_formula(names) - )) { - paste0(x$min, ";", x$max) - }) - }), - function(r) { - expect_silent(.checkUniform(r)) - } - ) -}) +# test_that("'uniform' formula checked correctly", { +# skip_on_cran() +# +# forall( +# gen.and_then(gen_varnames(10), function(names) { +# generate(for (x in list( +# min = gen_formula(names), +# max = gen_formula(names) 
+# )) { +# paste0(x$min, ";", x$max) +# }) +# }), +# function(r) { +# expect_silent(.checkUniform(r)) +# } +# ) +# }) test_that(".checkUniform throws errors.", { + skip_on_cran() + expect_error(.checkUniform(""), "format") expect_error(.checkUniform("1;2;3"), "format") }) @@ -132,6 +148,8 @@ test_that(".checkUniform throws errors.", { # .isIdLog ---- # .isIdLogit ---- test_that("'link' checked as expected", { + skip_on_cran() + expect_silent(.isIdLog("identity")) expect_silent(.isIdLog("log")) expect_silent(.isIdLogit("identity")) @@ -148,6 +166,8 @@ test_that("'link' checked as expected", { # .isDotArr ---- # .splitFormula ---- test_that("utility functions work", { + skip_on_cran() + names <- c("..as", "..bs", "..cs[4]", "..ds[x]") res <- c("as", "bs", "cs[4]", "ds[x]") @@ -161,6 +181,8 @@ test_that("utility functions work", { }) test_that("defRepeat works.", { + skip_on_cran() + expect_silent( defRepeat(nVars = 4, prefix = "g", formula = "1/3;1/3;1/3", variance = 0, dist = "categorical") ) @@ -172,6 +194,8 @@ test_that("defRepeat works.", { }) test_that("defRepeat throws errors correctly.", { + skip_on_cran() + expect_error(defRepeat(prefix = "b", formula = 5, variance = 3, dist = "normal"), class = "simstudy::missingArgument" ) diff --git a/tests/testthat/test-glue.R b/tests/testthat/test-glue.R index 5978295..8ae2088 100644 --- a/tests/testthat/test-glue.R +++ b/tests/testthat/test-glue.R @@ -1,4 +1,5 @@ test_that("Blocks are collapsed as expected.", { + skip_on_cran() nums <- 1:3 num <- 23 expect_equal( @@ -14,6 +15,7 @@ test_that("Blocks are collapsed as expected.", { }) test_that("numbers are formated as expected.", { + skip_on_cran() nums <- c(1.23, 0.556, 1 / 3) ints <- c(1, 2, 3) expect_equal(glueFmt("{nums:.2f}"), as.character(round(nums, 2))) @@ -22,6 +24,7 @@ test_that("numbers are formated as expected.", { }) test_that("numbers are collapsed and formated correctly.", { + skip_on_cran() ints <- c(1, 2, 3) expect_equal(glueFmtC("{ints:02d}"), "01, 02 and 03") expect_equal(glueFmtC("{2:.1f}"), "2.0") diff --git a/tests/testthat/test-group_data.R b/tests/testthat/test-group_data.R index f96e7f4..28a56aa 100644 --- a/tests/testthat/test-group_data.R +++ b/tests/testthat/test-group_data.R @@ -1,5 +1,6 @@ # .addStrataCode ---- test_that("strata codes are added as expected.", { + skip_on_cran() def <- defData(varname = "male", dist = "binary", formula = .5, id = "cid") def <- defData(def, varname = "over65", dist = "binary", formula = "-1.7 + .8*male", link = "logit") def <- defData(def, varname = "baseDBP", dist = "normal", formula = 70, variance = 40) @@ -14,6 +15,7 @@ test_that("strata codes are added as expected.", { # .stratSamp ---- test_that("stratified samples are drawn correctly.", { + skip_on_cran() expect_length(.stratSamp(1, 2), 1) expect_length(.stratSamp(2, 4), 2) expect_length(.stratSamp(50, 3), 50) diff --git a/tests/testthat/test-internal_utility.R b/tests/testthat/test-internal_utility.R index 75842f5..b7e2747 100644 --- a/tests/testthat/test-internal_utility.R +++ b/tests/testthat/test-internal_utility.R @@ -1,5 +1,6 @@ # .parseDotVars ---- test_that("dotVars are parsed correctly.", { + skip_on_cran() extVar1 <- 23 extVar2 <- 42 res <- list(..extVar1 = 23, ..extVar2 = 42) @@ -12,6 +13,7 @@ test_that("dotVars are parsed correctly.", { }) test_that("variables from different environments are parsed correctly.", { + skip_on_cran() extVar3 <- 7 env1 <- new.env() env2 <- new.env(parent = env1) @@ -27,6 +29,7 @@ test_that("variables from different environments are parsed 
correctly.", { # .evalWith ---- test_that("evalWith throws errors.", { + skip_on_cran() df <- data.frame() ext <- list(formula2parse = 2) @@ -35,6 +38,7 @@ test_that("evalWith throws errors.", { }) test_that("evalWith output length is correct.", { + skip_on_cran() df <- data.frame(a = rep.int(5, 5)) ext <- list(..ev = 2) @@ -43,6 +47,7 @@ test_that("evalWith output length is correct.", { }) test_that("evalWith output is Matrix.", { + skip_on_cran() df <- data.frame(a = rep.int(5, 5)) ext <- list(..ev = 2) @@ -74,11 +79,13 @@ test_that("probabilities (matrix) are adjusted as documented.", { # .getDists ---- test_that("number of Dists is up to date.", { + skip_on_cran() expect_length(.getDists(), 16) }) # .isFormulaScalar ---- test_that("isFormularScalar works correctly.", { + skip_on_cran() expect_true(.isFormulaScalar("5 + 3")) expect_true(.isFormulaScalar(5 + 3)) @@ -89,6 +96,7 @@ test_that("isFormularScalar works correctly.", { # .isValidVarName ---- test_that("var names are validated correctly.", { + skip_on_cran() validNames <- c("var1", "name", "name2", "var1") wrongNames <- c("...", "..1", "..5") @@ -103,6 +111,7 @@ test_that("var names are validated correctly.", { # .isError ---- test_that("errors are detected correctly.", { + skip_on_cran() err <- try(nonVar + 4, silent = TRUE) noErr <- try(3 + 5, silent = TRUE) @@ -114,6 +123,7 @@ test_that("errors are detected correctly.", { # .hasValue ---- test_that("hasValue works.", { + skip_on_cran() expect_true(.hasValue("value")) expect_true((function(x) .hasValue(x))(5)) expect_true((function(x) .hasValue(x))(NA)) @@ -125,6 +135,7 @@ test_that("hasValue works.", { # .log2Prob ---- test_that("log odds are converted correctly.", { + skip_on_cran() prob <- 0.2 logOdds <- log(0.25) diff --git a/tests/testthat/test-missing_data.R b/tests/testthat/test-missing_data.R index 878a2eb..8637e5a 100644 --- a/tests/testthat/test-missing_data.R +++ b/tests/testthat/test-missing_data.R @@ -31,7 +31,7 @@ test_that("genMiss works", { def1 <- defData(def1, "x2", dist = "normal", formula = "20*m + 20*u", variance = 2) def1 <- defData(def1, "x3", dist = "normal", formula = "20*m + 20*u", variance = 2) - dtAct1 <- genData(5000, def1) + dtAct1 <- genData(10000, def1) hardProbForm <- runif(1) form1val0 <- runif(1) diff --git a/tests/testthat/test-survival.R b/tests/testthat/test-survival.R index 11bb162..467e217 100644 --- a/tests/testthat/test-survival.R +++ b/tests/testthat/test-survival.R @@ -1,8 +1,10 @@ test_that("defSurv kicks out transition error", { + skip_on_cran() expect_error(defSurv(varname = "censor", formula = "-7", shape = 0.55, transition = 150)) }) test_that("genSurv runs OK", { + skip_on_cran() dS <- defSurv(varname = "event_1", formula = "-10", shape = 0.3) dS <- defSurv(dS, "event_2", "-6.5", shape = 0.4) dS <- defSurv(dS, "event_3", "-7", shape = 0.5) @@ -28,6 +30,7 @@ test_that("genSurv runs OK", { }) test_that("genSurv throws errors", { + skip_on_cran() dS <- defSurv(varname = "event_1", formula = "-10", shape = 0.3) dS <- defSurv(dS, "event_2", "-6.5", shape = 0.4) dS <- defSurv(dS, "event_3", "-7", shape = 0.5) @@ -38,6 +41,7 @@ test_that("genSurv throws errors", { }) test_that("addCmpRisk works", { + skip_on_cran() dS <- defSurv(varname = "event_1", formula = "-10", shape = 0.3) dS <- defSurv(dS, "event_2", "-6.5", shape = 0.4) diff --git a/tests/testthat/test-utility.R b/tests/testthat/test-utility.R index faaee48..76ac259 100644 --- a/tests/testthat/test-utility.R +++ b/tests/testthat/test-utility.R @@ -44,6 +44,7 @@ 
test_that("probabilities (vector) are adjusted as documented.", { }) test_that("genCatFormula throws errors.", { + skip_on_cran() expect_error(genCatFormula(), "Need to specify") expect_error(genCatFormula(1, 2, 3, n = 5), "or n, not both") expect_error(genCatFormula(1.1), "must be less than 1") @@ -53,16 +54,19 @@ test_that("genCatFormula throws errors.", { # betaGetShapes ---- test_that("betaGetShapes throws errors.", { + skip_on_cran() expect_error(betaGetShapes(1, 12), class = "simstudy::valueError") expect_error(betaGetShapes(.5, -5), class = "simstudy::valueError") }) test_that("betaGetShapes works.", { + skip_on_cran() expect_equal(betaGetShapes(.4, 5), list(shape1 = .4 * 5, shape2 = (1 - .4) * 5)) }) # genMixFormula ---- test_that("genMixFormula throws errors.", { + skip_on_cran() expect_error(genMixFormula(), class = "simstudy::missingArgument") expect_error(genMixFormula("a", varLength = 3), class = "simstudy::valueError") expect_error(genMixFormula("..a", varLength = "b"), class = "simstudy::wrongType") @@ -72,6 +76,7 @@ test_that("genMixFormula throws errors.", { }) test_that("genMixFormula works.", { + skip_on_cran() expect_equal(genMixFormula("a"), "a | 1") expect_equal(genMixFormula(c("a", "..b"), c(.3, .7)), "a | 0.3 + ..b | 0.7") expect_equal( @@ -83,6 +88,7 @@ test_that("genMixFormula works.", { # survGetParams ---- test_that("survGetParams throws errors.", { + skip_on_cran() expect_error(survGetParams(), class = "simstudy::missingArgument") expect_error(survGetParams(c(100, .5)), class = "simstudy::wrongClass") points <- list(c(280, 0.85), c(165, .45)) @@ -96,6 +102,7 @@ test_that("survGetParams throws errors.", { }) test_that("survGetParam works.", { + skip_on_cran() points <- list(c(50, 0.90), c(100, 0.10)) expect_equal(survGetParams(points), c(-19.658, 0.225), tolerance = .001) points <- list(c(60, 0.90), c(100, .75), c(200, .25), c(250, .10)) @@ -105,12 +112,14 @@ test_that("survGetParam works.", { # plotSurv ---- test_that("survParamPlot throws errors.", { + skip_on_cran() expect_error(survParamPlot(), class = "simstudy::missingArgument") expect_error(survParamPlot(formula = -10), class = "simstudy::missingArgument") expect_error(survParamPlot(formula = 4, shape = -1), class = "simstudy::wrongSign") }) test_that("survParamPlot works.", { + skip_on_cran() expect_is(survParamPlot(formula = -4, shape = 1), class = "ggplot") points <- list(c(100, .8), c(200, .5)) @@ -122,3 +131,105 @@ test_that("survParamPlot works.", { class = "ggplot" ) }) + +# logisticCoefs + +# test_that("logisticCoefs works.", { +# +# skip_on_cran() +# +# d1 <- defData(varname = "x1", formula = 0, variance = 1) +# d1 <- defData(d1, varname = "b1", formula = 0.5, dist = "binary") +# +# coefs <- log(runif(2, min = .8, max = 1.2)) +# +# ### Prevalence +# +# d1a <- defData(d1, varname = "y", +# formula = "t(..B) %*% c(1, x1, b1)", +# dist = "binary", link = "logit" +# ) +# +# tPop <- round(runif(1, .2, .5), 2) +# B <- logisticCoefs(defCovar = d1, coefs = coefs, popPrev = tPop) +# +# dd <- genData(100000, d1a) +# expect_equal(dd[, mean(y)], tPop, tolerance = .025) +# +# #### Comparisons +# +# d1a <- defData(d1, varname = "rx", formula = "1;1", dist = "trtAssign") +# d1a <- defData(d1a, varname = "y", +# formula = "t(..B) %*% c(1, rx, x1, b1)", +# dist = "binary", link = "logit" +# ) +# +# ### Risk ratio +# +# rr <- runif(1, .1, 1/tPop) +# B <- logisticCoefs(d1, coefs, popPrev = tPop, rr = rr, trtName = "rx") +# +# dd <- genData(100000, d1a) +# expect_equal(dd[rx==0, mean(y)], tPop, tolerance = .025) +# 
expect_equal(dd[rx==1, mean(y)]/dd[rx==0, mean(y)], rr, tolerance = 0.025) +# +# ### risk difference +# +# rd <- runif(1, -tPop, 1 - tPop) +# B <- logisticCoefs(d1, coefs, popPrev = tPop, rd = rd, trtName = "rx") +# +# dd <- genData(100000, d1a) +# expect_equal(dd[rx==0, mean(y)], tPop, tolerance = .025) +# expect_equal(dd[rx==1, mean(y)] - dd[rx==0, mean(y)], rd, tolerance = 0.025) +# +# ### AUC +# +# d1a <- defData(d1, varname = "y", +# formula = "t(..B) %*% c(1, x1, b1)", +# dist = "binary", link = "logit" +# ) +# +# auc <- runif(1, 0.6, 0.95) +# B <- logisticCoefs(d1, coefs, popPrev = tPop, auc = auc) +# +# dx <- genData(500000, d1a) +# expect_equal(dx[, mean(y)], tPop, tolerance = .025) +# +# form <- paste("y ~", paste(d1[, varname], collapse = " + ")) +# +# fit <- stats::glm(stats::as.formula(form), data = dx) +# dx[, py := stats::predict(fit)] +# +# Y1 <- dx[y == 1, sample(py, 1000000, replace = TRUE)] +# Y0 <- dx[y == 0, sample(py, 1000000, replace = TRUE)] +# aStat <- mean(Y1 > Y0) +# +# expect_equal(aStat, auc, tolerance = 0.025) +# +# }) + +test_that("logisticCoefs throws errors.", { + + skip_on_cran() + + d1 <- defData(varname = "x1", formula = 0, variance = 1) + d1 <- defData(d1, varname = "b1", formula = 0.5, dist = "binary") + + coefs <- log(runif(2, min = .8, max = 1.2)) + coefs2 <- log(runif(1, min = .8, max = 1.2)) + coefs3 <- c("a", "b") + + expect_error(logisticCoefs(d1, coefs), class = "simstudy::missingArgument") + expect_error(logisticCoefs(coef = coefs, popPrev = .5), class = "simstudy::missingArgument") + expect_error(logisticCoefs(defCovar = d1, popPrev = .5), class = "simstudy::missingArgument") + expect_error(logisticCoefs(d1, coefs, popPrev = .5, rr = 1.1, rd = .4), class = "simpleError") + expect_error(logisticCoefs(d1, coefs=coefs2, popPrev = .5), class = "simstudy::lengthMismatch" ) + expect_error(logisticCoefs(d1, coefs=coefs, popPrev = .5, rr = -1), class = "simstudy::minError" ) + expect_error(logisticCoefs(d1, coefs=coefs, popPrev = .5, rr = 2.1), class = "simpleError" ) + expect_error(logisticCoefs(d1, coefs=coefs, popPrev = .5, rd = .6), class = "simstudy::valueError" ) + expect_error(logisticCoefs(d1, coefs=coefs, popPrev = .5, rd = -.7), class = "simstudy::valueError" ) + expect_error(logisticCoefs(d1, coefs=coefs, popPrev = .5, auc = .4), class = "simstudy::valueError" ) + expect_error(logisticCoefs(d1, coefs=coefs, popPrev = 1.2), class = "simstudy::valueError" ) + expect_error(logisticCoefs(d1, coefs=coefs3, popPrev = .4), class = "simstudy::wrongType" ) + +}) diff --git a/vignettes/clustered.Rmd b/vignettes/clustered.Rmd index 7e9ad7a..144f02a 100644 --- a/vignettes/clustered.Rmd +++ b/vignettes/clustered.Rmd @@ -7,6 +7,10 @@ vignette: > \usepackage[utf8]{inputenc} --- +```{r chunkname, echo=-1} +data.table::setDTthreads(2) +``` + ```{r, echo = FALSE, message = FALSE} library(simstudy) library(ggplot2) diff --git a/vignettes/corelationmat.Rmd b/vignettes/corelationmat.Rmd index 816af70..60f2e07 100644 --- a/vignettes/corelationmat.Rmd +++ b/vignettes/corelationmat.Rmd @@ -7,6 +7,9 @@ vignette: > \usepackage[utf8]{inputenc} --- +```{r chunkname, echo=-1} +data.table::setDTthreads(2) +``` ```{r, echo = FALSE, message = FALSE} library(simstudy) diff --git a/vignettes/correlated.Rmd b/vignettes/correlated.Rmd index 9fd6ffe..5e0b48e 100644 --- a/vignettes/correlated.Rmd +++ b/vignettes/correlated.Rmd @@ -7,6 +7,10 @@ vignette: > \usepackage[utf8]{inputenc} --- +```{r chunkname, echo=-1} +data.table::setDTthreads(2) +``` + ```{r, echo = FALSE, message 
= FALSE} library(simstudy) library(ggplot2) diff --git a/vignettes/double_dot_extension.Rmd b/vignettes/double_dot_extension.Rmd index 32a1c82..0ce7df5 100644 --- a/vignettes/double_dot_extension.Rmd +++ b/vignettes/double_dot_extension.Rmd @@ -7,6 +7,10 @@ vignette: > \usepackage[utf8]{inputenc} --- +```{r chunkname, echo=-1} +data.table::setDTthreads(2) +``` + ```{r, echo = FALSE, message = FALSE} options(digits = 3) @@ -196,7 +200,7 @@ d1 <- defData(d1, varname = "outcome", formula = "..effect[a, b]", dist="nonrand ``` ```{r} -dx <- genData(8, d1) +dx <- genData(1000, d1) dx ``` @@ -215,7 +219,7 @@ dsum <- dx[, .(outcome=mean(outcome)), keyby = .(a, b)] ggplot(data = dx, aes(x = factor(a), y = outcome)) + geom_jitter(aes(color = factor(b)), width = .2, alpha = .4, size = .2) + geom_point(data = dsum, size = 2, aes(color = factor(b))) + - geom_line(data = dsum, size = 1, aes(color = factor(b), group = factor(b))) + + geom_line(data = dsum, linewidth = 1, aes(color = factor(b), group = factor(b))) + scale_color_manual(values = cbbPalette, name = " b") + theme(panel.grid = element_blank()) + xlab ("a") diff --git a/vignettes/logisticCoefs.Rmd b/vignettes/logisticCoefs.Rmd new file mode 100644 index 0000000..7e8c843 --- /dev/null +++ b/vignettes/logisticCoefs.Rmd @@ -0,0 +1,248 @@ +--- +title: "Targeted logistic model coefficients" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Logistic Model Simulation} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + +```{r chunkname, echo=-1} +data.table::setDTthreads(2) +``` + +```{r, echo = FALSE, message = FALSE} +library(simstudy) +library(ggplot2) +library(scales) +library(grid) +library(gridExtra) +library(data.table) + +options(digits = 2) +``` + +In `simstudy`, there are at least two ways to define a binary data generating process. The first is to operate on the scale of the proportion or probability using the *identity* link. This allows users to define a data generating process that reflects assumptions about risk ratios and risk differences when comparing two groups defined by an exposure or treatment. However, this process can become challenging when introducing other covariates, because it can be difficult to constrain the probabilities so that they fall between 0 and 1. + +The second approach works on the log-odds scale using a *logit* link, and is much more amenable to accommodating covariates. Unfortunately, this comes at the price of being able to easily generate specific risk ratios and risk differences, because all parameters are log-odds ratios. The overall (marginal) prevalence of an outcome in a population will vary depending on the distribution of covariates in that population, and the strengths (both absolute and relative) of the association of those covariates with the outcome. That is, the coefficients of a logistic model (including the intercept) determine the prevalence. The same is true regarding the risk ratio and risk difference (if there is one particular exposure or treatment of interest) and the AUC. + +Here we start with the simplest case where we have a target marginal proportion or prevalence, and then illustrate data generation with three other target statistics: **risk ratios**, **risk differences**, and **AUCs**. 
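+
+Before turning to those targets, a few lines of base R illustrate why the identity-link approach is hard to constrain (the coefficients here are arbitrary, chosen only for illustration): with a single standard normal covariate, a linear model for the probability frequently falls outside of $[0, 1]$.
+
+```{r}
+set.seed(123)
+x <- rnorm(1000)
+p <- 0.3 + 0.4 * x   # the intended "probability" on the identity scale
+mean(p < 0 | p > 1)  # share of values that are not valid probabilities
+```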
+
+### Prevalence
+
+In this first example, we start with one set of assumptions for four covariates $x_1, x_2 \sim N(0, 1)$, $b_1 \sim Bin(0.3)$, and $b_2 \sim Bin(0.7)$, and generate the outcome *y* with the following data generating process:
+
+$$ \text{logit}(y) = 0.15x_1 + 0.25x_2 + 0.10b_1 + 0.30b_2$$
+
+
+```{r}
+library(simstudy)
+library(ggplot2)
+library(data.table)
+
+coefs1 <- c(0.15, 0.25, 0.10, 0.30)
+
+d1 <- defData(varname = "x1", formula = 0, variance = 1)
+d1 <- defData(d1, varname = "x2", formula = 0, variance = 1)
+d1 <- defData(d1, varname = "b1", formula = 0.3, dist = "binary")
+d1 <- defData(d1, varname = "b2", formula = 0.7, dist = "binary")
+
+d1a <- defData(d1, varname = "y",
+  formula = "t(..coefs1) %*% c(x1, x2, b1, b2)",
+  dist = "binary", link = "logit")
+
+set.seed(48392)
+
+dd <- genData(500000, d1a)
+dd
+```
+
+The overall proportion of $y=1$ in this case is
+
+```{r}
+dd[, mean(y)]
+```
+
+If we have a desired marginal proportion of 0.40, then we can add an intercept of -0.66 to the data generating process:
+
+$$ \text{logit}(y) = -0.66 + 0.15x_1 + 0.25x_2 + 0.10b_1 + 0.30b_2$$
+
+The simulation now gives us the desired target:
+
+```{r}
+d1a <- defData(d1, varname = "y",
+  formula = "t(c(-0.66, ..coefs1)) %*% c(1, x1, x2, b1, b2)",
+  dist = "binary", link = "logit")
+
+genData(500000, d1a)[, mean(y)]
+```
+
+If we change the distribution of the covariates, so that $x_1 \sim N(1, 1)$, $x_2 \sim N(3, 1)$, $b_1 \sim Bin(0.5)$, and $b_2 \sim Bin(0.8)$, and the strength of the association of these covariates with the outcome so that
+
+$$ \text{logit}(y) = 0.20x_1 + 0.35x_2 + 0.20b_1 + 0.45b_2,$$
+
+the marginal proportion/prevalence (assuming no intercept term) also changes, going from 0.56 to 0.84:
+
+```{r}
+coefs2 <- c(0.20, 0.35, 0.20, 0.45)
+
+d2 <- defData(varname = "x1", formula = 1, variance = 1)
+d2 <- defData(d2, varname = "x2", formula = 3, variance = 1)
+d2 <- defData(d2, varname = "b1", formula = 0.5, dist = "binary")
+d2 <- defData(d2, varname = "b2", formula = 0.8, dist = "binary")
+
+d2a <- defData(d2, varname = "y",
+  formula = "t(..coefs2) %*% c(x1, x2, b1, b2)",
+  dist = "binary", link = "logit")
+
+genData(500000, d2a)[, mean(y)]
+```
+
+But under this new distribution, adding an intercept of -2.13 yields the desired target.
+
+$$ \text{logit}(y) = -2.13 + 0.20x_1 + 0.35x_2 + 0.20b_1 + 0.45b_2 $$
+
+
+```{r}
+d2a <- defData(d2, varname = "y",
+  formula = "t(c(-2.13, ..coefs2)) %*% c(1, x1, x2, b1, b2)",
+  dist = "binary", link = "logit")
+
+genData(500000, d2a)[, mean(y)]
+```
+
+#### Finding the intercept
+
+Where did those two intercepts come from? The [paper](https://bmcmedresmethodol.biomedcentral.com/articles/10.1186/s12874-023-01836-5){target="_blank"} by Peter Austin describes an iterative bisection procedure that takes a distribution of covariates and a set of coefficients and identifies the intercept that yields the target marginal proportion or prevalence.
+
+The general idea of the algorithm is to try out a series of different intercepts in an intelligent way that ends up at the right spot. (If you want the details of the algorithm, take a look at the [paper](https://bmcmedresmethodol.biomedcentral.com/articles/10.1186/s12874-023-01836-5){target="_blank"}.) The starting search range is pre-defined (we've used -10 to 10 for the intercept), and we start with a value of 0 for the initial intercept, simulate a large data set (the paper uses 1 million observations, but 100,000 seems to work just fine), and record the population prevalence. If we've overshot the target prevalence, we turn our attention to the range between -10 and 0 and set the next intercept to its midpoint, -5. Otherwise, we focus on the range between 0 and 10. We iterate this way, choosing the range we need to focus on and setting the intercept at the midpoint (hence the name *bisection*). The algorithm converges pretty quickly on the value of the intercept that gives the target population prevalence for the underlying covariate distribution and coefficient assumptions. (A bare-bones sketch of this search appears at the end of this vignette.)
+
+In the current implementation in `simstudy`, the intercept is provided by a simple call to `logisticCoefs`. Here are the calls for the two sets of definitions in definition tables *d1* and *d2*.
+
+```{r}
+logisticCoefs(defCovar = d1, coefs = coefs1, popPrev = 0.40)
+logisticCoefs(defCovar = d2, coefs = coefs2, popPrev = 0.40)
+```
+
+### Risk ratios
+
+Just as the prevalence depends on the distribution of covariates and their association with the outcome, risk ratios comparing the outcome probabilities for two groups also depend on the other covariates. The marginal risk ratio comparing treatment ($A = 1$) to control ($A = 0$), given the distribution of covariates, is
+
+$$RR = \frac{P(y=1 | A = 1)}{P(y=1 | A = 0)}$$
+
+In the data generation process we use a log-odds ratio of -0.40 (an odds ratio of approximately 0.67) in both cases, but we get different risk ratios (0.82 vs. 0.93), depending on the covariates (defined in *d1* and *d2*).
+
+```{r}
+d1a <- defData(d1, varname = "rx", formula = "1;1", dist = "trtAssign")
+d1a <- defData(d1a, varname = "y",
+  formula = "t(c(-0.40, ..coefs1)) %*% c(rx, x1, x2, b1, b2)",
+  dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+dd[rx==1, mean(y)]/dd[rx==0, mean(y)]
+```
+
+```{r}
+d2a <- defData(d2, varname = "rx", formula = "1;1", dist = "trtAssign")
+d2a <- defData(d2a, varname = "y",
+  formula = "t(c(-0.40, ..coefs2)) %*% c(rx, x1, x2, b1, b2)",
+  dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d2a)
+dd[rx==1, mean(y)]/dd[rx==0, mean(y)]
+```
+
+By specifying both a population prevalence and a target risk ratio in the call to `logisticCoefs`, we can get the necessary parameters. When specifying the target risk ratio, it is required to be between 0 and 1/popPrev.
+A risk ratio cannot be negative, and the probability of the outcome under treatment cannot exceed 1 (which will happen if the risk ratio is greater than 1/popPrev). With popPrev = 0.40, for example, the risk ratio must fall below 2.5.
+
+```{r}
+C1 <- logisticCoefs(d1, coefs1, popPrev = 0.40, rr = 0.85)
+C1
+```
+
+If we use $C_1$ in the data generation process, we will get a data set with the desired target prevalence and risk ratio:
+
+```{r}
+d1a <- defData(d1, varname = "rx", formula = "1;1", dist = "trtAssign")
+d1a <- defData(d1a, varname = "y",
+  formula = "t(..C1) %*% c(1, rx, x1, x2, b1, b2)",
+  dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+```
+
+Here are the prevalence and risk ratio:
+
+```{r}
+dd[rx==0, mean(y)]
+dd[rx==1, mean(y)]/dd[rx==0, mean(y)]
+```
+
+You can do the same for the second set of assumptions.
+
+### Risk differences
+
+Risk differences have the same set of issues, and are handled in the same way. The risk difference is defined as
+
+$$ RD = P(y=1 | A = 1) - P(y=1 | A = 0)$$
+
+To get the coefficients related to a population prevalence of 0.40 and a risk difference of -0.15 (so that the proportion in the exposure arm is 0.25), we use the *rd* argument:
+
+```{r}
+C1 <- logisticCoefs(d1, coefs1, popPrev = 0.40, rd = -0.15)
+C1
+```
+
+Again, using $C_1$ in the data generation process, we will get a data set with the desired target prevalence and risk difference:
+
+```{r}
+d1a <- defData(d1, varname = "rx", formula = "1;1", dist = "trtAssign")
+d1a <- defData(d1a, varname = "y",
+  formula = "t(..C1) %*% c(1, rx, x1, x2, b1, b2)",
+  dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+
+dd[rx==0, mean(y)]
+dd[rx==1, mean(y)] - dd[rx==0, mean(y)]
+```
+
+### AUC
+
+The AUC is another commonly used statistic to evaluate a logistic model. We can use `logisticCoefs` to find the parameters that will allow us to generate data from a model with a specific AUC. To get the coefficients related to a population prevalence of 0.40 and an AUC of 0.85, we use the *auc* argument (which must be between 0.5 and 1):
+
+```{r}
+C1 <- logisticCoefs(d1, coefs1, popPrev = 0.40, auc = 0.85)
+C1
+```
+
+Again, using $C_1$ in the data generation process, we will get a data set with the desired target prevalence and AUC (calculated here using the `lrm` function in the `rms` package):
+
+```{r}
+d1a <- defData(d1, varname = "y",
+  formula = "t(..C1) %*% c(1, x1, x2, b1, b2)",
+  dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+
+dd[, mean(y)]
+
+fit <- rms::lrm(y ~ x1 + x2 + b1 + b2, data = dd)
+fit$stats["C"]
+```
+
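+One simple way to check the AUC without `rms` is to estimate the concordance probability directly: fit a logistic regression and repeatedly compare the predicted values of randomly drawn cases ($y=1$) and non-cases ($y=0$). This is only a sketch; the sampling-based estimate ignores ties, which are rare with continuous covariates.
+
+```{r, eval = FALSE}
+# estimate P(score of a random case > score of a random non-case)
+fit <- glm(y ~ x1 + x2 + b1 + b2, data = dd, family = binomial)
+dd[, py := predict(fit)]  # link-scale predictions preserve the ranking
+
+Y1 <- dd[y == 1, sample(py, 1e5, replace = TRUE)]
+Y0 <- dd[y == 0, sample(py, 1e5, replace = TRUE)]
+mean(Y1 > Y0)             # should be close to the target AUC of 0.85
+```
+
+Finally, to make the bisection procedure described in *Finding the intercept* concrete, here is a bare-bones sketch of the search. The helper `findIntercept` below is purely illustrative (it is not part of `simstudy`; `logisticCoefs` is the supported interface), and it assumes the target prevalence is attainable within the default bounds. It follows the same logic as the algorithm described above: fix a large sample of covariates, then repeatedly halve the search range until the implied marginal prevalence matches the target.
+
+```{r, eval = FALSE}
+# illustrative only: `findIntercept` is not a simstudy function
+findIntercept <- function(defCovar, coefs, popPrev,
+                          bounds = c(-10, 10), n = 1e5, tol = 1e-4) {
+
+  dx <- genData(n, defCovar)            # draw the covariates once
+  X <- as.matrix(dx[, !"id"])           # covariate matrix, id column dropped
+  lp <- X %*% coefs                     # fixed part of the linear predictor
+
+  for (i in 1:50) {
+    b0 <- mean(bounds)                  # midpoint of the current search range
+    p <- mean(stats::plogis(b0 + lp))   # implied marginal prevalence
+    if (abs(p - popPrev) < tol) break
+    if (p > popPrev) bounds[2] <- b0 else bounds[1] <- b0
+  }
+
+  b0
+}
+
+findIntercept(d1, coefs1, popPrev = 0.40) # should land close to the -0.66 above
+```
+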

+References: + +Austin, Peter C. "The iterative bisection procedure: a useful +tool for determining parameter values in data-generating processes in +Monte Carlo simulations." BMC Medical Research Methodology 23, +no. 1 (2023): 1-10. + +

\ No newline at end of file diff --git a/vignettes/longitudinal.Rmd b/vignettes/longitudinal.Rmd index 084c2f5..7032300 100644 --- a/vignettes/longitudinal.Rmd +++ b/vignettes/longitudinal.Rmd @@ -7,6 +7,10 @@ vignette: > \usepackage[utf8]{inputenc} --- +```{r chunkname, echo=-1} +data.table::setDTthreads(2) +``` + ```{r, echo = FALSE, message = FALSE} library(simstudy) library(ggplot2) diff --git a/vignettes/missing.Rmd b/vignettes/missing.Rmd index 4aa2f74..ce4f5d8 100644 --- a/vignettes/missing.Rmd +++ b/vignettes/missing.Rmd @@ -7,6 +7,10 @@ vignette: > \usepackage[utf8]{inputenc} --- +```{r chunkname, echo=-1} +data.table::setDTthreads(2) +``` + ```{r, echo = FALSE, message = FALSE} library(simstudy) library(ggplot2) diff --git a/vignettes/ordinal.Rmd b/vignettes/ordinal.Rmd index 162d5f6..ba365ff 100644 --- a/vignettes/ordinal.Rmd +++ b/vignettes/ordinal.Rmd @@ -7,6 +7,10 @@ vignette: > \usepackage[utf8]{inputenc} --- +```{r chunkname, echo=-1} +data.table::setDTthreads(2) +``` + ```{r, echo = FALSE, message = FALSE} library(simstudy) library(ggplot2) diff --git a/vignettes/simstudy.Rmd b/vignettes/simstudy.Rmd index 654a157..a97541b 100644 --- a/vignettes/simstudy.Rmd +++ b/vignettes/simstudy.Rmd @@ -7,6 +7,10 @@ vignette: > \usepackage[utf8]{inputenc} --- +```{r chunkname, echo=-1} +data.table::setDTthreads(2) +``` + ```{r, echo = FALSE, message = FALSE} library(simstudy) diff --git a/vignettes/spline.Rmd b/vignettes/spline.Rmd index 5a18e33..8d07ae4 100644 --- a/vignettes/spline.Rmd +++ b/vignettes/spline.Rmd @@ -7,6 +7,10 @@ vignette: > \usepackage[utf8]{inputenc} --- +```{r chunkname, echo=-1} +data.table::setDTthreads(2) +``` + ```{r, echo = FALSE, message = FALSE} library(simstudy) library(ggplot2) diff --git a/vignettes/survival.Rmd b/vignettes/survival.Rmd index c59f1b6..25b48ab 100644 --- a/vignettes/survival.Rmd +++ b/vignettes/survival.Rmd @@ -7,6 +7,9 @@ vignette: > \usepackage[utf8]{inputenc} --- +```{r chunkname, echo=-1} +data.table::setDTthreads(2) +``` ```{r, echo = FALSE, message = FALSE} diff --git a/vignettes/treat_and_exposure.Rmd b/vignettes/treat_and_exposure.Rmd index 3420bf9..f19b3e0 100644 --- a/vignettes/treat_and_exposure.Rmd +++ b/vignettes/treat_and_exposure.Rmd @@ -7,6 +7,10 @@ vignette: > \usepackage[utf8]{inputenc} --- +```{r chunkname, echo=-1} +data.table::setDTthreads(2) +``` + ```{r, echo = FALSE, message=FALSE} library(simstudy)