From 782acafdc0d10c0403e06d9319f8f1525f9f5858 Mon Sep 17 00:00:00 2001
From: Keith Goldfeld !K_hpp$|`5pKj{t7L5;=$pAR00E@r3(*(
zFZqMN?V)O?x!4NL@y~MhS?pfnYuL^~qsqL_@vs@J{u^xj{r*?Cqe3x@9U(a8VCng~
Rf_!-T{Rh~7aQQI?000=N0^ We start by defining the school level data: The classroom level data are generated with a call to
Finally, the student level data are added using the same process: This is what the clustered data look like. Each classroom is
represented by a box, and each school is represented by a color. The
intervention group is highlighted by dark outlines:Clustered Data
classrooms, so the simulation provides random effects at each
of these levels.<- defData(varname = "s0", dist = "normal", formula = 0, variance = 3,
- gen.school id = "idSchool")
- <- defData(gen.school, varname = "nClasses", dist = "noZeroPoisson", formula = 3)
- gen.school
-set.seed(282721)
-
-<- genData(8, gen.school)
- dtSchool <- trtAssign(dtSchool, n = 2)
- dtSchool
- dtSchool
gen.school <- defData(varname = "s0", dist = "normal", formula = 0, variance = 3,
+ id = "idSchool")
+gen.school <- defData(gen.school, varname = "nClasses", dist = "noZeroPoisson", formula = 3)
+
+set.seed(282721)
+
+dtSchool <- genData(8, gen.school)
+dtSchool <- trtAssign(dtSchool, n = 2)
+
+dtSchool
## idSchool s0 nClasses trtGrp
## 1: 1 0.9732297 3 1
## 2: 2 3.5741932 4 1
@@ -378,14 +378,14 @@
Clustered Data
genCluster
, and then school level data is added by a call
to addColumns
:<- defDataAdd(varname = "c0", dist = "normal", formula = 0, variance = 2)
- gen.class <- defDataAdd(gen.class, varname = "nStudents", dist = "noZeroPoisson",
- gen.class formula = 20)
-
-<- genCluster(dtSchool, "idSchool", numIndsVar = "nClasses", level1ID = "idClass")
- dtClass <- addColumns(gen.class, dtClass)
- dtClass
-head(dtClass, 10)
gen.class <- defDataAdd(varname = "c0", dist = "normal", formula = 0, variance = 2)
+gen.class <- defDataAdd(gen.class, varname = "nStudents", dist = "noZeroPoisson",
+ formula = 20)
+
+dtClass <- genCluster(dtSchool, "idSchool", numIndsVar = "nClasses", level1ID = "idClass")
+dtClass <- addColumns(gen.class, dtClass)
+
+head(dtClass, 10)
## idSchool s0 nClasses trtGrp idClass c0 nStudents
## 1: 1 0.9732297 3 1 1 1.62726560 16
## 2: 1 0.9732297 3 1 2 -0.69640102 16
@@ -398,16 +398,16 @@
Clustered Data
## 9: 3 0.1121028 3 0 9 0.07024057 19
## 10: 3 0.1121028 3 0 10 1.09465368 21<- defDataAdd(varname = "Male", dist = "binary",
- gen.student formula = 0.5)
- <- defDataAdd(gen.student, varname = "age", dist = "uniform",
- gen.student formula = "9.5; 10.5")
- <- defDataAdd(gen.student, varname = "test", dist = "normal",
- gen.student formula = "50 - 5*Male + s0 + c0 + 8 * trtGrp", variance = 2)
- <- genCluster(dtClass, cLevelVar = "idClass", numIndsVar = "nStudents",
- dtStudent level1ID = "idChild")
-
-<- addColumns(gen.student, dtStudent) dtStudent
gen.student <- defDataAdd(varname = "Male", dist = "binary",
+ formula = 0.5)
+gen.student <- defDataAdd(gen.student, varname = "age", dist = "uniform",
+ formula = "9.5; 10.5")
+gen.student <- defDataAdd(gen.student, varname = "test", dist = "normal",
+ formula = "50 - 5*Male + s0 + c0 + 8 * trtGrp", variance = 2)
+dtStudent <- genCluster(dtClass, cLevelVar = "idClass", numIndsVar = "nStudents",
+ level1ID = "idChild")
+
+dtStudent <- addColumns(gen.student, dtStudent)
Setting cluster sizes
0 (the default) implies exact balance, and larger values of dispersion
imply more variability in the cluster sizes.
Here are two examples with exact or close to exact balance:
-<- defData(varname = "clustSize", formula = 120, dist = "clusterSize")
- d1
-genData(8, d1, id = "site")
d1 <- defData(varname = "clustSize", formula = 120, dist = "clusterSize")
+
+genData(8, d1, id = "site")
## site clustSize
## 1: 1 15
## 2: 2 15
@@ -441,7 +441,7 @@ Setting cluster sizes
## 6: 6 15
## 7: 7 15
## 8: 8 15
-genData(7, d1, id = "site")
## site clustSize
## 1: 1 17
## 2: 2 17
@@ -451,10 +451,10 @@ Setting cluster sizes
## 6: 6 18
## 7: 7 17
This is a second example with variability across sites:
-<- defData(varname = "clustSize", formula = 120,
- d1 variance = .1, dist = "clusterSize")
-
-genData(8, d1, id = "site")
d1 <- defData(varname = "clustSize", formula = 120,
+ variance = .1, dist = "clusterSize")
+
+genData(8, d1, id = "site")
## site clustSize
## 1: 1 22
## 2: 2 14
diff --git a/inst/doc/corelationmat.R b/inst/doc/corelationmat.R
index 3981185..6406a4a 100644
--- a/inst/doc/corelationmat.R
+++ b/inst/doc/corelationmat.R
@@ -1,4 +1,7 @@
-## ---- echo = FALSE, message = FALSE-------------------------------------------
+## ----chunkname, echo=-1-------------------------------------------------------
+data.table::setDTthreads(2)
+
+## ----echo = FALSE, message = FALSE--------------------------------------------
library(simstudy)
library(ggplot2)
library(scales)
@@ -28,14 +31,14 @@ ggtheme <- function(panelback = "white") {
}
-## ---- message=FALSE-----------------------------------------------------------
+## ----message=FALSE------------------------------------------------------------
library(simstudy)
library(data.table)
set.seed(37265)
genCorMat(4)
-## ---- message=FALSE-----------------------------------------------------------
+## ----message=FALSE------------------------------------------------------------
R <- genCorMat(4, cors = c(0.6, 0.4, 0.2, 0.5, 0.3, 0.8))
R
@@ -48,7 +51,7 @@ head(dd)
## -----------------------------------------------------------------------------
round(cor(as.matrix(dd[, -1])), 1)
-## ---- echo=FALSE--------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
matform <- "
R = \\left (
\\begin{matrix}
@@ -65,7 +68,7 @@ katex::katex_html(matform, include_css = TRUE)
## -----------------------------------------------------------------------------
genCorMat(nvars = 4, rho = 0.6, corstr = "cs")
-## ---- echo=FALSE--------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
matform <- "
R = \\left (
\\begin{matrix}
@@ -82,7 +85,7 @@ katex::katex_html(matform)
## -----------------------------------------------------------------------------
genCorMat(nvars = 4, rho = 0.6, corstr = "ar1")
-## ---- echo=FALSE--------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
matform <- "\\footnotesize{
R = \\left ( \\begin{array}{c|c|c|c}
@@ -212,7 +215,7 @@ dd <- addCorGen(dc, idvar = "site", param1 = "lambda", corMatrix = RC,
dd
-## ---- eval=FALSE--------------------------------------------------------------
+## ----eval=FALSE---------------------------------------------------------------
# replicate <- function(R, dc) {
# reps <- lapply(1:5000, function(x)
# addCorGen(dc, idvar = "site", param1 = "lambda", corMatrix = R,
@@ -228,7 +231,7 @@ dd
# replicate(R = RC, dc = dc)
#
-## ---- eval=TRUE---------------------------------------------------------------
+## ----eval=TRUE----------------------------------------------------------------
d1 <- defData(varname = "n", formula = 20, dist = "noZeroPoisson")
d1 <- defData(d1, varname = "mu", formula = 10, variance = 8, dist = "normal")
d1 <- defData(d1, varname = "s2", formula = 4, dist = "nonrandom")
@@ -252,7 +255,7 @@ dd <- addCorGen(dc, idvar = "site", param1 = "mu", param2 = "s2",
dd
-## ---- echo=FALSE--------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
library(katex)
matform <- "\\footnotesize{
@@ -311,7 +314,7 @@ R = \\left ( \\begin{array}{c|c|c}
katex_html(matform)
-## ---- echo=FALSE--------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
matform <- "\\footnotesize{
R = \\left ( \\begin{array}{c|c|c}
@@ -369,7 +372,7 @@ R = \\left ( \\begin{array}{c|c|c}
katex_html(matform)
-## ---- echo=FALSE--------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
matform <- "\\footnotesize{
R = \\left ( \\begin{array}{c|c|c}
@@ -427,7 +430,7 @@ R = \\left ( \\begin{array}{c|c|c}
katex_html(matform)
-## ---- echo=FALSE--------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
matform <- "\\footnotesize{
R = \\left ( \\begin{array}{c|c|c}
@@ -509,7 +512,7 @@ R_XD
dd <- genCorGen(n = 5000, nvars = 6, corMatrix = R_XD,
dist = "poisson", params1 = 7, wide = TRUE)
-## ---- echo=FALSE--------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
round(cor(as.matrix(dd[, -1])), 2)
## -----------------------------------------------------------------------------
@@ -521,7 +524,7 @@ R_CE
dd <- genCorGen(n = 5000, nvars = 6, corMatrix = R_CE,
dist = "poisson", params1 = 7, wide = TRUE)
-## ---- echo=FALSE--------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
round(cor(as.matrix(dd[, -1])), 2)
## -----------------------------------------------------------------------------
@@ -533,7 +536,7 @@ R_CD
dd <- genCorGen(n = 5000, nvars = 6, corMatrix = R_CD,
dist = "poisson", params1 = 7, wide = TRUE)
-## ---- echo=FALSE--------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
round(cor(as.matrix(dd[, -1])), 2)
## -----------------------------------------------------------------------------
@@ -563,7 +566,7 @@ R <- blockDecayMat(ninds = N , nperiods = 3, rho_w = 0.6, r = r, nclusters = 10)
lapply(R, function(x) round(x,2))[c(1, 3, 7)]
-## ---- eval=FALSE--------------------------------------------------------------
+## ----eval=FALSE---------------------------------------------------------------
# reps <- lapply(1:5000,
# function(x) addCorGen(dd, idvar = "site", corMatrix = R,
# dist = "poisson", param1 = "lambda", cnames = "y")
diff --git a/inst/doc/corelationmat.Rmd b/inst/doc/corelationmat.Rmd
index 816af70..60f2e07 100644
--- a/inst/doc/corelationmat.Rmd
+++ b/inst/doc/corelationmat.Rmd
@@ -7,6 +7,9 @@ vignette: >
\usepackage[utf8]{inputenc}
---
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
```{r, echo = FALSE, message = FALSE}
library(simstudy)
diff --git a/inst/doc/corelationmat.html b/inst/doc/corelationmat.html
index 558eab7..6e60d2e 100644
--- a/inst/doc/corelationmat.html
+++ b/inst/doc/corelationmat.html
@@ -356,11 +356,11 @@ Simple correlation matrix generation
correlation matrix with a set of specified coefficients.
Here is an example of the first, a randomly generated correlation
matrix:
-library(simstudy)
-library(data.table)
-set.seed(37265)
-
-genCorMat(4)
+
## [,1] [,2] [,3] [,4]
## [1,] 1.00000000 -0.22742403 0.01285282 -0.3201579
## [2,] -0.22742403 1.00000000 -0.04973973 -0.1218070
@@ -370,8 +370,8 @@ Simple correlation matrix generation
well get an error message if it is not positive semidefinite!). These
coefficients define the lower triangle (and upper) triangle of the
matrix, reading down the columns:
-<- genCorMat(4, cors = c(0.6, 0.4, 0.2, 0.5, 0.3, 0.8))
- R R
+
## [,1] [,2] [,3] [,4]
## [1,] 1.0 0.6 0.4 0.2
## [2,] 0.6 1.0 0.5 0.3
@@ -379,10 +379,10 @@ Simple correlation matrix generation
## [4,] 0.2 0.3 0.8 1.0
This matrix can be used to generate data using functions
genCorData
or genCorGen
:
-<- genCorGen(n = 1000, nvars = 4, corMatrix = R, params1 = c(3, 5, 8, 9),
- dd dist = "poisson", wide = TRUE)
-
-head(dd)
+dd <- genCorGen(n = 1000, nvars = 4, corMatrix = R, params1 = c(3, 5, 8, 9),
+ dist = "poisson", wide = TRUE)
+
+head(dd)
## id V1 V2 V3 V4
## 1: 1 3 3 5 6
## 2: 2 3 9 12 10
@@ -392,7 +392,7 @@ Simple correlation matrix generation
## 6: 6 4 5 6 7
And the correlation from this data set is quite close to the
specified matrix R.
-round(cor(as.matrix(dd[, -1])), 1)
+
## V1 V2 V3 V4
## V1 1.0 0.6 0.4 0.2
## V2 0.6 1.0 0.5 0.3
@@ -429,7 +429,7 @@ Specifying a structure
c242.7,-294.7,395.3,-681.7,458,-1161c21.3,-164.7,33.3,-350.7,36,-558
l0,-1344c-2,-159.3,-10,-310.7,-24,-454c-53.3,-528,-210,-949.7,
-470,-1265c-4.7,-6,-9.7,-11.7,-15,-17c-0.7,-0.7,-6.7,-1,-18,-1z">
-genCorMat(nvars = 4, rho = 0.6, corstr = "cs")
+
## [,1] [,2] [,3] [,4]
## [1,] 1.0 0.6 0.6 0.6
## [2,] 0.6 1.0 0.6 0.6
@@ -462,7 +462,7 @@ Specifying a structure
c242.7,-294.7,395.3,-681.7,458,-1161c21.3,-164.7,33.3,-350.7,36,-558
l0,-1344c-2,-159.3,-10,-310.7,-24,-454c-53.3,-528,-210,-949.7,
-470,-1265c-4.7,-6,-9.7,-11.7,-15,-17c-0.7,-0.7,-6.7,-1,-18,-1z">
-genCorMat(nvars = 4, rho = 0.6, corstr = "ar1")
+
## [,1] [,2] [,3] [,4]
## [1,] 1.000 0.60 0.36 0.216
## [2,] 0.600 1.00 0.60 0.360
@@ -619,10 +619,10 @@ Cluster-specific correlation matrices
clusters (though either or both can be scalars, in which case the values
are shared across the clusters). The output is a list of correlation
matrices, one for each cluster.
-<- genCorMat(nvars = c(2, 3, 4, 3), rho = c(0.6, 0.7, 0.5, 0.4),
- RC corstr = "cs", nclusters = 4)
-
- RC
+RC <- genCorMat(nvars = c(2, 3, 4, 3), rho = c(0.6, 0.7, 0.5, 0.4),
+ corstr = "cs", nclusters = 4)
+
+RC
## $`1`
## [,1] [,2]
## [1,] 1.0 0.6
@@ -649,17 +649,17 @@ Cluster-specific correlation matrices
To create the correlated data, first we can generate a data set of
individuals that are clustered in groups. The outcome will be Poisson
distributed, so we are specifying mean \(\lambda\) for each cluster:
-<- defData(varname = "n", formula = "c(2, 3, 4, 3)", dist = "nonrandom")
- d1 <- defData(d1, varname = "lambda", formula = "c(6, 7, 9, 8)", dist = "nonrandom")
- d1
-<- genData(4, d1, id = "site")
- ds <- genCluster(dtClust = ds, cLevelVar = "site", numIndsVar = "n", "id") dc
+d1 <- defData(varname = "n", formula = "c(2, 3, 4, 3)", dist = "nonrandom")
+d1 <- defData(d1, varname = "lambda", formula = "c(6, 7, 9, 8)", dist = "nonrandom")
+
+ds <- genData(4, d1, id = "site")
+dc <- genCluster(dtClust = ds, cLevelVar = "site", numIndsVar = "n", "id")
Now, we can generate data using the correlation matrix
RC:
-<- addCorGen(dc, idvar = "site", param1 = "lambda", corMatrix = RC,
- dd dist = "poisson", cnames = "y", method = "copula")
-
- dd
+dd <- addCorGen(dc, idvar = "site", param1 = "lambda", corMatrix = RC,
+ dist = "poisson", cnames = "y", method = "copula")
+
+dd
## site n lambda id y
## 1: 1 2 6 1 11
## 2: 1 2 6 2 7
@@ -676,19 +676,19 @@ Cluster-specific correlation matrices
If we want to confirm that everything is working as expected, we can
recover the overall correlation matrix by generating a large number of
data sets (in this case 5000):
-<- function(R, dc) {
- replicate <- lapply(1:5000, function(x)
- reps addCorGen(dc, idvar = "site", param1 = "lambda", corMatrix = R,
- dist = "poisson", cnames = "y", method = "copula")
-
- )
-<- data.table::rbindlist(reps, idcol = "rep")
- drep := 1:.N, keyby = rep]
- drep[, seq <- as.matrix(dcast(drep, rep ~ seq, value.var = "y")[, -1])
- dmat round(cor(dmat), 1)
-
- }
-replicate(R = RC, dc = dc)
+replicate <- function(R, dc) {
+ reps <- lapply(1:5000, function(x)
+ addCorGen(dc, idvar = "site", param1 = "lambda", corMatrix = R,
+ dist = "poisson", cnames = "y", method = "copula")
+ )
+
+ drep <- data.table::rbindlist(reps, idcol = "rep")
+ drep[, seq := 1:.N, keyby = rep]
+ dmat <- as.matrix(dcast(drep, rep ~ seq, value.var = "y")[, -1])
+ round(cor(dmat), 1)
+}
+
+replicate(R = RC, dc = dc)
It seems to have worked quite well - the empirical matrix matches the
hypothetical matrix above. In the next post, I’ll describe how block
matrices for different clusters over different time periods can also be
@@ -700,21 +700,21 @@
More elaborate example
coefficients) themselves are randomly generated. By providing this
flexibility, we induce extra variability in the data generation
process.
-<- defData(varname = "n", formula = 20, dist = "noZeroPoisson")
- d1 <- defData(d1, varname = "mu", formula = 10, variance = 8, dist = "normal")
- d1 <- defData(d1, varname = "s2", formula = 4, dist = "nonrandom")
- d1
-<- genData(100, d1, id = "site")
- ds <- genCluster(dtClust = ds, cLevelVar = "site", numIndsVar = "n", "id")
- dc
-<- dc[, .N, keyby = site][, N]
- n <- length(n)
- nsites <- rbeta(nsites, 25, 15)
- rho
-<- genCorMat(nvars = n, rho = rho, corstr = "cs", nclusters = nsites) RM
+d1 <- defData(varname = "n", formula = 20, dist = "noZeroPoisson")
+d1 <- defData(d1, varname = "mu", formula = 10, variance = 8, dist = "normal")
+d1 <- defData(d1, varname = "s2", formula = 4, dist = "nonrandom")
+
+ds <- genData(100, d1, id = "site")
+dc <- genCluster(dtClust = ds, cLevelVar = "site", numIndsVar = "n", "id")
+
+n <- dc[, .N, keyby = site][, N]
+nsites <- length(n)
+rho <- rbeta(nsites, 25, 15)
+
+RM <- genCorMat(nvars = n, rho = rho, corstr = "cs", nclusters = nsites)
Here are the first three rows and columns of the correlation matrices
for three clusters, as well as the dimensions for each matrix.
-lapply(RM[c(1, 38, 97)], function(x) x[1:3, 1:3])
+
## $`1`
## [,1] [,2] [,3]
## [1,] 1.0000000 0.6155452 0.6155452
@@ -732,7 +732,7 @@ More elaborate example
## [1,] 1.0000000 0.6292113 0.6292113
## [2,] 0.6292113 1.0000000 0.6292113
## [3,] 0.6292113 0.6292113 1.0000000
-lapply(RM[c(1, 38, 97)], function(x) dim(x))
+
## $`1`
## [1] 20 20
##
@@ -742,10 +742,10 @@ More elaborate example
## $`97`
## [1] 24 24
And here is how we generate the data
-<- addCorGen(dc, idvar = "site", param1 = "mu", param2 = "s2",
- dd corMatrix = RM, dist = "normal", cnames = "y", method = "copula")
-
- dd
+dd <- addCorGen(dc, idvar = "site", param1 = "mu", param2 = "s2",
+ corMatrix = RM, dist = "normal", cnames = "y", method = "copula")
+
+dd
## site n mu s2 id y
## 1: 1 20 6.141913 4 1 5.104369
## 2: 1 20 6.141913 4 2 7.747311
@@ -1138,13 +1138,13 @@ Generating block matrices and simulating data
Cross-sectional data with exchangeable correlation
In the first example, we specify \(\rho_w =
0.5\) and \(\rho_b = 0.3\):
-library(simstudy)
-library(data.table)
-
-<- blockExchangeMat(ninds = 2 , nperiods = 3, rho_w = 0.5,
- R_XE rho_b = 0.3, pattern = "xsection")
-
- R_XE
+library(simstudy)
+library(data.table)
+
+R_XE <- blockExchangeMat(ninds = 2 , nperiods = 3, rho_w = 0.5,
+ rho_b = 0.3, pattern = "xsection")
+
+R_XE
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1.0 0.5 0.3 0.3 0.3 0.3
## [2,] 0.5 1.0 0.3 0.3 0.3 0.3
@@ -1157,10 +1157,10 @@ Cross-sectional data with exchangeable correlation
effectively generating 5000 data sets with 6 observations each, all
based on a Poisson distribution with mean = 7. I then report the
empirical correlation matrix.
-<- genCorGen(n = 5000, nvars = 6, corMatrix = R_XE,
- dd dist = "poisson", params1 = 7, wide = TRUE)
-
-round(cor(as.matrix(dd[, -1])), 2)
+dd <- genCorGen(n = 5000, nvars = 6, corMatrix = R_XE,
+ dist = "poisson", params1 = 7, wide = TRUE)
+
+round(cor(as.matrix(dd[, -1])), 2)
## V1 V2 V3 V4 V5 V6
## V1 1.00 0.48 0.29 0.29 0.29 0.30
## V2 0.48 1.00 0.25 0.28 0.29 0.28
@@ -1173,10 +1173,10 @@ Cross-sectional data with exchangeable correlation
Cross-sectional data with correlation decay
Here, there is a decay parameter \(r =
0.8\) and no parameter \(\rho_b\).
-<- blockDecayMat(ninds = 2 , nperiods = 3, rho_w = 0.5,
- R_XD r = 0.8, pattern = "xsection")
-
- R_XD
+R_XD <- blockDecayMat(ninds = 2 , nperiods = 3, rho_w = 0.5,
+ r = 0.8, pattern = "xsection")
+
+R_XD
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1.00 0.50 0.4 0.4 0.32 0.32
## [2,] 0.50 1.00 0.4 0.4 0.32 0.32
@@ -1184,8 +1184,8 @@ Cross-sectional data with correlation decay
## [4,] 0.40 0.40 0.5 1.0 0.40 0.40
## [5,] 0.32 0.32 0.4 0.4 1.00 0.50
## [6,] 0.32 0.32 0.4 0.4 0.50 1.00
-<- genCorGen(n = 5000, nvars = 6, corMatrix = R_XD,
- dd dist = "poisson", params1 = 7, wide = TRUE)
+dd <- genCorGen(n = 5000, nvars = 6, corMatrix = R_XD,
+ dist = "poisson", params1 = 7, wide = TRUE)
## V1 V2 V3 V4 V5 V6
## V1 1.00 0.51 0.38 0.41 0.31 0.31
## V2 0.51 1.00 0.38 0.39 0.30 0.30
@@ -1197,10 +1197,10 @@ Cross-sectional data with correlation decay
Cohort data with exchangeable correlation
Since we have a cohort, we introduce \(\rho_a\) = 0.4, and specify \(pattern = \text{"cohort"}\):
-<- blockExchangeMat(ninds = 2 , nperiods = 3, rho_w = 0.5,
- R_CE rho_b = 0.3, rho_a = 0.4, pattern = "cohort")
-
- R_CE
+R_CE <- blockExchangeMat(ninds = 2 , nperiods = 3, rho_w = 0.5,
+ rho_b = 0.3, rho_a = 0.4, pattern = "cohort")
+
+R_CE
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1.0 0.5 0.4 0.3 0.4 0.3
## [2,] 0.5 1.0 0.3 0.4 0.3 0.4
@@ -1208,8 +1208,8 @@ Cohort data with exchangeable correlation
## [4,] 0.3 0.4 0.5 1.0 0.3 0.4
## [5,] 0.4 0.3 0.4 0.3 1.0 0.5
## [6,] 0.3 0.4 0.3 0.4 0.5 1.0
-<- genCorGen(n = 5000, nvars = 6, corMatrix = R_CE,
- dd dist = "poisson", params1 = 7, wide = TRUE)
+dd <- genCorGen(n = 5000, nvars = 6, corMatrix = R_CE,
+ dist = "poisson", params1 = 7, wide = TRUE)
## V1 V2 V3 V4 V5 V6
## V1 1.00 0.49 0.39 0.27 0.39 0.29
## V2 0.49 1.00 0.29 0.40 0.30 0.41
@@ -1224,10 +1224,10 @@ Cohort data with correlation decay
a cohort is the same as a decay in the case of a cross sectional design;
the only difference that we set \(pattern =
\text{"cohort"}\):
-<- blockDecayMat(ninds = 2 , nperiods = 3, rho_w = 0.5,
- R_CD r = 0.8, pattern = "cohort")
-
- R_CD
+
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1.00 0.50 0.8 0.4 0.64 0.32
## [2,] 0.50 1.00 0.4 0.8 0.32 0.64
@@ -1235,8 +1235,8 @@ Cohort data with correlation decay
## [4,] 0.40 0.80 0.5 1.0 0.40 0.80
## [5,] 0.64 0.32 0.8 0.4 1.00 0.50
## [6,] 0.32 0.64 0.4 0.8 0.50 1.00
-<- genCorGen(n = 5000, nvars = 6, corMatrix = R_CD,
- dd dist = "poisson", params1 = 7, wide = TRUE)
+dd <- genCorGen(n = 5000, nvars = 6, corMatrix = R_CD,
+ dist = "poisson", params1 = 7, wide = TRUE)
## V1 V2 V3 V4 V5 V6
## V1 1.00 0.47 0.79 0.38 0.63 0.30
## V2 0.47 1.00 0.38 0.78 0.30 0.62
@@ -1259,96 +1259,94 @@ Varying correlation matrices by cluster
beta distribution with shape parameters 6 and 2). The parameter
\(\rho_w\) is constant across all
clusters and is 0.6
-<- defData(varname = "lambda", formula = "sample(5:10, 1)", dist = "nonrandom")
- defC <- defDataAdd(varname = "n", formula = "2;4", dist="uniformInt")
- defP
-<- genData(n = 10, dtDefs = defC, id = "site")
- dc <- addPeriods(dtName = dc, nPeriods = 3,
- dc idvars = "site", perName = "period")
- <- addColumns(defP, dc)
- dc
-<- genCluster(dtClust = dc, cLevelVar = "timeID",
- dd numIndsVar = "n", level1ID = "id")
+defC <- defData(varname = "lambda", formula = "sample(5:10, 1)", dist = "nonrandom")
+defP <- defDataAdd(varname = "n", formula = "2;4", dist="uniformInt")
+
+dc <- genData(n = 10, dtDefs = defC, id = "site")
+dc <- addPeriods(dtName = dc, nPeriods = 3,
+ idvars = "site", perName = "period")
+dc <- addColumns(defP, dc)
+
+dd <- genCluster(dtClust = dc, cLevelVar = "timeID",
+ numIndsVar = "n", level1ID = "id")
In this example, the 10 clusters will have varying numbers of
observations per period. Here are the counts for three sites:
-%in% c(1, 3, 7), .(site, period, n)] dc[site
+
## site period n
## 1: 1 0 4
## 2: 1 1 2
-## 3: 1 2 2
+## 3: 1 2 3
## 4: 3 0 3
-## 5: 3 1 2
+## 5: 3 1 3
## 6: 3 2 3
-## 7: 7 0 4
-## 8: 7 1 4
-## 9: 7 2 4
+## 7: 7 0 3
+## 8: 7 1 2
+## 9: 7 2 3
The sites will also have unique decay rates:
-<- round(rbeta(10, 6, 2), 2)
- r c(1, 3, 7)] r[
-## [1] 0.76 0.94 0.87
+
+## [1] 0.66 0.85 0.81
Here are the correlation matrices for these three sites:
-<- dd[, .N, keyby = .(site, period)][, N]
- N
-<- blockDecayMat(ninds = N , nperiods = 3, rho_w = 0.6, r = r, nclusters = 10)
- R
-lapply(R, function(x) round(x,2))[c(1, 3, 7)]
+N <- dd[, .N, keyby = .(site, period)][, N]
+
+R <- blockDecayMat(ninds = N , nperiods = 3, rho_w = 0.6, r = r, nclusters = 10)
+
+lapply(R, function(x) round(x,2))[c(1, 3, 7)]
## [[1]]
-## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
-## [1,] 1.00 0.60 0.60 0.60 0.46 0.46 0.35 0.35
-## [2,] 0.60 1.00 0.60 0.60 0.46 0.46 0.35 0.35
-## [3,] 0.60 0.60 1.00 0.60 0.46 0.46 0.35 0.35
-## [4,] 0.60 0.60 0.60 1.00 0.46 0.46 0.35 0.35
-## [5,] 0.46 0.46 0.46 0.46 1.00 0.60 0.46 0.46
-## [6,] 0.46 0.46 0.46 0.46 0.60 1.00 0.46 0.46
-## [7,] 0.35 0.35 0.35 0.35 0.46 0.46 1.00 0.60
-## [8,] 0.35 0.35 0.35 0.35 0.46 0.46 0.60 1.00
+## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
+## [1,] 1.00 0.60 0.60 0.60 0.4 0.4 0.26 0.26 0.26
+## [2,] 0.60 1.00 0.60 0.60 0.4 0.4 0.26 0.26 0.26
+## [3,] 0.60 0.60 1.00 0.60 0.4 0.4 0.26 0.26 0.26
+## [4,] 0.60 0.60 0.60 1.00 0.4 0.4 0.26 0.26 0.26
+## [5,] 0.40 0.40 0.40 0.40 1.0 0.6 0.40 0.40 0.40
+## [6,] 0.40 0.40 0.40 0.40 0.6 1.0 0.40 0.40 0.40
+## [7,] 0.26 0.26 0.26 0.26 0.4 0.4 1.00 0.60 0.60
+## [8,] 0.26 0.26 0.26 0.26 0.4 0.4 0.60 1.00 0.60
+## [9,] 0.26 0.26 0.26 0.26 0.4 0.4 0.60 0.60 1.00
##
## [[2]]
-## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
-## [1,] 1.00 0.60 0.60 0.56 0.56 0.53 0.53 0.53
-## [2,] 0.60 1.00 0.60 0.56 0.56 0.53 0.53 0.53
-## [3,] 0.60 0.60 1.00 0.56 0.56 0.53 0.53 0.53
-## [4,] 0.56 0.56 0.56 1.00 0.60 0.56 0.56 0.56
-## [5,] 0.56 0.56 0.56 0.60 1.00 0.56 0.56 0.56
-## [6,] 0.53 0.53 0.53 0.56 0.56 1.00 0.60 0.60
-## [7,] 0.53 0.53 0.53 0.56 0.56 0.60 1.00 0.60
-## [8,] 0.53 0.53 0.53 0.56 0.56 0.60 0.60 1.00
+## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
+## [1,] 1.00 0.60 0.60 0.51 0.51 0.51 0.43 0.43 0.43
+## [2,] 0.60 1.00 0.60 0.51 0.51 0.51 0.43 0.43 0.43
+## [3,] 0.60 0.60 1.00 0.51 0.51 0.51 0.43 0.43 0.43
+## [4,] 0.51 0.51 0.51 1.00 0.60 0.60 0.51 0.51 0.51
+## [5,] 0.51 0.51 0.51 0.60 1.00 0.60 0.51 0.51 0.51
+## [6,] 0.51 0.51 0.51 0.60 0.60 1.00 0.51 0.51 0.51
+## [7,] 0.43 0.43 0.43 0.51 0.51 0.51 1.00 0.60 0.60
+## [8,] 0.43 0.43 0.43 0.51 0.51 0.51 0.60 1.00 0.60
+## [9,] 0.43 0.43 0.43 0.51 0.51 0.51 0.60 0.60 1.00
##
## [[3]]
-## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
-## [1,] 1.00 0.60 0.60 0.60 0.52 0.52 0.52 0.52 0.45 0.45 0.45 0.45
-## [2,] 0.60 1.00 0.60 0.60 0.52 0.52 0.52 0.52 0.45 0.45 0.45 0.45
-## [3,] 0.60 0.60 1.00 0.60 0.52 0.52 0.52 0.52 0.45 0.45 0.45 0.45
-## [4,] 0.60 0.60 0.60 1.00 0.52 0.52 0.52 0.52 0.45 0.45 0.45 0.45
-## [5,] 0.52 0.52 0.52 0.52 1.00 0.60 0.60 0.60 0.52 0.52 0.52 0.52
-## [6,] 0.52 0.52 0.52 0.52 0.60 1.00 0.60 0.60 0.52 0.52 0.52 0.52
-## [7,] 0.52 0.52 0.52 0.52 0.60 0.60 1.00 0.60 0.52 0.52 0.52 0.52
-## [8,] 0.52 0.52 0.52 0.52 0.60 0.60 0.60 1.00 0.52 0.52 0.52 0.52
-## [9,] 0.45 0.45 0.45 0.45 0.52 0.52 0.52 0.52 1.00 0.60 0.60 0.60
-## [10,] 0.45 0.45 0.45 0.45 0.52 0.52 0.52 0.52 0.60 1.00 0.60 0.60
-## [11,] 0.45 0.45 0.45 0.45 0.52 0.52 0.52 0.52 0.60 0.60 1.00 0.60
-## [12,] 0.45 0.45 0.45 0.45 0.52 0.52 0.52 0.52 0.60 0.60 0.60 1.00
+## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
+## [1,] 1.00 0.60 0.60 0.49 0.49 0.39 0.39 0.39
+## [2,] 0.60 1.00 0.60 0.49 0.49 0.39 0.39 0.39
+## [3,] 0.60 0.60 1.00 0.49 0.49 0.39 0.39 0.39
+## [4,] 0.49 0.49 0.49 1.00 0.60 0.49 0.49 0.49
+## [5,] 0.49 0.49 0.49 0.60 1.00 0.49 0.49 0.49
+## [6,] 0.39 0.39 0.39 0.49 0.49 1.00 0.60 0.60
+## [7,] 0.39 0.39 0.39 0.49 0.49 0.60 1.00 0.60
+## [8,] 0.39 0.39 0.39 0.49 0.49 0.60 0.60 1.00
And here is code to generate the empirical correlation matrices for
the three sites, based on 5000 replications of the data:
-<- lapply(1:5000,
- reps function(x) addCorGen(dd, idvar = "site", corMatrix = R,
- dist = "poisson", param1 = "lambda", cnames = "y")
-
- )
-<- data.table::rbindlist(reps, idcol = "rep")
- drep
-<- function(cluster) {
- empir_corr <- drep[site == cluster, ]
- dcrep := 1:.N, keyby = rep]
- dcrep[, seq <- as.matrix(dcast(dcrep, rep ~ seq, value.var = "y")[, -1])
- dmat
- return(round(cor(dmat), 2))
-
- }
-
-empir_corr(cluster = 1)
-empir_corr(cluster = 3)
-empir_corr(cluster = 7)
+reps <- lapply(1:5000,
+ function(x) addCorGen(dd, idvar = "site", corMatrix = R,
+ dist = "poisson", param1 = "lambda", cnames = "y")
+)
+
+drep <- data.table::rbindlist(reps, idcol = "rep")
+
+empir_corr <- function(cluster) {
+ dcrep <- drep[site == cluster, ]
+ dcrep[, seq := 1:.N, keyby = rep]
+ dmat <- as.matrix(dcast(dcrep, rep ~ seq, value.var = "y")[, -1])
+
+ return(round(cor(dmat), 2))
+}
+
+
+empir_corr(cluster = 1)
+empir_corr(cluster = 3)
+empir_corr(cluster = 7)
Reference:
Li, Fan, James P. Hughes, Karla Hemming, Monica Taljaard, Edward R.
diff --git a/inst/doc/correlated.R b/inst/doc/correlated.R
index 7bcd2ad..c45a4fd 100644
--- a/inst/doc/correlated.R
+++ b/inst/doc/correlated.R
@@ -1,4 +1,7 @@
-## ---- echo = FALSE, message = FALSE-------------------------------------------
+## ----chunkname, echo=-1-------------------------------------------------------
+data.table::setDTthreads(2)
+
+## ----echo = FALSE, message = FALSE--------------------------------------------
library(simstudy)
library(ggplot2)
library(scales)
@@ -28,7 +31,7 @@ ggtheme <- function(panelback = "white") {
}
-## ---- tidy=TRUE---------------------------------------------------------------
+## ----tidy=TRUE----------------------------------------------------------------
# specifying a specific correlation matrix C
C <- matrix(c(1,.7,.2, .7, 1, .8, .2, .8, 1),nrow = 3)
C
@@ -45,7 +48,7 @@ dt[,round(cor(cbind(V1, V2, V3)),1)]
# estimate standard deviation
dt[,round(sqrt(diag(var(cbind(V1, V2, V3)))),1)]
-## ---- tidy=TRUE---------------------------------------------------------------
+## ----tidy=TRUE----------------------------------------------------------------
# generate 3 correlated variables with different location but same standard deviation
# and compound symmetry (cs) correlation matrix with correlation coefficient = 0.4.
# Other correlation matrix structures are "independent" ("ind") and "auto-regressive" ("ar1").
@@ -60,7 +63,7 @@ dt[,round(cor(cbind(x0, x1, x2)),1)]
# estimate standard deviation
dt[,round(sqrt(diag(var(cbind(x0, x1, x2)))),1)]
-## ---- tidy = TRUE-------------------------------------------------------------
+## ----tidy = TRUE--------------------------------------------------------------
# define and generate the original data set
def <- defData(varname = "x", dist = "normal", formula = 0, variance = 1, id = "cid")
diff --git a/inst/doc/correlated.Rmd b/inst/doc/correlated.Rmd
index 9fd6ffe..5e0b48e 100644
--- a/inst/doc/correlated.Rmd
+++ b/inst/doc/correlated.Rmd
@@ -7,6 +7,10 @@ vignette: >
\usepackage[utf8]{inputenc}
---
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
+
```{r, echo = FALSE, message = FALSE}
library(simstudy)
library(ggplot2)
diff --git a/inst/doc/correlated.html b/inst/doc/correlated.html
index 12d76bb..9bad485 100644
--- a/inst/doc/correlated.html
+++ b/inst/doc/correlated.html
@@ -353,19 +353,19 @@
Correlated Data
corMatrix
or a correlation coefficient rho
and
a correlation structure corsrt
. Here are a few
examples:
-# specifying a specific correlation matrix C
-<- matrix(c(1, 0.7, 0.2, 0.7, 1, 0.8, 0.2, 0.8, 1), nrow = 3)
- C C
+# specifying a specific correlation matrix C
+C <- matrix(c(1, 0.7, 0.2, 0.7, 1, 0.8, 0.2, 0.8, 1), nrow = 3)
+C
## [,1] [,2] [,3]
## [1,] 1.0 0.7 0.2
## [2,] 0.7 1.0 0.8
## [3,] 0.2 0.8 1.0
-set.seed(282726)
-
-# generate 3 correlated variables with different location and scale for each
-# field
-<- genCorData(1000, mu = c(4, 12, 3), sigma = c(1, 2, 3), corMatrix = C)
- dt dt
+set.seed(282726)
+
+# generate 3 correlated variables with different location and scale for each
+# field
+dt <- genCorData(1000, mu = c(4, 12, 3), sigma = c(1, 2, 3), corMatrix = C)
+dt
## id V1 V2 V3
## 1: 1 4.125728 12.92567 3.328106
## 2: 2 4.712100 14.26502 8.876664
@@ -378,24 +378,24 @@ Correlated Data
## 998: 998 3.856643 13.17697 4.720628
## 999: 999 4.738479 12.64438 2.979415
## 1000: 1000 5.766867 13.51827 1.693172
-# estimate correlation matrix
-round(cor(cbind(V1, V2, V3)), 1)] dt[,
+
## V1 V2 V3
## V1 1.0 0.7 0.2
## V2 0.7 1.0 0.8
## V3 0.2 0.8 1.0
-# estimate standard deviation
-round(sqrt(diag(var(cbind(V1, V2, V3)))), 1)] dt[,
+
## V1 V2 V3
## 0.9 1.9 3.0
-# generate 3 correlated variables with different location but same standard
-# deviation and compound symmetry (cs) correlation matrix with correlation
-# coefficient = 0.4. Other correlation matrix structures are 'independent'
-# ('ind') and 'auto-regressive' ('ar1').
-
-<- genCorData(1000, mu = c(4, 12, 3), sigma = 3, rho = 0.4, corstr = "cs", cnames = c("x0",
- dt "x1", "x2"))
- dt
+# generate 3 correlated variables with different location but same standard
+# deviation and compound symmetry (cs) correlation matrix with correlation
+# coefficient = 0.4. Other correlation matrix structures are 'independent'
+# ('ind') and 'auto-regressive' ('ar1').
+
+dt <- genCorData(1000, mu = c(4, 12, 3), sigma = 3, rho = 0.4, corstr = "cs", cnames = c("x0",
+ "x1", "x2"))
+dt
## id x0 x1 x2
## 1: 1 7.1160161 14.294748 4.0251237
## 2: 2 3.5429823 8.299333 4.5620657
@@ -408,28 +408,28 @@ Correlated Data
## 998: 998 3.3079075 11.909745 -0.7375013
## 999: 999 3.7934154 10.515881 2.6021325
## 1000: 1000 5.6413141 13.513672 7.5321371
-# estimate correlation matrix
-round(cor(cbind(x0, x1, x2)), 1)] dt[,
+
## x0 x1 x2
## x0 1.0 0.5 0.4
## x1 0.5 1.0 0.4
## x2 0.4 0.4 1.0
-# estimate standard deviation
-round(sqrt(diag(var(cbind(x0, x1, x2)))), 1)] dt[,
+
## x0 x1 x2
## 2.9 3.0 3.0
The new data generated by genCorData
can be merged with
an existing data set. Alternatively, addCorData
will do
this directly:
-# define and generate the original data set
-<- defData(varname = "x", dist = "normal", formula = 0, variance = 1, id = "cid")
- def <- genData(1000, def)
- dt
-# add new correlate fields a0 and a1 to 'dt'
-<- addCorData(dt, idname = "cid", mu = c(0, 0), sigma = c(2, 0.2), rho = -0.2,
- dt corstr = "cs", cnames = c("a0", "a1"))
-
- dt
+# define and generate the original data set
+def <- defData(varname = "x", dist = "normal", formula = 0, variance = 1, id = "cid")
+dt <- genData(1000, def)
+
+# add new correlate fields a0 and a1 to 'dt'
+dt <- addCorData(dt, idname = "cid", mu = c(0, 0), sigma = c(2, 0.2), rho = -0.2,
+ corstr = "cs", cnames = c("a0", "a1"))
+
+dt
## cid x a0 a1
## 1: 1 -0.4707940 0.97711194 -0.09127123
## 2: 2 -1.8723668 2.70498417 -0.27102780
@@ -442,13 +442,13 @@ Correlated Data
## 998: 998 0.1015744 -0.09821567 0.06440723
## 999: 999 -0.5788317 0.16870967 -0.03890117
## 1000: 1000 -1.6175613 3.61182553 -0.46220263
-# estimate correlation matrix
-round(cor(cbind(a0, a1)), 1)] dt[,
+
## a0 a1
## a0 1.0 -0.2
## a1 -0.2 1.0
-# estimate standard deviation
-round(sqrt(diag(var(cbind(a0, a1)))), 1)] dt[,
+
## a0 a1
## 2.0 0.2
diff --git a/inst/doc/double_dot_extension.R b/inst/doc/double_dot_extension.R
index 4c7f763..8b8fcf7 100644
--- a/inst/doc/double_dot_extension.R
+++ b/inst/doc/double_dot_extension.R
@@ -1,4 +1,7 @@
-## ---- echo = FALSE, message = FALSE-------------------------------------------
+## ----chunkname, echo=-1-------------------------------------------------------
+data.table::setDTthreads(2)
+
+## ----echo = FALSE, message = FALSE--------------------------------------------
options(digits = 3)
library(simstudy)
@@ -81,7 +84,7 @@ fit <- summary(lm(y ~ x, data = dd))
coef(fit)
fit$sigma
-## ---- fig.width = 5-----------------------------------------------------------
+## ----fig.width = 5------------------------------------------------------------
sigma2s <- c(1, 2, 6, 9)
gen_data <- function(sigma2, d) {
@@ -138,20 +141,20 @@ d1 <- defData(d1, varname = "b", formula = ".5;.5", variance = "1;2", dist = "ca
d1 <- defData(d1, varname = "outcome", formula = "..effect[a, b]", dist="nonrandom")
## -----------------------------------------------------------------------------
-dx <- genData(8, d1)
+dx <- genData(1000, d1)
dx
## -----------------------------------------------------------------------------
d1 <- updateDef(d1, "outcome", newvariance = 9, newdist = "normal")
dx <- genData(1000, d1)
-## ---- echo=FALSE--------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
dsum <- dx[, .(outcome=mean(outcome)), keyby = .(a, b)]
ggplot(data = dx, aes(x = factor(a), y = outcome)) +
geom_jitter(aes(color = factor(b)), width = .2, alpha = .4, size = .2) +
geom_point(data = dsum, size = 2, aes(color = factor(b))) +
- geom_line(data = dsum, size = 1, aes(color = factor(b), group = factor(b))) +
+ geom_line(data = dsum, linewidth = 1, aes(color = factor(b), group = factor(b))) +
scale_color_manual(values = cbbPalette, name = " b") +
theme(panel.grid = element_blank()) +
xlab ("a")
diff --git a/inst/doc/double_dot_extension.Rmd b/inst/doc/double_dot_extension.Rmd
index 32a1c82..0ce7df5 100644
--- a/inst/doc/double_dot_extension.Rmd
+++ b/inst/doc/double_dot_extension.Rmd
@@ -7,6 +7,10 @@ vignette: >
\usepackage[utf8]{inputenc}
---
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
+
```{r, echo = FALSE, message = FALSE}
options(digits = 3)
@@ -196,7 +200,7 @@ d1 <- defData(d1, varname = "outcome", formula = "..effect[a, b]", dist="nonrand
```
```{r}
-dx <- genData(8, d1)
+dx <- genData(1000, d1)
dx
```
@@ -215,7 +219,7 @@ dsum <- dx[, .(outcome=mean(outcome)), keyby = .(a, b)]
ggplot(data = dx, aes(x = factor(a), y = outcome)) +
geom_jitter(aes(color = factor(b)), width = .2, alpha = .4, size = .2) +
geom_point(data = dsum, size = 2, aes(color = factor(b))) +
- geom_line(data = dsum, size = 1, aes(color = factor(b), group = factor(b))) +
+ geom_line(data = dsum, linewidth = 1, aes(color = factor(b), group = factor(b))) +
scale_color_manual(values = cbbPalette, name = " b") +
theme(panel.grid = element_blank()) +
xlab ("a")
diff --git a/inst/doc/double_dot_extension.html b/inst/doc/double_dot_extension.html
index f0c4ca6..6c4f362 100644
--- a/inst/doc/double_dot_extension.html
+++ b/inst/doc/double_dot_extension.html
@@ -360,19 +360,19 @@ Updating existing definition tables
The original data set definition includes three variables
x
, y
, and z
, all normally
distributed:
-<- defData(varname = "x", formula = 0, variance = 3, dist = "normal")
- defs <- defData(defs, varname = "y", formula = "2 + 3*x", variance = 1, dist = "normal")
- defs <- defData(defs, varname = "z", formula = "4 + 3*x - 2*y", variance = 1, dist = "normal")
- defs
- defs
+defs <- defData(varname = "x", formula = 0, variance = 3, dist = "normal")
+defs <- defData(defs, varname = "y", formula = "2 + 3*x", variance = 1, dist = "normal")
+defs <- defData(defs, varname = "z", formula = "4 + 3*x - 2*y", variance = 1, dist = "normal")
+
+defs
## varname formula variance dist link
## 1: x 0 3 normal identity
## 2: y 2 + 3*x 1 normal identity
## 3: z 4 + 3*x - 2*y 1 normal identity
In the first case, we are changing the relationship of y
with x
as well as the variance:
-<- updateDef(dtDefs = defs, changevar = "y", newformula = "x + 5", newvariance = 2)
- defs defs
+
## varname formula variance dist link
## 1: x 0 3 normal identity
## 2: y x + 5 2 normal identity
@@ -380,8 +380,8 @@ Updating existing definition tables
In this second case, we are changing the distribution of
z
to Poisson and updating the link function to
log:
-<- updateDef(dtDefs = defs, changevar = "z", newdist = "poisson", newlink = "log")
- defs defs
+
## varname formula variance dist link
## 1: x 0 3 normal identity
## 2: y x + 5 2 normal identity
@@ -391,8 +391,8 @@ Updating existing definition tables
defData
that it is not possible to remove a variable that
is a predictor of a subsequent variable, such as x
or
y
in this case.
-<- updateDef(dtDefs = defs, changevar = "z", remove = TRUE)
- defs defs
+
## varname formula variance dist link
## 1: x 0 3 normal identity
## 2: y x + 5 2 normal identity
@@ -405,63 +405,63 @@ Double-dot external variable reference
of hyperparameter of the data generation process. The reference is made
directly in the formula itself, using a double-dot (“..”) notation
before the variable name. Here is a simple example:
-<- defData(varname = "x", formula = 0,
- def variance = 5, dist = "normal")
- <- defData(def, varname = "y", formula = "..B0 + ..B1 * x",
- def variance = "..sigma2", dist = "normal")
-
- def
+def <- defData(varname = "x", formula = 0,
+ variance = 5, dist = "normal")
+def <- defData(def, varname = "y", formula = "..B0 + ..B1 * x",
+ variance = "..sigma2", dist = "normal")
+
+def
## varname formula variance dist link
## 1: x 0 5 normal identity
## 2: y ..B0 + ..B1 * x ..sigma2 normal identity
-<- 4;
- B0 <- 2;
- B1 <- 9
- sigma2
-set.seed(716251)
-
-<- genData(100, def)
- dd
-<- summary(lm(y ~ x, data = dd))
- fit
-coef(fit)
+B0 <- 4;
+B1 <- 2;
+sigma2 <- 9
+
+set.seed(716251)
+
+dd <- genData(100, def)
+
+fit <- summary(lm(y ~ x, data = dd))
+
+coef(fit)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.00 0.284 14.1 2.56e-25
## x 2.01 0.130 15.4 5.90e-28
-$sigma fit
+
## [1] 2.83
It is easy to create a new data set on the fly with a difference
variance assumption without having to go to the trouble of updating the
data definitions.
-<- 16
- sigma2
-<- genData(100, def)
- dd <- summary(lm(y ~ x, data = dd))
- fit
-coef(fit)
+
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.35 0.427 10.19 4.57e-17
## x 2.12 0.218 9.75 4.32e-16
-$sigma fit
+
## [1] 4.21
The double-dot notation can be flexibly applied using
lapply
(or the parallel version mclapply
) to
create a range of data sets under different assumptions:
-<- c(1, 2, 6, 9)
- sigma2s
-<- function(sigma2, d) {
- gen_data <- genData(200, d)
- dd $sigma2 <- sigma2
- dd
- dd
- }
-<- lapply(sigma2s, function(s) gen_data(s, def))
- dd_4 <- rbindlist(dd_4)
- dd_4
-ggplot(data = dd_4, aes(x = x, y = y)) +
-geom_point(size = .5, color = "grey30") +
- facet_wrap(sigma2 ~ .) +
- theme(panel.grid = element_blank())
+sigma2s <- c(1, 2, 6, 9)
+
+gen_data <- function(sigma2, d) {
+ dd <- genData(200, d)
+ dd$sigma2 <- sigma2
+ dd
+}
+
+dd_4 <- lapply(sigma2s, function(s) gen_data(s, def))
+dd_4 <- rbindlist(dd_4)
+
+ggplot(data = dd_4, aes(x = x, y = y)) +
+ geom_point(size = .5, color = "grey30") +
+ facet_wrap(sigma2 ~ .) +
+ theme(panel.grid = element_blank())
@@ -471,14 +471,14 @@ Using non-scalar double-dot variable reference
(which we can also do using a categorical distribution), we can
define the mixture formula in terms of the vector. In this case we are
generating permuted block sizes of 2 and 4:
-<- defData(varname = "blksize",
- defblk formula = "..sizes[1] | .5 + ..sizes[2] | .5", dist = "mixture")
-
- defblk
+defblk <- defData(varname = "blksize",
+ formula = "..sizes[1] | .5 + ..sizes[2] | .5", dist = "mixture")
+
+defblk
## varname formula variance dist link
## 1: blksize ..sizes[1] | .5 + ..sizes[2] | .5 0 mixture identity
-<- c(2, 4)
- sizes genData(1000, defblk)
+
## id blksize
## 1: 1 4
## 2: 2 4
@@ -496,18 +496,18 @@ Using non-scalar double-dot variable reference
weighted average of three numbers using tau as the weights. We
could use the following code to estimate a weighted average
theta:
-<- rgamma(3, 5, 2)
- tau <- tau / sum(tau)
- tau tau
+
## [1] 0.319 0.550 0.132
-<- defData(varname = "a", formula = 3, variance = 4)
- d <- defData(d, varname = "b", formula = 8, variance = 2)
- d <- defData(d, varname = "c", formula = 11, variance = 6)
- d <- defData(d, varname = "theta", formula = "..tau[1]*a + ..tau[2]*b + ..tau[3]*c",
- d dist = "nonrandom")
-
-set.seed(1)
-genData(4, d)
+d <- defData(varname = "a", formula = 3, variance = 4)
+d <- defData(d, varname = "b", formula = 8, variance = 2)
+d <- defData(d, varname = "c", formula = 11, variance = 6)
+d <- defData(d, varname = "theta", formula = "..tau[1]*a + ..tau[2]*b + ..tau[3]*c",
+ dist = "nonrandom")
+
+set.seed(1)
+genData(4, d)
## id a b c theta
## 1: 1 1.75 8.47 12.4 6.84
## 2: 2 3.37 6.84 10.3 6.18
@@ -515,10 +515,10 @@ Using non-scalar double-dot variable reference
## 4: 4 6.19 9.04 12.0 8.52
We can simplify the calculation of theta by using matrix
multiplication:
-<- updateDef(d, changevar = "theta", newformula = "t(..tau) %*% c(a, b, c)")
- d
-set.seed(1)
-genData(4, d)
+d <- updateDef(d, changevar = "theta", newformula = "t(..tau) %*% c(a, b, c)")
+
+set.seed(1)
+genData(4, d)
## id a b c theta
## 1: 1 1.75 8.47 12.4 6.84
## 2: 2 3.37 6.84 10.3 6.18
@@ -530,39 +530,37 @@ Using non-scalar double-dot variable reference
interventions \(a\) and \(b\), we can use a simple matrix and draw
the means directly from the matrix, which in this example is stored in
the variable effect:
-<- matrix(c(0, 4, 5, 7), nrow = 2)
- effect effect
+
## [,1] [,2]
## [1,] 0 5
## [2,] 4 7
Using double dot notation, it is possible to reference the matrix
cell values directly:
-<- defData(varname = "a", formula = ".5;.5", variance = "1;2", dist = "categorical")
- d1 <- defData(d1, varname = "b", formula = ".5;.5", variance = "1;2", dist = "categorical")
- d1 <- defData(d1, varname = "outcome", formula = "..effect[a, b]", dist="nonrandom") d1
-<- genData(8, d1)
- dx dx
-## id a b outcome
-## 1: 1 2 2 7
-## 2: 2 2 2 7
-## 3: 3 2 1 4
-## 4: 4 2 1 4
-## 5: 5 1 1 0
-## 6: 6 2 2 7
-## 7: 7 2 1 4
-## 8: 8 1 2 5
+d1 <- defData(varname = "a", formula = ".5;.5", variance = "1;2", dist = "categorical")
+d1 <- defData(d1, varname = "b", formula = ".5;.5", variance = "1;2", dist = "categorical")
+d1 <- defData(d1, varname = "outcome", formula = "..effect[a, b]", dist="nonrandom")
+
+## id a b outcome
+## 1: 1 2 2 7
+## 2: 2 2 1 4
+## 3: 3 2 1 4
+## 4: 4 2 2 7
+## 5: 5 1 2 5
+## ---
+## 996: 996 2 2 7
+## 997: 997 2 2 7
+## 998: 998 1 1 0
+## 999: 999 2 2 7
+## 1000: 1000 1 2 5
It is possible to generate normally distributed data based on these
means:
-<- updateDef(d1, "outcome", newvariance = 9, newdist = "normal")
- d1 <- genData(1000, d1) dx
+
The plot shows the individual values as well as the mean values by
intervention arm:
-## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
-## ℹ Please use `linewidth` instead.
-## This warning is displayed once every 8 hours.
-## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
-## generated.
-
+
diff --git a/inst/doc/logisticCoefs.R b/inst/doc/logisticCoefs.R
new file mode 100644
index 0000000..b5da08f
--- /dev/null
+++ b/inst/doc/logisticCoefs.R
@@ -0,0 +1,140 @@
+## ----chunkname, echo=-1-------------------------------------------------------
+data.table::setDTthreads(2)
+
+## ----echo = FALSE, message = FALSE--------------------------------------------
+library(simstudy)
+library(ggplot2)
+library(scales)
+library(grid)
+library(gridExtra)
+library(data.table)
+
+options(digits = 2)
+
+## -----------------------------------------------------------------------------
+library(simstudy)
+library(ggplot2)
+library(data.table)
+
+coefs1 <- c(0.15, 0.25, 0.10, 0.30)
+
+d1 <- defData(varname = "x1", formula = 0, variance = 1)
+d1 <- defData(d1, varname = "x2", formula = 0, variance = 1)
+d1 <- defData(d1, varname = "b1", formula = 0.3, dist = "binary")
+d1 <- defData(d1, varname = "b2", formula = 0.7, dist = "binary")
+
+d1a <- defData(d1, varname = "y",
+ formula = "t(..coefs1) %*% c(x1, x2, b1, b2)",
+ dist = "binary", link = "logit")
+
+set.seed(48392)
+
+dd <- genData(500000, d1a)
+dd
+
+## -----------------------------------------------------------------------------
+dd[, mean(y)]
+
+## -----------------------------------------------------------------------------
+d1a <- defData(d1, varname = "y",
+ formula = "t(c(-0.66, ..coefs1)) %*% c(1, x1, x2, b1, b2)",
+ dist = "binary", link = "logit")
+
+genData(500000, d1a)[, mean(y)]
+
+## -----------------------------------------------------------------------------
+coefs2 <- c(0.20, 0.35, 0.20, 0.45)
+
+d2 <- defData(varname = "x1", formula = 1, variance = 1)
+d2 <- defData(d2, varname = "x2", formula = 3, variance = 1)
+d2 <- defData(d2, varname = "b1", formula = 0.5, dist = "binary")
+d2 <- defData(d2, varname = "b2", formula = 0.8, dist = "binary")
+
+d2a <- defData(d2, varname = "y",
+ formula = "t(..coefs2) %*% c(x1, x2, b1, b2)",
+ dist = "binary", link = "logit")
+
+genData(500000, d2a)[, mean(y)]
+
+## -----------------------------------------------------------------------------
+d2a <- defData(d2, varname = "y",
+ formula = "t(c(-2.13, ..coefs2)) %*% c(1, x1, x2, b1, b2)",
+ dist = "binary", link = "logit")
+
+genData(500000, d2a)[, mean(y)]
+
+## -----------------------------------------------------------------------------
+logisticCoefs(defCovar = d1, coefs = coefs1, popPrev = 0.40)
+logisticCoefs(defCovar = d2, coefs = coefs2, popPrev = 0.40)
+
+## -----------------------------------------------------------------------------
+d1a <- defData(d1, varname = "rx", formula = "1;1", dist = "trtAssign")
+d1a <- defData(d1a, varname = "y",
+ formula = "t(c(-0.40, ..coefs1)) %*% c(rx, x1, x2, b1, b2)",
+ dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+dd[rx==1, mean(y)]/dd[rx==0, mean(y)]
+
+## -----------------------------------------------------------------------------
+d2a <- defData(d2, varname = "rx", formula = "1;1", dist = "trtAssign")
+d2a <- defData(d2a, varname = "y",
+ formula = "t(c(-0.40, ..coefs2)) %*% c(rx, x1, x2, b1, b2)",
+ dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d2a)
+dd[rx==1, mean(y)]/dd[rx==0, mean(y)]
+
+## -----------------------------------------------------------------------------
+C1 <- logisticCoefs(d1, coefs1, popPrev = 0.40, rr = 0.85)
+C1
+
+## -----------------------------------------------------------------------------
+d1a <- defData(d1, varname = "rx", formula = "1;1", dist = "trtAssign")
+d1a <- defData(d1a, varname = "y",
+ formula = "t(..C1) %*% c(1, rx, x1, x2, b1, b2)",
+ dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+
+## -----------------------------------------------------------------------------
+dd[rx==0, mean(y)]
+dd[rx==1, mean(y)]/dd[rx==0, mean(y)]
+
+## -----------------------------------------------------------------------------
+C1 <- logisticCoefs(d1, coefs1, popPrev = 0.40, rd = -0.15)
+C1
+
+## -----------------------------------------------------------------------------
+d1a <- defData(d1, varname = "rx", formula = "1;1", dist = "trtAssign")
+d1a <- defData(d1a, varname = "y",
+ formula = "t(..C1) %*% c(1, rx, x1, x2, b1, b2)",
+ dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+
+dd[rx==0, mean(y)]
+dd[rx==1, mean(y)] - dd[rx==0, mean(y)]
+
+## -----------------------------------------------------------------------------
+C1 <- logisticCoefs(d1, coefs1, popPrev = 0.40, auc = 0.85)
+C1
+
+## -----------------------------------------------------------------------------
+d1a <- defData(d1, varname = "y",
+ formula = "t(..C1) %*% c(1, x1, x2, b1, b2)",
+ dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+
+dd[, mean(y)]
+
+fit <- rms::lrm(y ~ x1 + x2 + b1 + b2, data = dd)
+fit$stats["C"]
+
+
diff --git a/inst/doc/logisticCoefs.Rmd b/inst/doc/logisticCoefs.Rmd
new file mode 100644
index 0000000..7e8c843
--- /dev/null
+++ b/inst/doc/logisticCoefs.Rmd
@@ -0,0 +1,248 @@
+---
+title: "Targeted logistic model coefficients"
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{Logistic Model Simulation}
+ %\VignetteEngine{knitr::rmarkdown}
+ \usepackage[utf8]{inputenc}
+---
+
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
+
+```{r, echo = FALSE, message = FALSE}
+library(simstudy)
+library(ggplot2)
+library(scales)
+library(grid)
+library(gridExtra)
+library(data.table)
+
+options(digits = 2)
+```
+
+In `simstudy`, there are at least two ways to define a binary data generating process. The first is to operate on the scale of the proportion or probability using the *identity* link. This allows users to define a data generating process that reflects assumptions about risk ratios and risk differences when comparing two groups defined by an exposure or treatment. However, this process can become challenging when introducing other covariates, because it can be difficult to constrain the probabilities so that they fall between 0 and 1.
+
+The second approach works on the log-odds scale using a *logit* link, and is much more amenable to accommodating covariates. Unfortunately, this comes at the price of being able to easily generate specific risk ratios and risk differences, because all parameters are log-odds ratios. The overall (marginal) prevalence of an outcome in a population will vary depending on the distribution of covariates in that population, and the strengths (both absolute and relative) of the association of those covariates with the outcome. That is, the coefficients of a logistic model (including the intercept) determine the prevalence. The same is true regarding the risk ratio and risk difference (if there is one particular exposure or treatment of interest) and the AUC.
+
+Here we start with the simplest case where we have a target marginal proportion or prevalence, and then illustrate data generation with three other target statistics: **risk ratios**, **risk differences**, and **AUCs**.
+
+### Prevalence
+
+In this first example, we start with one set of assumptions for four covariates $x_1, x_2 \sim N(0, 1)$, $b_1 \sim Bin(0.3)$, and $b_2 \sim Bin(0.7)$, and generate the outcome *y* with the following data generating process:
+
+$$ \text{logit}(y) = 0.15x_1 + 0.25x_2 + 0.10b_1 + 0.30b_2$$
+
+
+```{r}
+library(simstudy)
+library(ggplot2)
+library(data.table)
+
+coefs1 <- c(0.15, 0.25, 0.10, 0.30)
+
+d1 <- defData(varname = "x1", formula = 0, variance = 1)
+d1 <- defData(d1, varname = "x2", formula = 0, variance = 1)
+d1 <- defData(d1, varname = "b1", formula = 0.3, dist = "binary")
+d1 <- defData(d1, varname = "b2", formula = 0.7, dist = "binary")
+
+d1a <- defData(d1, varname = "y",
+ formula = "t(..coefs1) %*% c(x1, x2, b1, b2)",
+ dist = "binary", link = "logit")
+
+set.seed(48392)
+
+dd <- genData(500000, d1a)
+dd
+```
+
+The overall proportion of $y=1$ in this case is
+
+```{r}
+dd[, mean(y)]
+```
+
+If we have a desired marginal proportion of 0.40, then we can add an intercept of -0.66 to the data generating process:
+
+$$ \text{logit}(y) = -0.66 + 0.15x_1 + 0.25x_2 + 0.10b_1 + 0.30b_2$$
+
+The simulation now gives us the desired target:
+
+```{r}
+d1a <- defData(d1, varname = "y",
+ formula = "t(c(-0.66, ..coefs1)) %*% c(1, x1, x2, b1, b2)",
+ dist = "binary", link = "logit")
+
+genData(500000, d1a)[, mean(y)]
+```
+
+If we change the distribution of the covariates, so that $x_1 \sim N(1, 1)$, $x_2 \sim N(3, 1)$, $b_1 \sim Bin(0.5)$, and $b_2 \sim Bin(0.8)$, and the strength of the association of these covariates with the outcome so that
+
+$$ \text{logit}(y) = 0.20x_1 + 0.35x_2 + 0.20b_1 + 0.45b_2,$$
+
+the marginal proportion/prevalence (assuming no intercept term) also changes, going from 0.56 to 0.84:
+
+```{r}
+coefs2 <- c(0.20, 0.35, 0.20, 0.45)
+
+d2 <- defData(varname = "x1", formula = 1, variance = 1)
+d2 <- defData(d2, varname = "x2", formula = 3, variance = 1)
+d2 <- defData(d2, varname = "b1", formula = 0.5, dist = "binary")
+d2 <- defData(d2, varname = "b2", formula = 0.8, dist = "binary")
+
+d2a <- defData(d2, varname = "y",
+ formula = "t(..coefs2) %*% c(x1, x2, b1, b2)",
+ dist = "binary", link = "logit")
+
+genData(500000, d2a)[, mean(y)]
+```
+
+But under this new distribution, adding an intercept of -2.13 yields the desired target.
+
+$$ \text{logit}(y) = -2.13 + 0.20x_1 + 0.35x_2 + 0.20b_1 + 0.45b_2 $$
+
+
+
+```{r}
+d2a <- defData(d2, varname = "y",
+ formula = "t(c(-2.13, ..coefs2)) %*% c(1, x1, x2, b1, b2)",
+ dist = "binary", link = "logit")
+
+genData(500000, d2a)[, mean(y)]
+```
+
+#### Finding the intercept
+
+Where did those two intercepts come from? The [paper](https://bmcmedresmethodol.biomedcentral.com/articles/10.1186/s12874-023-01836-5){target="_blank"} by Peter Austin describes an iterative bisection procedure that takes a distribution of covariates and a set of coefficients to identify the intercept coefficient that yields the target marginal proportion or prevalence.
+
+The general idea of the algorithm is to try out a series of different intercepts in an intelligent way that ends up at the right spot. (If you want the details for the algorithm, take a look at the [paper](https://bmcmedresmethodol.biomedcentral.com/articles/10.1186/s12874-023-01836-5){target="_blank"}.) The starting search range is pre-defined (we've used -10 to 10 for the intercept), and we start with a value of 0 for the initial intercept and simulate a large data set (the paper uses 1 million observations, but 100,000 seems to work just fine) and record the population prevalence. If we've overshot the target prevalence, we turn our attention to the range between -10 and 0, taking the average, which is -5. Otherwise, we focus on the range between 0 and 10. We iterate this way, choosing the range we need to focus on and setting the intercept at the mid-point (hence the name *bisection*). The algorithm will converge pretty quickly on the value of the intercept that gives the target population prevalence for the underlying covariate distribution and coefficient assumptions.
+
+In the current implementation in `simstudy`, the intercept is provided by a simple call to `logisticCoefs`. Here are the calls for the two sets of definitions in definition tables *d1* and *d2*.
+
+```{r}
+logisticCoefs(defCovar = d1, coefs = coefs1, popPrev = 0.40)
+logisticCoefs(defCovar = d2, coefs = coefs2, popPrev = 0.40)
+```
+
+### Risk ratios
+
+Just as the prevalence depends on the distribution of covariates and their association with the outcome, risk ratios comparing the outcome probabilities for two groups also depend on the additional covariates. The marginal risk ratio comparing treatment ($A = 1$) to control ($A = 0$) (given the distribution of covariates) is
+
+$$RR = \frac{P(y=1 | A = 1)}{P(y=1 | A = 0)}$$
+In the data generation process we use a log-odds ratio of -0.40 (odds ratio of approximately 0.67) in both cases, but we get different risk ratios (0.82 vs. 0.93), depending on the covariates (defined in *d1* and *d2*).
+
+```{r}
+d1a <- defData(d1, varname = "rx", formula = "1;1", dist = "trtAssign")
+d1a <- defData(d1a, varname = "y",
+ formula = "t(c(-0.40, ..coefs1)) %*% c(rx, x1, x2, b1, b2)",
+ dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+dd[rx==1, mean(y)]/dd[rx==0, mean(y)]
+```
+
+```{r}
+d2a <- defData(d2, varname = "rx", formula = "1;1", dist = "trtAssign")
+d2a <- defData(d2a, varname = "y",
+ formula = "t(c(-0.40, ..coefs2)) %*% c(rx, x1, x2, b1, b2)",
+ dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d2a)
+dd[rx==1, mean(y)]/dd[rx==0, mean(y)]
+```
+
+By specifying both a population prevalence and a target risk ratio in the call to `logisticCoefs`, we can get the necessary parameters. When specifying the target risk ratio, it is required to be between 0 and 1/popPrev. A risk ratio cannot be negative, and the probability of the outcome under treatment cannot exceed 1 (which will happen if the risk ratio is greater than 1/popPrev).
+
+```{r}
+C1 <- logisticCoefs(d1, coefs1, popPrev = 0.40, rr = 0.85)
+C1
+```
+
+If we use $C_1$ in the data generation process, we will get a data set with the desired target prevalence and risk ratio:
+
+```{r}
+d1a <- defData(d1, varname = "rx", formula = "1;1", dist = "trtAssign")
+d1a <- defData(d1a, varname = "y",
+ formula = "t(..C1) %*% c(1, rx, x1, x2, b1, b2)",
+ dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+```
+
+Here are the prevalence and risk ratio:
+
+```{r}
+dd[rx==0, mean(y)]
+dd[rx==1, mean(y)]/dd[rx==0, mean(y)]
+```
+
+You can do the same for the second set of assumptions.
+
+### Risk differences
+
+Risk differences have the same set of issues, and are handled in the same way. The risk difference is defined as
+
+$$ RD = P(y=1 | A = 1) - P(y=1 | A = 0)$$
+
+To get the coefficients related to a population prevalence of 0.40 and risk difference of -0.15 (so that the proportion in the exposure arm is 0.25), we use the *rd* argument:
+
+```{r}
+C1 <- logisticCoefs(d1, coefs1, popPrev = 0.40, rd = -0.15)
+C1
+```
+
+Again, using $C_1$ in the data generation process, we will get a data set with the desired target prevalence and risk difference:
+
+```{r}
+d1a <- defData(d1, varname = "rx", formula = "1;1", dist = "trtAssign")
+d1a <- defData(d1a, varname = "y",
+ formula = "t(..C1) %*% c(1, rx, x1, x2, b1, b2)",
+ dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+
+dd[rx==0, mean(y)]
+dd[rx==1, mean(y)] - dd[rx==0, mean(y)]
+```
+
+### AUC
+
+The AUC is another commonly used statistic to evaluate a logistic model. We can use `logisticCoefs` to find the parameters that will allow us to generate data from a model with a specific AUC. To get the coefficients related to a population prevalence of 0.40 and an AUC of 0.85, we use the *auc* argument (which must be between 0.5 and 1):
+
+```{r}
+C1 <- logisticCoefs(d1, coefs1, popPrev = 0.40, auc = 0.85)
+C1
+```
+
+Again, using $C_1$ in the data generation process, we will get a data set with the desired target prevalence and the AUC (calculated here using the `lrm` function in the `rms` package):
+
+```{r}
+d1a <- defData(d1, varname = "y",
+ formula = "t(..C1) %*% c(1, x1, x2, b1, b2)",
+ dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+
+dd[, mean(y)]
+
+fit <- rms::lrm(y ~ x1 + x2 + b1 + b2, data = dd)
+fit$stats["C"]
+
+```
+
+
+
+References:
+
+Austin, Peter C. "The iterative bisection procedure: a useful
+tool for determining parameter values in data-generating processes in
+Monte Carlo simulations." BMC Medical Research Methodology 23,
+no. 1 (2023): 1-10.
+
+
\ No newline at end of file
diff --git a/inst/doc/logisticCoefs.html b/inst/doc/logisticCoefs.html
new file mode 100644
index 0000000..d964bd6
--- /dev/null
+++ b/inst/doc/logisticCoefs.html
@@ -0,0 +1,625 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Targeted logistic model coefficients
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Targeted logistic model coefficients
+
+
+
+In simstudy
, there are at least two ways to define a
+binary data generating process. The first is to operate on the scale of
+the proportion or probability using the identity link. This
+allows users to define a data generating process that reflects
+assumptions about risk ratios and risk differences when comparing two
+groups defined by an exposure or treatment. However, this process can
+become challenging when introducing other covariates, because it can be
+difficult to constrain the probabilities so that they fall between 0 and
+1.
+The second approach works on the log-odds scale using a
+logit link, and is much more amenable to accommodating
+covariates. Unfortunately, this comes at the price of being able to
+easily generate specific risk ratios and risk differences, because all
+parameters are log-odds ratios. The overall (marginal) prevalence of an
+outcome in a population will vary depending on the distribution of
+covariates in that population, and the strengths (both absolute and
+relative) of the association of those covariates with the outcome. That
+is, the coefficients of a logistic model (including the intercept)
+determine the prevalence. The same is true regarding the risk ratio and
+risk difference (if there is one particular exposure or treatment of
+interest) and the AUC.
+Here we start with the simplest case where we have a target marginal
+proportion or prevalence, and then illustrate data generation with three
+other target statistics: risk ratios, risk
+differences, and AUCs.
+
+Prevalence
+In this first example, we start with one set of assumptions for four
+covariates \(x_1, x_2 \sim N(0, 1)\),
+\(b_1 \sim Bin(0.3)\), and \(b_2 \sim Bin(0.7)\), and generate the
+outcome y with the following data generating process:
+\[ \text{logit}(y) = 0.15x_1 + 0.25x_2 +
+0.10b_1 + 0.30b_2\]
+library(simstudy)
+library(ggplot2)
+library(data.table)
+
+coefs1 <- c(0.15, 0.25, 0.10, 0.30)
+
+d1 <- defData(varname = "x1", formula = 0, variance = 1)
+d1 <- defData(d1, varname = "x2", formula = 0, variance = 1)
+d1 <- defData(d1, varname = "b1", formula = 0.3, dist = "binary")
+d1 <- defData(d1, varname = "b2", formula = 0.7, dist = "binary")
+
+d1a <- defData(d1, varname = "y",
+ formula = "t(..coefs1) %*% c(x1, x2, b1, b2)",
+ dist = "binary", link = "logit")
+
+set.seed(48392)
+
+dd <- genData(500000, d1a)
+dd
+## id x1 x2 b1 b2 y
+## 1: 1 0.29 0.390 0 1 1
+## 2: 2 0.76 -0.925 0 0 0
+## 3: 3 -1.47 0.939 0 0 1
+## 4: 4 1.92 0.560 0 1 1
+## 5: 5 1.40 -0.238 0 1 0
+## ---
+## 499996: 499996 -0.32 0.367 0 0 0
+## 499997: 499997 -1.08 2.152 0 0 0
+## 499998: 499998 -1.10 0.380 1 0 0
+## 499999: 499999 0.56 -1.042 0 1 0
+## 500000: 500000 0.52 0.076 0 1 1
+The overall proportion of \(y=1\) in
+this case is
+
+## [1] 0.56
+If we have a desired marginal proportion of 0.40, then we can add an
+intercept of -0.66 to the data generating process:
+\[ \text{logit}(y) = -0.66 + 0.15x_1 +
+0.25x_2 + 0.10b_1 + 0.30b_2\]
+The simulation now gives us the desired target:
+d1a <- defData(d1, varname = "y",
+ formula = "t(c(-0.66, ..coefs1)) %*% c(1, x1, x2, b1, b2)",
+ dist = "binary", link = "logit")
+
+genData(500000, d1a)[, mean(y)]
+## [1] 0.4
+If we change the distribution of the covariates, so that \(x_1 \sim N(1, 1)\), \(x_2 \sim N(3, 1)\), \(b_1 \sim Bin(0.5)\), and \(b_2 \sim Bin(0.8)\), and the strength of
+the association of these covariates with the outcome so that
+\[ \text{logit}(y) = 0.20x_1 + 0.35x_2 +
+0.20b_1 + 0.45b_2,\]
+the marginal proportion/prevalence (assuming no intercept term) also
+changes, going from 0.56 to 0.84:
+coefs2 <- c(0.20, 0.35, 0.20, 0.45)
+
+d2 <- defData(varname = "x1", formula = 1, variance = 1)
+d2 <- defData(d2, varname = "x2", formula = 3, variance = 1)
+d2 <- defData(d2, varname = "b1", formula = 0.5, dist = "binary")
+d2 <- defData(d2, varname = "b2", formula = 0.8, dist = "binary")
+
+d2a <- defData(d2, varname = "y",
+ formula = "t(..coefs2) %*% c(x1, x2, b1, b2)",
+ dist = "binary", link = "logit")
+
+genData(500000, d2a)[, mean(y)]
+## [1] 0.84
+But under this new distribution, adding an intercept of -2.13 yields
+the desired target.
+\[ \text{logit}(y) = -2.13 + 0.20x_1 +
+0.35x_2 + 0.20b_1 + 0.45b_2 \]
+
+d2a <- defData(d2, varname = "y",
+ formula = "t(c(-2.13, ..coefs2)) %*% c(1, x1, x2, b1, b2)",
+ dist = "binary", link = "logit")
+
+genData(500000, d2a)[, mean(y)]
+## [1] 0.4
+
+Finding the intercept
+Where did those two intercepts come from? The paper by Peter Austin describes an iterative
+bisection procedure that takes a distribution of covariates and a set of
+coefficients to identify the intercept coefficient that yields the
+target marginal proportion or prevalence.
+The general idea of the algorithm is to try out a series of different
+intercepts in an intelligent way that ends up at the right spot. (If you
+want the details for the algorithm, take a look at the paper.) The starting search range is pre-defined
+(we’ve used -10 to 10 for the intercept), and we start with a value of
+0 for the initial intercept and simulate a large data set (the paper
+uses 1 million observations, but 100,000 seems to work just fine) and
+record the population prevalence. If we’ve overshot the target
+prevalence, we turn our attention to the range between -10 and 0, taking
+the average, which is -5. Otherwise, we focus on the range between 0 and
+10. We iterate this way, choosing the range we need to focus on and
+setting the intercept at the mid-point (hence the name
+bisection). The algorithm will converge pretty quickly on the
+value of the intercept that gives the target population prevalence for
+the underlying covariate distribution and coefficient assumptions.
+In the current implementation in simstudy
, the intercept
+is provided by a simple call to logisticCoefs
. Here are the
+calls for the two sets of definitions in definition tables d1
+and d2.
+
+## B0 x1 x2 b1 b2
+## -0.66 0.15 0.25 0.10 0.30
+
+## B0 x1 x2 b1 b2
+## -2.13 0.20 0.35 0.20 0.45
+
+
+
+Risk ratios
+Just as the prevalence depends on the distribution of covariates and
+their association with the outcome, risk ratios comparing the outcome
+probabilities for two groups also depend on the additional covariates.
+The marginal risk ratio comparing treatment (\(A=1\)) to control (\(A=0\)) (given the distribution of
+covariates) is
+\[RR = \frac{P(y=1 | A = 1)}{P(y=1 | A =
+0)}\] In the data generation process we use a log-odds ratio of
+-0.40 (odds ratio of approximately 0.67) in both cases, but we get
+different risk ratios (0.82 vs. 0.93), depending on the covariates
+(defined in d1 and d2).
+d1a <- defData(d1, varname = "rx", formula = "1;1", dist = "trtAssign")
+d1a <- defData(d1a, varname = "y",
+ formula = "t(c(-0.40, ..coefs1)) %*% c(rx, x1, x2, b1, b2)",
+ dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+dd[rx==1, mean(y)]/dd[rx==0, mean(y)]
+## [1] 0.82
+d2a <- defData(d2, varname = "rx", formula = "1;1", dist = "trtAssign")
+d2a <- defData(d2a, varname = "y",
+ formula = "t(c(-0.40, ..coefs2)) %*% c(rx, x1, x2, b1, b2)",
+ dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d2a)
+dd[rx==1, mean(y)]/dd[rx==0, mean(y)]
+## [1] 0.93
+By specifying both a population prevalence and a target risk ratio in
+the call to logisticCoefs
, we can get the necessary
+parameters. When specifying the target risk ratio, it is required to be
+between 0 and 1/popPrev. A risk ratio cannot be negative, and the
+probability of the outcome under treatment cannot exceed 1 (which will
+happen if the risk ratio is greater than 1/popPrev).
+
+## B0 A x1 x2 b1 b2
+## -0.66 -0.26 0.15 0.25 0.10 0.30
+If we use \(C_1\) in the data
+generation process, we will get a data set with the desired target
+prevalence and risk ratio:
+d1a <- defData(d1, varname = "rx", formula = "1;1", dist = "trtAssign")
+d1a <- defData(d1a, varname = "y",
+ formula = "t(..C1) %*% c(1, rx, x1, x2, b1, b2)",
+ dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+Here are the prevalence and risk ratio:
+
+## [1] 0.4
+
+## [1] 0.86
+You can do the same for the second set of assumptions.
+
+
+Risk differences
+Risk differences have the same set of issues, and are handled in the
+same way. The risk difference is defined as
+\[ RD = P(y=1 | A = 1) - P(y=1 | A =
+0)\]
+To get the coefficients related to a population prevalence of 0.40
+and risk difference of -0.15 (so that the proportion in the exposure arm
+is 0.25), we use the rd argument:
+
+## B0 A x1 x2 b1 b2
+## -0.66 -0.71 0.15 0.25 0.10 0.30
+Again, using \(C_1\) in the data
+generation process, we will get a data set with the desired target
+prevalence and risk difference:
+d1a <- defData(d1, varname = "rx", formula = "1;1", dist = "trtAssign")
+d1a <- defData(d1a, varname = "y",
+ formula = "t(..C1) %*% c(1, rx, x1, x2, b1, b2)",
+ dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+
+dd[rx==0, mean(y)]
+## [1] 0.4
+
+## [1] -0.15
+
+
+AUC
+The AUC is another commonly used statistic to evaluate a logistic
+model. We can use logisticCoefs
to find the parameters that
+will allow us to generate data from a model with a specific AUC. To get
+the coefficients related to a population prevalence of 0.40 and an AUC
+of 0.85, we use the auc argument (which must be between 0.5 and
+1):
+
+## B0 x1 x2 b1 b2
+## -1.99 0.85 1.41 0.56 1.69
+Again, using \(C_1\) in the data
+generation process, we will get a data set with the desired target
+prevalence and the AUC (calculated here using the lrm
+function in the rms
 package):
+d1a <- defData(d1, varname = "y",
+ formula = "t(..C1) %*% c(1, x1, x2, b1, b2)",
+ dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+
+dd[, mean(y)]
+## [1] 0.4
+
+## C
+## 0.85
+
+
References:
+Austin, Peter C. “The iterative bisection procedure: a useful tool
+for determining parameter values in data-generating processes in Monte
+Carlo simulations.” BMC Medical Research Methodology 23, no. 1 (2023):
+1-10.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/inst/doc/longitudinal.R b/inst/doc/longitudinal.R
index 73862a0..c862785 100644
--- a/inst/doc/longitudinal.R
+++ b/inst/doc/longitudinal.R
@@ -1,4 +1,7 @@
-## ---- echo = FALSE, message = FALSE-------------------------------------------
+## ----chunkname, echo=-1-------------------------------------------------------
+data.table::setDTthreads(2)
+
+## ----echo = FALSE, message = FALSE--------------------------------------------
library(simstudy)
library(ggplot2)
library(scales)
@@ -28,7 +31,7 @@ ggtheme <- function(panelback = "white") {
}
-## ---- tidy = TRUE-------------------------------------------------------------
+## ----tidy = TRUE--------------------------------------------------------------
tdef <- defData(varname = "T", dist="binary", formula = 0.5)
tdef <- defData(tdef, varname = "Y0", dist = "normal", formula = 10, variance = 1)
tdef <- defData(tdef, varname = "Y1", dist = "normal", formula = "Y0 + 5 + 5 * T", variance = 1)
@@ -39,11 +42,11 @@ set.seed (483726)
dtTrial <- genData( 500, tdef)
dtTrial
-## ---- tidy = TRUE-------------------------------------------------------------
+## ----tidy = TRUE--------------------------------------------------------------
dtTime <- addPeriods(dtTrial, nPeriods = 3, idvars = "id", timevars = c("Y0", "Y1", "Y2"), timevarName = "Y")
dtTime
-## ---- tidy = TRUE, echo = FALSE, fig.width = 6, fig.height = 3----------------
+## ----tidy = TRUE, echo = FALSE, fig.width = 6, fig.height = 3-----------------
avg <- dtTime[,.(Y=mean(Y)), keyby = .(T, period)]
@@ -57,7 +60,7 @@ ggplot(data = dtTime, aes(x = factor(period), y = Y)) +
ggtheme("grey90") +
theme(legend.key=element_rect(fill=NA))
-## ---- tidy = TRUE-------------------------------------------------------------
+## ----tidy = TRUE--------------------------------------------------------------
def <- defData(varname = "xbase", dist = "normal", formula = 20, variance = 3)
def <- defData(def,varname = "nCount", dist = "noZeroPoisson", formula = 6)
def <- defData(def, varname = "mInterval", dist = "gamma", formula = 30, variance = .01)
@@ -66,15 +69,15 @@ def <- defData(def, varname = "vInterval", dist = "nonrandom", formula = .07)
dt <- genData(200, def)
dt[id %in% c(8,121)] # View individuals 8 and 121
-## ---- tidy = TRUE-------------------------------------------------------------
+## ----tidy = TRUE--------------------------------------------------------------
dtPeriod <- addPeriods(dt)
dtPeriod[id %in% c(8,121)] # View individuals 8 and 121 only
-## ---- tidy = TRUE-------------------------------------------------------------
+## ----tidy = TRUE--------------------------------------------------------------
def2 <- defDataAdd(varname = "Y", dist = "normal", formula = "15 + .1 * time", variance = 5)
dtPeriod <- addColumns(def2, dtPeriod)
-## ---- tidy = TRUE, echo = FALSE, fig.width = 6, fig.height = 3----------------
+## ----tidy = TRUE, echo = FALSE, fig.width = 6, fig.height = 3-----------------
sampledID <- sample(1:nrow(dt), 5)
dtSample <- dtPeriod[id %in% sampledID]
diff --git a/inst/doc/longitudinal.Rmd b/inst/doc/longitudinal.Rmd
index 084c2f5..7032300 100644
--- a/inst/doc/longitudinal.Rmd
+++ b/inst/doc/longitudinal.Rmd
@@ -7,6 +7,10 @@ vignette: >
\usepackage[utf8]{inputenc}
---
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
+
```{r, echo = FALSE, message = FALSE}
library(simstudy)
library(ggplot2)
diff --git a/inst/doc/longitudinal.html b/inst/doc/longitudinal.html
index f3ff8a1..752dc90 100644
--- a/inst/doc/longitudinal.html
+++ b/inst/doc/longitudinal.html
@@ -346,29 +346,29 @@ Longitudinal Data
the columns. In the next example, we measure outcome Y
once
before and twice after intervention T
in a randomized
trial:
-<- defData(varname = "T", dist = "binary", formula = 0.5)
- tdef <- defData(tdef, varname = "Y0", dist = "normal", formula = 10, variance = 1)
- tdef <- defData(tdef, varname = "Y1", dist = "normal", formula = "Y0 + 5 + 5 * T",
- tdef variance = 1)
- <- defData(tdef, varname = "Y2", dist = "normal", formula = "Y0 + 10 + 5 * T",
- tdef variance = 1)
-
-set.seed(483726)
-
-<- genData(500, tdef)
- dtTrial dtTrial
-## id T Y0 Y1 Y2
-## 1: 1 0 10.42 14.2 20.9
-## 2: 2 1 9.36 21.9 25.4
-## 3: 3 1 8.25 19.1 23.9
-## 4: 4 0 10.86 16.5 20.4
-## 5: 5 1 11.29 21.7 25.9
-## ---
-## 496: 496 1 11.87 22.6 26.6
-## 497: 497 1 10.62 19.5 24.9
-## 498: 498 1 10.13 22.2 26.1
-## 499: 499 0 11.88 17.9 23.0
-## 500: 500 0 11.92 18.4 22.5
+tdef <- defData(varname = "T", dist = "binary", formula = 0.5)
+tdef <- defData(tdef, varname = "Y0", dist = "normal", formula = 10, variance = 1)
+tdef <- defData(tdef, varname = "Y1", dist = "normal", formula = "Y0 + 5 + 5 * T",
+ variance = 1)
+tdef <- defData(tdef, varname = "Y2", dist = "normal", formula = "Y0 + 10 + 5 * T",
+ variance = 1)
+
+set.seed(483726)
+
+dtTrial <- genData(500, tdef)
+dtTrial
+## id T Y0 Y1 Y2
+## 1: 1 0 10.4 14 21
+## 2: 2 1 9.4 22 25
+## 3: 3 1 8.3 19 24
+## 4: 4 0 10.9 16 20
+## 5: 5 1 11.3 22 26
+## ---
+## 496: 496 1 11.9 23 27
+## 497: 497 1 10.6 20 25
+## 498: 498 1 10.1 22 26
+## 499: 499 0 11.9 18 23
+## 500: 500 0 11.9 18 23
Longitudinal data are created with a call to
addPeriods
. If the cross-sectional data
includes time-dependent data, then the number of periods
@@ -379,23 +379,28 @@
Longitudinal Data
time-dependent variable. (Note: if there are two time-dependent
variables, it is best to create two data sets and merge them. This will
be shown later in the vignette).
-<- addPeriods(dtTrial, nPeriods = 3, idvars = "id", timevars = c("Y0", "Y1",
- dtTime "Y2"), timevarName = "Y")
- dtTime
-## id period T Y timeID
-## 1: 1 0 0 10.42 1
-## 2: 1 1 0 14.15 2
-## 3: 1 2 0 20.93 3
-## 4: 2 0 1 9.36 4
-## 5: 2 1 1 21.90 5
-## ---
-## 1496: 499 1 0 17.94 1496
-## 1497: 499 2 0 23.05 1497
-## 1498: 500 0 0 11.92 1498
-## 1499: 500 1 0 18.39 1499
-## 1500: 500 2 0 22.50 1500
+dtTime <- addPeriods(dtTrial, nPeriods = 3, idvars = "id", timevars = c("Y0", "Y1",
+ "Y2"), timevarName = "Y")
+dtTime
+## id period T Y timeID
+## 1: 1 0 0 10.4 1
+## 2: 1 1 0 14.2 2
+## 3: 1 2 0 20.9 3
+## 4: 2 0 1 9.4 4
+## 5: 2 1 1 21.9 5
+## ---
+## 1496: 499 1 0 17.9 1496
+## 1497: 499 2 0 23.0 1497
+## 1498: 500 0 0 11.9 1498
+## 1499: 500 1 0 18.4 1499
+## 1500: 500 2 0 22.5 1500
This is what the longitudinal data look like:
-
+## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
+## ℹ Please use `linewidth` instead.
+## This warning is displayed once every 8 hours.
+## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
+## generated.
+
Longitudinal data with varying observation and interval times
It is also possible to generate longitudinal data with varying
@@ -415,36 +420,36 @@
Longitudinal data with varying observation and interval times
individuals with a different number of measurement observations and
different times between each observation. Data for two of these
individuals is printed:
-<- defData(varname = "xbase", dist = "normal", formula = 20, variance = 3)
- def <- defData(def, varname = "nCount", dist = "noZeroPoisson", formula = 6)
- def <- defData(def, varname = "mInterval", dist = "gamma", formula = 30, variance = 0.01)
- def <- defData(def, varname = "vInterval", dist = "nonrandom", formula = 0.07)
- def
-<- genData(200, def)
- dt %in% c(8, 121)] # View individuals 8 and 121 dt[id
+def <- defData(varname = "xbase", dist = "normal", formula = 20, variance = 3)
+def <- defData(def, varname = "nCount", dist = "noZeroPoisson", formula = 6)
+def <- defData(def, varname = "mInterval", dist = "gamma", formula = 30, variance = 0.01)
+def <- defData(def, varname = "vInterval", dist = "nonrandom", formula = 0.07)
+
+dt <- genData(200, def)
+dt[id %in% c(8, 121)] # View individuals 8 and 121
## id xbase nCount mInterval vInterval
-## 1: 8 18.0 4 28.1 0.07
-## 2: 121 22.7 6 33.2 0.07
+## 1: 8 18 4 28 0.07
+## 2: 121 23 6 33 0.07
The resulting longitudinal data for these two subjects can be
inspected after a call to addPeriods
. Notice that no
parameters need to be set since all information resides in the data set
itself:
-<- addPeriods(dt)
- dtPeriod %in% c(8, 121)] # View individuals 8 and 121 only dtPeriod[id
+
## id period xbase time timeID
-## 1: 8 0 18.0 0 41
-## 2: 8 1 18.0 29 42
-## 3: 8 2 18.0 51 43
-## 4: 8 3 18.0 104 44
-## 5: 121 0 22.7 0 691
-## 6: 121 1 22.7 46 692
-## 7: 121 2 22.7 81 693
-## 8: 121 3 22.7 117 694
-## 9: 121 4 22.7 154 695
-## 10: 121 5 22.7 180 696
+## 1: 8 0 18 0 41
+## 2: 8 1 18 29 42
+## 3: 8 2 18 51 43
+## 4: 8 3 18 104 44
+## 5: 121 0 23 0 691
+## 6: 121 1 23 46 692
+## 7: 121 2 23 81 693
+## 8: 121 3 23 117 694
+## 9: 121 4 23 154 695
+## 10: 121 5 23 180 696
If a time-sensitive measurement is added to the data set …
-<- defDataAdd(varname = "Y", dist = "normal", formula = "15 + .1 * time", variance = 5)
- def2 <- addColumns(def2, dtPeriod) dtPeriod
+def2 <- defDataAdd(varname = "Y", dist = "normal", formula = "15 + .1 * time", variance = 5)
+dtPeriod <- addColumns(def2, dtPeriod)
… a plot of five randomly selected individuals looks like this:
diff --git a/inst/doc/missing.R b/inst/doc/missing.R
index 21feeb0..dd2f4f2 100644
--- a/inst/doc/missing.R
+++ b/inst/doc/missing.R
@@ -1,4 +1,7 @@
-## ---- echo = FALSE, message = FALSE-------------------------------------------
+## ----chunkname, echo=-1-------------------------------------------------------
+data.table::setDTthreads(2)
+
+## ----echo = FALSE, message = FALSE--------------------------------------------
library(simstudy)
library(ggplot2)
library(scales)
@@ -55,7 +58,7 @@ ggtheme <- function(panelback = "white") {
}
-## ---- tidy = TRUE-------------------------------------------------------------
+## ----tidy = TRUE--------------------------------------------------------------
def1 <- defData(varname = "m", dist = "binary", formula = .5)
def1 <- defData(def1, "u", dist = "binary", formula = .5)
def1 <- defData(def1, "x1", dist="normal", formula = "20*m + 20*u", variance = 2)
@@ -64,7 +67,7 @@ def1 <- defData(def1, "x3", dist="normal", formula = "20*m + 20*u", variance = 2
dtAct <- genData(1000, def1)
-## ---- tidy = TRUE-------------------------------------------------------------
+## ----tidy = TRUE--------------------------------------------------------------
defM <- defMiss(varname = "x1", formula = .15, logit.link = FALSE)
defM <- defMiss(defM, varname = "x2", formula = ".05 + m * 0.25", logit.link = FALSE)
defM <- defMiss(defM, varname = "x3", formula = ".05 + u * 0.25", logit.link = FALSE)
@@ -75,7 +78,7 @@ set.seed(283726)
missMat <- genMiss(dtAct, defM, idvars = "id")
dtObs <- genObs(dtAct, missMat, idvars = "id")
-## ---- tidy = TRUE-------------------------------------------------------------
+## ----tidy = TRUE--------------------------------------------------------------
missMat
dtObs
@@ -103,7 +106,7 @@ showDif(meanAct, meanObs)
meanActm <- dtAct[,.(x1 = rmean(x1), x2 = rmean(x2), x3 = rmean(x3)), keyby = m]
meanObsm <- dtObs[,.(x1 = rmean(x1), x2 = rmean(x2), x3 = rmean(x3)), keyby = m]
-## ---- tidy = TRUE-------------------------------------------------------------
+## ----tidy = TRUE--------------------------------------------------------------
# compare observed and actual when m = 0
showDif(meanActm[m==0, .(x1, x2, x3)], meanObsm[m==0, .(x1, x2, x3)])
@@ -112,7 +115,7 @@ showDif(meanActm[m==0, .(x1, x2, x3)], meanObsm[m==0, .(x1, x2, x3)])
showDif(meanActm[m==1, .(x1, x2, x3)], meanObsm[m==1, .(x1, x2, x3)])
-## ---- tidy = TRUE-------------------------------------------------------------
+## ----tidy = TRUE--------------------------------------------------------------
# use baseline definitions from the previous example
@@ -126,7 +129,7 @@ defLong <- defDataAdd(varname = "y", dist = "normal", formula = "10 + period*2 +
dtTime <- addPeriods(dtAct, nPeriods = 4)
dtTime <- addColumns(defLong, dtTime)
-## ---- tidy = TRUE-------------------------------------------------------------
+## ----tidy = TRUE--------------------------------------------------------------
# missingness for y is not monotonic
diff --git a/inst/doc/missing.Rmd b/inst/doc/missing.Rmd
index 4aa2f74..ce4f5d8 100644
--- a/inst/doc/missing.Rmd
+++ b/inst/doc/missing.Rmd
@@ -7,6 +7,10 @@ vignette: >
\usepackage[utf8]{inputenc}
---
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
+
```{r, echo = FALSE, message = FALSE}
library(simstudy)
library(ggplot2)
diff --git a/inst/doc/missing.html b/inst/doc/missing.html
index 28c4ea8..819dde2 100644
--- a/inst/doc/missing.html
+++ b/inst/doc/missing.html
@@ -363,13 +363,13 @@ Missing Data
includes two independent predictors, m
and u
that largely determine the value of each outcome (subject to random
noise).
-<- defData(varname = "m", dist = "binary", formula = 0.5)
- def1 <- defData(def1, "u", dist = "binary", formula = 0.5)
- def1 <- defData(def1, "x1", dist = "normal", formula = "20*m + 20*u", variance = 2)
- def1 <- defData(def1, "x2", dist = "normal", formula = "20*m + 20*u", variance = 2)
- def1 <- defData(def1, "x3", dist = "normal", formula = "20*m + 20*u", variance = 2)
- def1
-<- genData(1000, def1) dtAct
+def1 <- defData(varname = "m", dist = "binary", formula = 0.5)
+def1 <- defData(def1, "u", dist = "binary", formula = 0.5)
+def1 <- defData(def1, "x1", dist = "normal", formula = "20*m + 20*u", variance = 2)
+def1 <- defData(def1, "x2", dist = "normal", formula = "20*m + 20*u", variance = 2)
+def1 <- defData(def1, "x3", dist = "normal", formula = "20*m + 20*u", variance = 2)
+
+dtAct <- genData(1000, def1)
In this example, the missing data mechanism is different for each
outcome. As defined below, missingness for x1
is MCAR,
since the probability of missing is fixed. Missingness for
@@ -378,16 +378,16 @@
Missing Data
for x3
is NMAR, since the probability of missing is
dependent on u
, an unmeasured predictor of
x3
:
-<- defMiss(varname = "x1", formula = 0.15, logit.link = FALSE)
- defM <- defMiss(defM, varname = "x2", formula = ".05 + m * 0.25", logit.link = FALSE)
- defM <- defMiss(defM, varname = "x3", formula = ".05 + u * 0.25", logit.link = FALSE)
- defM <- defMiss(defM, varname = "u", formula = 1, logit.link = FALSE) # not observed
- defM
-set.seed(283726)
-
-<- genMiss(dtAct, defM, idvars = "id")
- missMat <- genObs(dtAct, missMat, idvars = "id") dtObs
- missMat
+defM <- defMiss(varname = "x1", formula = 0.15, logit.link = FALSE)
+defM <- defMiss(defM, varname = "x2", formula = ".05 + m * 0.25", logit.link = FALSE)
+defM <- defMiss(defM, varname = "x3", formula = ".05 + u * 0.25", logit.link = FALSE)
+defM <- defMiss(defM, varname = "u", formula = 1, logit.link = FALSE) # not observed
+
+set.seed(283726)
+
+missMat <- genMiss(dtAct, defM, idvars = "id")
+dtObs <- genObs(dtAct, missMat, idvars = "id")
+
## id x1 x2 x3 u m
## 1: 1 0 0 0 1 0
## 2: 2 0 0 0 1 0
@@ -400,19 +400,19 @@ Missing Data
## 998: 998 0 0 1 1 0
## 999: 999 0 0 0 1 0
## 1000: 1000 0 0 0 1 0
- dtObs
-## id m u x1 x2 x3
-## 1: 1 0 NA 2.37 -0.0938 1.79
-## 2: 2 0 NA 21.00 19.0643 21.76
-## 3: 3 0 NA NA -2.1260 -2.04
-## 4: 4 0 NA NA 0.6345 2.91
-## 5: 5 0 NA NA NA 17.11
-## ---
-## 996: 996 0 NA 17.71 19.6705 18.74
-## 997: 997 1 NA NA 40.8860 NA
-## 998: 998 0 NA 19.83 20.6475 NA
-## 999: 999 1 NA 20.29 23.6448 18.81
-## 1000: 1000 0 NA 20.99 20.3019 19.97
+
+## id m u x1 x2 x3
+## 1: 1 0 NA 2.4 -0.094 1.8
+## 2: 2 0 NA 21.0 19.064 21.8
+## 3: 3 0 NA NA -2.126 -2.0
+## 4: 4 0 NA NA 0.635 2.9
+## 5: 5 0 NA NA NA 17.1
+## ---
+## 996: 996 0 NA 17.7 19.671 18.7
+## 997: 997 1 NA NA 40.886 NA
+## 998: 998 0 NA 19.8 20.647 NA
+## 999: 999 1 NA 20.3 23.645 18.8
+## 1000: 1000 0 NA 21.0 20.302 20.0
The impacts of the various data mechanisms on estimation can be seen
with a simple calculation of means using both the “true” data set
without missing data as a comparison for the “observed” data set. Since
@@ -420,24 +420,24 @@
Missing Data
equivalent. However, we can see below that estimates for x2
and x3
are biased, as the difference between observed and
actual is not close to 0:
-# Two functions to calculate means and compare them
-
-<- function(var, digits = 1) {
- rmean round(mean(var, na.rm = TRUE), digits)
-
- }
-<- function(dt1, dt2, rowName = c("Actual", "Observed", "Difference")) {
- showDif <- data.frame(rbind(dt1, dt2, dt1 - dt2))
- dt rownames(dt) <- rowName
- return(dt)
-
- }
-# data.table functionality to estimate means for each data set
-
-<- dtAct[, .(x1 = rmean(x1), x2 = rmean(x2), x3 = rmean(x3))]
- meanAct <- dtObs[, .(x1 = rmean(x1), x2 = rmean(x2), x3 = rmean(x3))]
- meanObs
-showDif(meanAct, meanObs)
+# Two functions to calculate means and compare them
+
+rmean <- function(var, digits = 1) {
+ round(mean(var, na.rm = TRUE), digits)
+}
+
+showDif <- function(dt1, dt2, rowName = c("Actual", "Observed", "Difference")) {
+ dt <- data.frame(rbind(dt1, dt2, dt1 - dt2))
+ rownames(dt) <- rowName
+ return(dt)
+}
+
+# data.table functionality to estimate means for each data set
+
+meanAct <- dtAct[, .(x1 = rmean(x1), x2 = rmean(x2), x3 = rmean(x3))]
+meanObs <- dtObs[, .(x1 = rmean(x1), x2 = rmean(x2), x3 = rmean(x3))]
+
+showDif(meanAct, meanObs)
## x1 x2 x3
## Actual 19.8 19.8 19.8
## Observed 19.9 18.4 18.0
@@ -445,18 +445,18 @@ Missing Data
After adjusting for the measured covariate m
, the bias
for the estimate of the mean of x2
is mitigated, but not
for x3
, since u
is not observed:
-<- dtAct[, .(x1 = rmean(x1), x2 = rmean(x2), x3 = rmean(x3)), keyby = m]
- meanActm <- dtObs[, .(x1 = rmean(x1), x2 = rmean(x2), x3 = rmean(x3)), keyby = m] meanObsm
-# compare observed and actual when m = 0
-
-showDif(meanActm[m == 0, .(x1, x2, x3)], meanObsm[m == 0, .(x1, x2, x3)])
+meanActm <- dtAct[, .(x1 = rmean(x1), x2 = rmean(x2), x3 = rmean(x3)), keyby = m]
+meanObsm <- dtObs[, .(x1 = rmean(x1), x2 = rmean(x2), x3 = rmean(x3)), keyby = m]
+# compare observed and actual when m = 0
+
+showDif(meanActm[m == 0, .(x1, x2, x3)], meanObsm[m == 0, .(x1, x2, x3)])
## x1 x2 x3
## Actual 9.7 9.8 9.7
## Observed 9.8 9.9 8.4
## Difference -0.1 -0.1 1.3
-# compare observed and actual when m = 1
-
-showDif(meanActm[m == 1, .(x1, x2, x3)], meanObsm[m == 1, .(x1, x2, x3)])
+# compare observed and actual when m = 1
+
+showDif(meanActm[m == 1, .(x1, x2, x3)], meanObsm[m == 1, .(x1, x2, x3)])
## x1 x2 x3
## Actual 30.4 30.4 30.4
## Observed 30.7 31.0 28.6
@@ -477,28 +477,28 @@ Longitudinal data with missingness
The following two examples describe an outcome variable
y
that is measured over time, whose value is a function of
time and an observed exposure:
-# use baseline definitions from the previous example
-
-<- genData(120, def1)
- dtAct <- trtObserve(dtAct, formulas = 0.5, logit.link = FALSE, grpName = "rx")
- dtAct
-# add longitudinal data
-
-<- defDataAdd(varname = "y", dist = "normal", formula = "10 + period*2 + 2 * rx",
- defLong variance = 2)
-
-<- addPeriods(dtAct, nPeriods = 4)
- dtTime <- addColumns(defLong, dtTime) dtTime
+# use baseline definitions from the previous example
+
+dtAct <- genData(120, def1)
+dtAct <- trtObserve(dtAct, formulas = 0.5, logit.link = FALSE, grpName = "rx")
+
+# add longitudinal data
+
+defLong <- defDataAdd(varname = "y", dist = "normal", formula = "10 + period*2 + 2 * rx",
+ variance = 2)
+
+dtTime <- addPeriods(dtAct, nPeriods = 4)
+dtTime <- addColumns(defLong, dtTime)
In the first case, missingness is not monotonic; a subject might miss
a measurement but returns for subsequent measurements:
-# missingness for y is not monotonic
-
-<- defMiss(varname = "x1", formula = 0.2, baseline = TRUE)
- defMlong <- defMiss(defMlong, varname = "y", formula = "-1.5 - 1.5 * rx + .25*period",
- defMlong logit.link = TRUE, baseline = FALSE, monotonic = FALSE)
-
-<- genMiss(dtTime, defMlong, idvars = c("id", "rx"), repeated = TRUE,
- missMatLong periodvar = "period")
+# missingness for y is not monotonic
+
+defMlong <- defMiss(varname = "x1", formula = 0.2, baseline = TRUE)
+defMlong <- defMiss(defMlong, varname = "y", formula = "-1.5 - 1.5 * rx + .25*period",
+ logit.link = TRUE, baseline = FALSE, monotonic = FALSE)
+
+missMatLong <- genMiss(dtTime, defMlong, idvars = c("id", "rx"), repeated = TRUE,
+ periodvar = "period")
Here is a conceptual plot that shows the pattern of missingness. Each
row represents an individual, and each box represents a time period. A
box that is colored reflects missing data; a box colored grey reflects
@@ -508,14 +508,14 @@
Longitudinal data with missingness
In the second case, missingness is monotonic; once a subject misses a
measurement for y
, there are no subsequent
measurements:
-# missingness for y is monotonic
-
-<- defMiss(varname = "x1", formula = 0.2, baseline = TRUE)
- defMlong <- defMiss(defMlong, varname = "y", formula = "-1.8 - 1.5 * rx + .25*period",
- defMlong logit.link = TRUE, baseline = FALSE, monotonic = TRUE)
-
-<- genMiss(dtTime, defMlong, idvars = c("id", "rx"), repeated = TRUE,
- missMatLong periodvar = "period")
+# missingness for y is monotonic
+
+defMlong <- defMiss(varname = "x1", formula = 0.2, baseline = TRUE)
+defMlong <- defMiss(defMlong, varname = "y", formula = "-1.8 - 1.5 * rx + .25*period",
+ logit.link = TRUE, baseline = FALSE, monotonic = TRUE)
+
+missMatLong <- genMiss(dtTime, defMlong, idvars = c("id", "rx"), repeated = TRUE,
+ periodvar = "period")
diff --git a/inst/doc/ordinal.R b/inst/doc/ordinal.R
index b73b0d3..1fac4c4 100644
--- a/inst/doc/ordinal.R
+++ b/inst/doc/ordinal.R
@@ -1,4 +1,7 @@
-## ---- echo = FALSE, message = FALSE-------------------------------------------
+## ----chunkname, echo=-1-------------------------------------------------------
+data.table::setDTthreads(2)
+
+## ----echo = FALSE, message = FALSE--------------------------------------------
library(simstudy)
library(ggplot2)
library(scales)
@@ -159,7 +162,7 @@ summary(clmFit)
## -----------------------------------------------------------------------------
logOdds.expos - logOdds.unexp
-## ---- echo=FALSE, fig.width=6, fig.height=3.5---------------------------------
+## ----echo=FALSE, fig.width=6, fig.height=3.5----------------------------------
getCumProbs <- function(coefs) {
cumprob0 <- data.table(
@@ -264,7 +267,7 @@ dX <- genOrdCat(dT_1_cat, baseprobs = baseprobs, adjVar = "z",
## -----------------------------------------------------------------------------
logOdds.expos - logOdds.unexp
-## ---- echo=FALSE, fig.width=6, fig.height=3.5, warning=FALSE------------------
+## ----echo=FALSE, fig.width=6, fig.height=3.5, warning=FALSE-------------------
fitPlot(dX)
## -----------------------------------------------------------------------------
diff --git a/inst/doc/ordinal.Rmd b/inst/doc/ordinal.Rmd
index 162d5f6..ba365ff 100644
--- a/inst/doc/ordinal.Rmd
+++ b/inst/doc/ordinal.Rmd
@@ -7,6 +7,10 @@ vignette: >
\usepackage[utf8]{inputenc}
---
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
+
```{r, echo = FALSE, message = FALSE}
library(simstudy)
library(ggplot2)
diff --git a/inst/doc/ordinal.html b/inst/doc/ordinal.html
index 041067c..e754852 100644
--- a/inst/doc/ordinal.html
+++ b/inst/doc/ordinal.html
@@ -414,21 +414,21 @@ The cumulative proportional odds model
Simulation
To generate ordered categorical data using simstudy
,
there is a function genOrdCat
.
-<- c(0.31, 0.29, .20, 0.20)
- baseprobs
-<- defData(varname = "exposed", formula = "1;1", dist = "trtAssign")
- defA <- defData(defA, varname = "z", formula = "-0.7*exposed", dist = "nonrandom")
- defA
-set.seed(130)
-
-<- genData(25000, defA)
- dT_1_cat
-<- genOrdCat(dT_1_cat, adjVar = "z", baseprobs, catVar = "r") dX
+baseprobs <- c(0.31, 0.29, .20, 0.20)
+
+defA <- defData(varname = "exposed", formula = "1;1", dist = "trtAssign")
+defA <- defData(defA, varname = "z", formula = "-0.7*exposed", dist = "nonrandom")
+
+set.seed(130)
+
+dT_1_cat <- genData(25000, defA)
+
+dX <- genOrdCat(dT_1_cat, adjVar = "z", baseprobs, catVar = "r")
Estimating the parameters of the model using function
clm
, we can recover the original parameters quite well.
-library(ordinal)
-<- clm(r ~ exposed, data = dX)
- clmFit summary(clmFit)
+
## formula: r ~ exposed
## data: dX
##
@@ -453,16 +453,16 @@ Simulation
match the thresholds for the unexposed group.
The log of the cumulative odds for groups 1 to 4 from the data
without exposure are
-<- log(odds(cumsum(dX[exposed == 0, prop.table(table(r))])))[1:3]) (logOdds.unexp
+
## 1 2 3
## -0.83 0.38 1.36
And under exposure:
-<- log(odds(cumsum(dX[exposed == 1, prop.table(table(r))])))[1:3]) (logOdds.expos
+
## 1 2 3
## -0.084 1.135 2.147
The log of the cumulative odds ratios for each of the four groups
is
-- logOdds.unexp logOdds.expos
+
## 1 2 3
## 0.75 0.76 0.79
A plot of the modeled cumulative probabilities (the lines) shows that
@@ -488,22 +488,22 @@
Non-proportional odds
violated, and npAdj
indicates the various shifts of the
intervals. (Note that the last cut point is at Inf
, so
there is no impact of a shift related to that threshold.)
-<- c(0.31, 0.29, .20, 0.20)
- baseprobs <- c(-0.4, 0.0, -1.7, 0)
- npAdj
-<- genOrdCat(dT_1_cat, baseprobs = baseprobs, adjVar = "z",
- dX catVar = "r", npVar = "exposed", npAdj = npAdj)
+baseprobs <- c(0.31, 0.29, .20, 0.20)
+npAdj <- c(-0.4, 0.0, -1.7, 0)
+
+dX <- genOrdCat(dT_1_cat, baseprobs = baseprobs, adjVar = "z",
+ catVar = "r", npVar = "exposed", npAdj = npAdj)
The calculation of the log cumulative odds follows as before:
-<- log(odds(cumsum(dX[exposed == 0, prop.table(table(r))])))[1:3]) (logOdds.unexp
+
## 1 2 3
## -0.80 0.42 1.40
And under exposure:
-<- log(odds(cumsum(dX[exposed == 1, prop.table(table(r))])))[1:3]) (logOdds.expos
+
## 1 2 3
## 0.29 1.10 3.72
But, now, the log of the cumulative odds ratios for each of the four
groups varies across the different levels.
-- logOdds.unexp logOdds.expos
+
## 1 2 3
## 1.09 0.69 2.32
This is confirmed by a plot of the model fit with a proportional odds
@@ -532,23 +532,23 @@
Correlated multivariate ordinal data
possible responses: “none”, “some”, “a lot”. The probabilities of
response are specified in a \(5 \times
3\) matrix, and the rows sum to 1:
-<- matrix(c(0.2, 0.1, 0.7,
- baseprobs 0.7, 0.2, 0.1,
- 0.5, 0.2, 0.3,
- 0.4, 0.2, 0.4,
- 0.6, 0.2, 0.2),
- nrow = 5, byrow = TRUE)
-
-# generate the data
-
-set.seed(333)
-<- genData(10000)
- dT_5_cat
-<- genOrdCat(dT_5_cat, adjVar = NULL, baseprobs = baseprobs,
- dX prefix = "q", rho = 0.15, corstr = "cs", asFactor = FALSE)
+baseprobs <- matrix(c(0.2, 0.1, 0.7,
+ 0.7, 0.2, 0.1,
+ 0.5, 0.2, 0.3,
+ 0.4, 0.2, 0.4,
+ 0.6, 0.2, 0.2),
+ nrow = 5, byrow = TRUE)
+
+# generate the data
+
+set.seed(333)
+dT_5_cat <- genData(10000)
+
+dX <- genOrdCat(dT_5_cat, adjVar = NULL, baseprobs = baseprobs,
+ prefix = "q", rho = 0.15, corstr = "cs", asFactor = FALSE)
The observed correlation of the items is slightly less than the
specified correlations as expected:
-round(dX[, cor(cbind(q1, q2, q3, q4, q5))], 2)
+
## q1 q2 q3 q4 q5
## q1 1.00 0.08 0.10 0.10 0.08
## q2 0.08 1.00 0.09 0.09 0.09
@@ -557,20 +557,20 @@ Correlated multivariate ordinal data
## q5 0.08 0.09 0.11 0.10 1.00
However, the marginal probability distributions of each item match
quite closely with the specified probabilities:
-<- melt(dX, id.vars = "id")
- dM <- dM[ , prop.table(table(value)), by = variable]
- dProp := rep(seq(3), 5)]
- dProp[, response
-# observed probabilities
-dcast(dProp, variable ~ response, value.var = "V1", fill = 0)
+dM <- melt(dX, id.vars = "id")
+dProp <- dM[ , prop.table(table(value)), by = variable]
+dProp[, response := rep(seq(3), 5)]
+
+# observed probabilities
+dcast(dProp, variable ~ response, value.var = "V1", fill = 0)
## variable 1 2 3
## 1: q1 0.20 0.10 0.70
## 2: q2 0.69 0.21 0.10
## 3: q3 0.50 0.20 0.30
## 4: q4 0.40 0.20 0.40
## 5: q5 0.60 0.20 0.21
-# specified probabilities
- baseprobs
+
## [,1] [,2] [,3]
## [1,] 0.2 0.1 0.7
## [2,] 0.7 0.2 0.1
@@ -581,23 +581,23 @@ Correlated multivariate ordinal data
AR-1, so the correlation between questions closer to each other is
higher than for questions farther apart. But the probability
distributions are unaffected:
-<- genOrdCat(dT_5_cat, adjVar = NULL, baseprobs = baseprobs,
- dX prefix = "q", rho = 0.40, corstr = "ar1", asFactor = FALSE)
-
-# correlation
-round(dX[, cor(cbind(q1, q2, q3, q4, q5))], 2)
+dX <- genOrdCat(dT_5_cat, adjVar = NULL, baseprobs = baseprobs,
+ prefix = "q", rho = 0.40, corstr = "ar1", asFactor = FALSE)
+
+# correlation
+round(dX[, cor(cbind(q1, q2, q3, q4, q5))], 2)
## q1 q2 q3 q4 q5
## q1 1.00 0.22 0.10 0.05 0.02
## q2 0.22 1.00 0.29 0.10 0.03
## q3 0.10 0.29 1.00 0.31 0.11
## q4 0.05 0.10 0.31 1.00 0.29
## q5 0.02 0.03 0.11 0.29 1.00
-<- melt(dX, id.vars = "id")
- dM <- dM[ , prop.table(table(value)), by = variable]
- dProp := rep(seq(3), 5)]
- dProp[, response
-# probabilities
-dcast(dProp, variable ~ response, value.var = "V1", fill = 0)
+dM <- melt(dX, id.vars = "id")
+dProp <- dM[ , prop.table(table(value)), by = variable]
+dProp[, response := rep(seq(3), 5)]
+
+# probabilities
+dcast(dProp, variable ~ response, value.var = "V1", fill = 0)
## variable 1 2 3
## 1: q1 0.20 0.10 0.692
## 2: q2 0.70 0.20 0.099
diff --git a/inst/doc/simstudy.R b/inst/doc/simstudy.R
index 4cdbdc0..f3934d0 100644
--- a/inst/doc/simstudy.R
+++ b/inst/doc/simstudy.R
@@ -1,4 +1,7 @@
-## ---- echo = FALSE, message = FALSE-------------------------------------------
+## ----chunkname, echo=-1-------------------------------------------------------
+data.table::setDTthreads(2)
+
+## ----echo = FALSE, message = FALSE--------------------------------------------
library(simstudy)
library(data.table)
@@ -30,7 +33,7 @@ ggtheme <- function(panelback = "white") {
}
-## ---- echo=FALSE-------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
def <- defData(varname="age", dist="normal", formula=10, variance = 2)
def <- defData(def, varname="female", dist="binary",
formula="-2 + age * 0.1", link = "logit")
@@ -107,7 +110,7 @@ for (i in seq_along(age_effects)) {
list_of_data
-## ---- echo=FALSE-------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
d <- list()
d[[1]] <- data.table("beta", "mean", "both", "-", "dispersion", "X", "-", "X")
d[[2]] <- data.table("binary", "probability", "both", "-", "-", "X", "-", "X")
@@ -169,7 +172,7 @@ dc <- defCondition(dc, condition = "x > 2", formula = "-5 + 4*x",
dd <- genData(1000, d)
dd <- addCondition(dc, dd, newvar = "y")
-## ---- fig.width = 5, fig.height = 3, echo=FALSE, message=FALSE----------------
+## ----fig.width = 5, fig.height = 3, echo=FALSE, message=FALSE-----------------
ggplot(data = dd, aes(y = y, x = x)) +
geom_point(color = " grey60", size = .5) +
geom_smooth(se = FALSE, size = .5) +
diff --git a/inst/doc/simstudy.Rmd b/inst/doc/simstudy.Rmd
index 654a157..a97541b 100644
--- a/inst/doc/simstudy.Rmd
+++ b/inst/doc/simstudy.Rmd
@@ -7,6 +7,10 @@ vignette: >
\usepackage[utf8]{inputenc}
---
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
+
```{r, echo = FALSE, message = FALSE}
library(simstudy)
diff --git a/inst/doc/simstudy.html b/inst/doc/simstudy.html
index 5ca6b89..fe38f3c 100644
--- a/inst/doc/simstudy.html
+++ b/inst/doc/simstudy.html
@@ -425,12 +425,12 @@ Defining the Data
can be read in with a call to defRead
. An alternative is to
make repeated calls to the function defData
. This script
builds a definition table internally:
-<- defData(varname = "age", dist = "normal", formula = 10,
- def variance = 2)
- <- defData(def, varname = "female", dist = "binary",
- def formula = "-2 + age * 0.1", link = "logit")
- <- defData(def, varname = "visits", dist = "poisson",
- def formula = "1.5 - 0.2 * age + 0.5 * female", link = "log")
+def <- defData(varname = "age", dist = "normal", formula = 10,
+ variance = 2)
+def <- defData(def, varname = "female", dist = "binary",
+ formula = "-2 + age * 0.1", link = "logit")
+def <- defData(def, varname = "visits", dist = "poisson",
+ formula = "1.5 - 0.2 * age + 0.5 * female", link = "log")
The data definition table includes a row for each variable that is to
be generated, and has the following fields: varname*
,
formula
, variance
, dist
, and
@@ -462,10 +462,10 @@
Generating the data
example, 1,000 observations are generated using the data set definitions
in def
, and then stored in the object
dd
:
-set.seed(87261)
-
-<- genData(1000, def)
- dd dd
+
## id age female visits
## 1: 1 9.78 0 0
## 2: 2 10.81 0 0
@@ -480,7 +480,7 @@ Generating the data
## 1000: 1000 10.80 1 2
If no data definition is provided, a simple data set is created with
just id’s.
-genData(1000)
+
## id
## 1: 1
## 2: 2
@@ -503,9 +503,9 @@ Assigning treatment/exposure
generate more involved types of study designs. In particular, with
trtAssign
, balanced and stratified designs are
possible.
-<- trtAssign(dd, n = 3, balanced = TRUE, strata = c("female"),
- study1 grpName = "rx")
- study1
+
## id age female visits rx
## 1: 1 9.78 0 0 3
## 2: 2 10.81 0 0 1
@@ -518,7 +518,7 @@ Assigning treatment/exposure
## 998: 998 6.84 0 1 1
## 999: 999 9.28 0 2 1
## 1000: 1000 10.80 1 2 3
-= .(female, rx)] study1[, .N, keyby
+
## female rx N
## 1: 0 1 249
## 2: 0 2 248
@@ -548,26 +548,26 @@ Formulas
function of both age and female. Since
age is the first row in the table, we had to use a
scalar to define the mean.
-<- defData(varname = "age", dist = "normal", formula = 10,
- def variance = 2)
- <- defData(def, varname = "female", dist = "binary",
- def formula = "-2 + age * 0.1", link = "logit")
- <- defData(def, varname = "visits", dist = "poisson",
- def formula = "1.5 - 0.2 * age + 0.5 * female", link = "log")
+def <- defData(varname = "age", dist = "normal", formula = 10,
+ variance = 2)
+def <- defData(def, varname = "female", dist = "binary",
+ formula = "-2 + age * 0.1", link = "logit")
+def <- defData(def, varname = "visits", dist = "poisson",
+ formula = "1.5 - 0.2 * age + 0.5 * female", link = "log")
Formulas can include R
functions or user-defined
functions. Here is an example with a user-defined function
myinv
and the log
function from base
R
:
-<- function(x) {
- myinv 1/x
-
- }
-<- defData(varname = "age", formula = 10, variance = 2,
- def dist = "normal")
- <- defData(def, varname = "loginvage", formula = "log(myinv(age))",
- def variance = 0.1, dist = "normal")
-
-genData(5, def)
+myinv <- function(x) {
+ 1/x
+}
+
+def <- defData(varname = "age", formula = 10, variance = 2,
+ dist = "normal")
+def <- defData(def, varname = "loginvage", formula = "log(myinv(age))",
+ variance = 0.1, dist = "normal")
+
+genData(5, def)
## id age loginvage
## 1: 1 10.31 -2.58
## 2: 2 7.90 -1.94
@@ -581,12 +581,12 @@ Formulas
of a data definition table. In this case, we are changing the formula of
loginvage in def so that it uses the
function log10
instead of log
:
-<- updateDef(def, changevar = "loginvage", newformula = "log10(myinv(age))")
- def10 def10
+
## varname formula variance dist link
## 1: age 10 2 normal identity
## 2: loginvage log10(myinv(age)) 0.1 normal identity
-genData(5, def10)
+
## id age loginvage
## 1: 1 9.82 -0.338
## 2: 2 10.97 -0.633
@@ -597,32 +597,32 @@ Formulas
dynamic definition tables is the external reference capability through
the double-dot notation. Any variable reference in a formula
that is preceded by “..” refers to an externally defined variable:
-<- 3
- age_effect
-<- defData(varname = "age", formula = 10, variance = 2,
- def dist = "normal")
- <- defData(def, varname = "agemult", formula = "age * ..age_effect",
- def dist = "nonrandom")
-
- def
+age_effect <- 3
+
+def <- defData(varname = "age", formula = 10, variance = 2,
+ dist = "normal")
+def <- defData(def, varname = "agemult", formula = "age * ..age_effect",
+ dist = "nonrandom")
+
+def
## varname formula variance dist link
## 1: age 10 2 normal identity
## 2: agemult age * ..age_effect 0 nonrandom identity
-genData(2, def)
+
## id age agemult
## 1: 1 9.69 29.1
## 2: 2 9.63 28.9
But the real power of dynamic definition is seen in the context of
iteration over multiple values:
-<- c(0, 5, 10)
- age_effects <- list()
- list_of_data
-for (i in seq_along(age_effects)) {
-<- age_effects[i]
- age_effect <- genData(2, def)
- list_of_data[[i]]
- }
- list_of_data
+age_effects <- c(0, 5, 10)
+list_of_data <- list()
+
+for (i in seq_along(age_effects)) {
+ age_effect <- age_effects[i]
+ list_of_data[[i]] <- genData(2, def)
+}
+
+list_of_data
## [[1]]
## id age agemult
## 1: 1 11.4 0
@@ -1026,14 +1026,14 @@ Generating multiple variables with a single definition
specified as they are in the defData
function. Calls to
defRepeat
can be integrated with calls to
defData
.
-<- defRepeat(nVars = 4, prefix = "g", formula = "1/3;1/3;1/3",
- def variance = 0, dist = "categorical")
- <- defData(def, varname = "a", formula = "1;1", dist = "trtAssign")
- def <- defRepeat(def, 3, "b", formula = "5 + a", variance = 3,
- def dist = "normal")
- <- defData(def, "y", formula = "0.10", dist = "binary")
- def
- def
+def <- defRepeat(nVars = 4, prefix = "g", formula = "1/3;1/3;1/3",
+ variance = 0, dist = "categorical")
+def <- defData(def, varname = "a", formula = "1;1", dist = "trtAssign")
+def <- defRepeat(def, 3, "b", formula = "5 + a", variance = 3,
+ dist = "normal")
+def <- defData(def, "y", formula = "0.10", dist = "binary")
+
+def
## varname formula variance dist link
## 1: g1 1/3;1/3;1/3 0 categorical identity
## 2: g2 1/3;1/3;1/3 0 categorical identity
@@ -1067,26 +1067,26 @@ defDataAdd/defRepeatAdd/readDataAdd and addColumns
in these “add-ing” functions are permitted to refer to fields
that exist in the data set to be augmented, so all variables need not be
defined in the current definition able.
-<- defData(varname = "x1", formula = 0, variance = 1,
- d1 dist = "normal")
- <- defData(d1, varname = "x2", formula = 0.5, dist = "binary")
- d1
-<- defRepeatAdd(nVars = 2, prefix = "q", formula = "5 + 3*rx",
- d2 variance = 4, dist = "normal")
- <- defDataAdd(d2, varname = "y", formula = "-2 + 0.5*x1 + 0.5*x2 + 1*rx",
- d2 dist = "binary", link = "logit")
-
-<- genData(5, d1)
- dd <- trtAssign(dd, nTrt = 2, grpName = "rx")
- dd dd
+d1 <- defData(varname = "x1", formula = 0, variance = 1,
+ dist = "normal")
+d1 <- defData(d1, varname = "x2", formula = 0.5, dist = "binary")
+
+d2 <- defRepeatAdd(nVars = 2, prefix = "q", formula = "5 + 3*rx",
+ variance = 4, dist = "normal")
+d2 <- defDataAdd(d2, varname = "y", formula = "-2 + 0.5*x1 + 0.5*x2 + 1*rx",
+ dist = "binary", link = "logit")
+
+dd <- genData(5, d1)
+dd <- trtAssign(dd, nTrt = 2, grpName = "rx")
+dd
## id x1 x2 rx
## 1: 1 -1.3230 1 0
## 2: 2 -0.0494 0 1
## 3: 3 -0.4064 1 0
## 4: 4 -0.5562 1 0
## 5: 5 -0.0941 0 1
-<- addColumns(d2, dd)
- dd dd
+
## id x1 x2 rx q1 q2 y
## 1: 1 -1.3230 1 0 4.589 5.70 0
## 2: 2 -0.0494 0 1 9.829 11.74 1
@@ -1108,17 +1108,17 @@ defCondition and addCondition
variance
, and link
arguments.
In this example, the slope of a regression line of \(y\) on \(x\) varies depending on the value of the
predictor \(x\):
-<- defData(varname = "x", formula = 0, variance = 9, dist = "normal")
- d
-<- defCondition(condition = "x <= -2", formula = "4 + 3*x",
- dc variance = 2, dist = "normal")
- <- defCondition(dc, condition = "x > -2 & x <= 2", formula = "0 + 1*x",
- dc variance = 4, dist = "normal")
- <- defCondition(dc, condition = "x > 2", formula = "-5 + 4*x",
- dc variance = 3, dist = "normal")
-
-<- genData(1000, d)
- dd <- addCondition(dc, dd, newvar = "y") dd
+d <- defData(varname = "x", formula = 0, variance = 9, dist = "normal")
+
+dc <- defCondition(condition = "x <= -2", formula = "4 + 3*x",
+ variance = 2, dist = "normal")
+dc <- defCondition(dc, condition = "x > -2 & x <= 2", formula = "0 + 1*x",
+ variance = 4, dist = "normal")
+dc <- defCondition(dc, condition = "x > 2", formula = "-5 + 4*x",
+ variance = 3, dist = "normal")
+
+dd <- genData(1000, d)
+dd <- addCondition(dc, dd, newvar = "y")
diff --git a/inst/doc/spline.R b/inst/doc/spline.R
index dd446ee..15c6adf 100644
--- a/inst/doc/spline.R
+++ b/inst/doc/spline.R
@@ -1,4 +1,7 @@
-## ---- echo = FALSE, message = FALSE-------------------------------------------
+## ----chunkname, echo=-1-------------------------------------------------------
+data.table::setDTthreads(2)
+
+## ----echo = FALSE, message = FALSE--------------------------------------------
library(simstudy)
library(ggplot2)
library(scales)
@@ -29,7 +32,7 @@ ggtheme <- function(panelback = "white") {
}
-## ---- fig.width = 6, fig.height = 2.5-----------------------------------------
+## ----fig.width = 6, fig.height = 2.5------------------------------------------
knots <- c(0.25, 0.50, 0.75)
viewBasis(knots, degree = 2)
@@ -37,7 +40,7 @@ knots <- c(0.20, 0.40, 0.60, 0.80)
viewBasis(knots, degree = 3)
-## ---- fig.width = 6, fig.height = 2.5-----------------------------------------
+## ----fig.width = 6, fig.height = 2.5------------------------------------------
knots <- c(0.25, 0.5, 0.75)
# number of elements in theta: length(knots) + degree + 1
@@ -61,7 +64,7 @@ ddef <- defData(varname = "age", formula = "20;60", dist = "uniform")
theta1 = c(0.1, 0.8, 0.6, 0.4, 0.6, 0.9, 0.9)
knots <- c(0.25, 0.5, 0.75)
-## ---- fig.width = 6, fig.height = 2.5-----------------------------------------
+## ----fig.width = 6, fig.height = 2.5------------------------------------------
viewSplines(knots = knots, theta = theta1, degree = 3)
## -----------------------------------------------------------------------------
@@ -74,14 +77,14 @@ dt <- genSpline(dt = dt, newvar = "weight",
newrange = "90;160",
noise.var = 64)
-## ---- fig.width = 6, fig.height = 3, message = FALSE--------------------------
+## ----fig.width = 6, fig.height = 3, message = FALSE---------------------------
ggplot(data = dt, aes(x=age, y=weight)) +
geom_point(color = "grey65", size = 0.75) +
geom_smooth(se=FALSE, color="red", size = 1, method = "auto") +
geom_vline(xintercept = quantile(dt$age, knots)) +
theme(panel.grid.minor = element_blank())
-## ---- fig.width = 6, fig.height = 3-------------------------------------------
+## ----fig.width = 6, fig.height = 3--------------------------------------------
# normalize age for best basis functions
dt[, nage := (age - min(age))/(max(age) - min(age))]
diff --git a/inst/doc/spline.Rmd b/inst/doc/spline.Rmd
index 5a18e33..8d07ae4 100644
--- a/inst/doc/spline.Rmd
+++ b/inst/doc/spline.Rmd
@@ -7,6 +7,10 @@ vignette: >
\usepackage[utf8]{inputenc}
---
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
+
```{r, echo = FALSE, message = FALSE}
library(simstudy)
library(ggplot2)
diff --git a/inst/doc/spline.html b/inst/doc/spline.html
index f5a050a..f16375a 100644
--- a/inst/doc/spline.html
+++ b/inst/doc/spline.html
@@ -358,11 +358,11 @@ Spline Data
First, we can look at the basis functions, which depend only the
knots and degree. The knots are specified as quantiles, between 0 and
1:
-<- c(0.25, 0.50, 0.75)
- knots viewBasis(knots, degree = 2)
+
-<- c(0.20, 0.40, 0.60, 0.80)
- knots viewBasis(knots, degree = 3)
+
The splines themselves are specified as linear combinations of each
of the basis functions. The coefficients of those combinations are
@@ -371,19 +371,19 @@
Spline Data
exploring, we can look at a single curve or multiple curves, depending
on whether or not we specify theta as a vector (single) or matrix
(multiple).
-<- c(0.25, 0.5, 0.75)
- knots
-# number of elements in theta: length(knots) + degree + 1
-= c(0.1, 0.8, 0.4, 0.9, 0.2, 1.0)
- theta1
-viewSplines(knots, degree = 2, theta1)
+knots <- c(0.25, 0.5, 0.75)
+
+# number of elements in theta: length(knots) + degree + 1
+theta1 = c(0.1, 0.8, 0.4, 0.9, 0.2, 1.0)
+
+viewSplines(knots, degree = 2, theta1)
-= matrix(c(0.1, 0.2, 0.4, 0.9, 0.2, 0.3, 0.6,
- theta2 0.1, 0.3, 0.3, 0.8, 1.0, 0.9, 0.4,
- 0.1, 0.9, 0.8, 0.2, 0.1, 0.6, 0.1),
- ncol = 3)
-
- theta2
+theta2 = matrix(c(0.1, 0.2, 0.4, 0.9, 0.2, 0.3, 0.6,
+ 0.1, 0.3, 0.3, 0.8, 1.0, 0.9, 0.4,
+ 0.1, 0.9, 0.8, 0.2, 0.1, 0.6, 0.1),
+ ncol = 3)
+
+theta2
## [,1] [,2] [,3]
## [1,] 0.1 0.1 0.1
## [2,] 0.2 0.3 0.9
@@ -392,63 +392,63 @@ Spline Data
## [5,] 0.2 1.0 0.1
## [6,] 0.3 0.9 0.6
## [7,] 0.6 0.4 0.1
-viewSplines(knots, degree = 3, theta2)
+
We can generate data using a predictor in an existing data set by
specifying the knots (in terms of quantiles), a vector of
coefficients in theta, the degree of the polynomial, as well as
a range
-<- defData(varname = "age", formula = "20;60", dist = "uniform")
- ddef
-= c(0.1, 0.8, 0.6, 0.4, 0.6, 0.9, 0.9)
- theta1 <- c(0.25, 0.5, 0.75) knots
+ddef <- defData(varname = "age", formula = "20;60", dist = "uniform")
+
+theta1 = c(0.1, 0.8, 0.6, 0.4, 0.6, 0.9, 0.9)
+knots <- c(0.25, 0.5, 0.75)
Here is the shape of the curve that we want to generate data
from:
-viewSplines(knots = knots, theta = theta1, degree = 3)
+
Now we specify the variables in the data set and generate the
data:
-set.seed(234)
-
-<- genData(1000, ddef)
- dt <- genSpline(dt = dt, newvar = "weight",
- dt predictor = "age", theta = theta1,
- knots = knots, degree = 3,
- newrange = "90;160",
- noise.var = 64)
+set.seed(234)
+
+dt <- genData(1000, ddef)
+dt <- genSpline(dt = dt, newvar = "weight",
+ predictor = "age", theta = theta1,
+ knots = knots, degree = 3,
+ newrange = "90;160",
+ noise.var = 64)
Here’s a plot of the data with a smoothed line fit to the data:
-ggplot(data = dt, aes(x=age, y=weight)) +
-geom_point(color = "grey65", size = 0.75) +
- geom_smooth(se=FALSE, color="red", size = 1, method = "auto") +
- geom_vline(xintercept = quantile(dt$age, knots)) +
- theme(panel.grid.minor = element_blank())
+ggplot(data = dt, aes(x=age, y=weight)) +
+ geom_point(color = "grey65", size = 0.75) +
+ geom_smooth(se=FALSE, color="red", size = 1, method = "auto") +
+ geom_vline(xintercept = quantile(dt$age, knots)) +
+ theme(panel.grid.minor = element_blank())
Finally, we will fit three different spline models to the data - a
linear, a quadratic, and a cubic - and plot the predicted values:
-# normalize age for best basis functions
-:= (age - min(age))/(max(age) - min(age))]
- dt[, nage
-# fit a cubic spline
-<- lm(weight ~ bs(x = nage, knots = knots, degree = 3, intercept = TRUE) - 1, data = dt)
- lmfit3
-# fit a quadtratic spline
-<- lm(weight ~ bs(x = nage, knots = knots, degree = 2), data = dt)
- lmfit2
-# fit a linear spline
-<- lm(weight ~ bs(x = nage, knots = knots, degree = 1), data = dt)
- lmfit1
-# add predicted values for plotting
-.3deg := predict(lmfit3)]
- dt[, pred.2deg := predict(lmfit2)]
- dt[, pred.1deg := predict(lmfit1)]
- dt[, pred
-ggplot(data = dt, aes(x=age, y=weight)) +
-geom_point(color = "grey65", size = 0.75) +
- geom_line(aes(x=age, y = pred.3deg), color = "#1B9E77", size = 1) +
- geom_line(aes(x=age, y = pred.2deg), color = "#D95F02", size = 1) +
- geom_line(aes(x=age, y = pred.1deg), color = "#7570B3", size = 1) +
- geom_vline(xintercept = quantile(dt$age, knots)) +
- theme(panel.grid.minor = element_blank())
+# normalize age for best basis functions
+dt[, nage := (age - min(age))/(max(age) - min(age))]
+
+# fit a cubic spline
+lmfit3 <- lm(weight ~ bs(x = nage, knots = knots, degree = 3, intercept = TRUE) - 1, data = dt)
+
+# fit a quadtratic spline
+lmfit2 <- lm(weight ~ bs(x = nage, knots = knots, degree = 2), data = dt)
+
+# fit a linear spline
+lmfit1 <- lm(weight ~ bs(x = nage, knots = knots, degree = 1), data = dt)
+
+# add predicted values for plotting
+dt[, pred.3deg := predict(lmfit3)]
+dt[, pred.2deg := predict(lmfit2)]
+dt[, pred.1deg := predict(lmfit1)]
+
+ggplot(data = dt, aes(x=age, y=weight)) +
+ geom_point(color = "grey65", size = 0.75) +
+ geom_line(aes(x=age, y = pred.3deg), color = "#1B9E77", size = 1) +
+ geom_line(aes(x=age, y = pred.2deg), color = "#D95F02", size = 1) +
+ geom_line(aes(x=age, y = pred.1deg), color = "#7570B3", size = 1) +
+ geom_vline(xintercept = quantile(dt$age, knots)) +
+ theme(panel.grid.minor = element_blank())
diff --git a/inst/doc/survival.R b/inst/doc/survival.R
index c5b455a..233350e 100644
--- a/inst/doc/survival.R
+++ b/inst/doc/survival.R
@@ -1,4 +1,7 @@
-## ---- echo = FALSE, message = FALSE-------------------------------------------
+## ----chunkname, echo=-1-------------------------------------------------------
+data.table::setDTthreads(2)
+
+## ----echo = FALSE, message = FALSE--------------------------------------------
library(simstudy)
# library(ggplot2)
library(scales)
@@ -8,7 +11,7 @@ library(survival)
library(gee)
library(data.table)
-## ---- echo = FALSE------------------------------------------------------------
+## ----echo = FALSE-------------------------------------------------------------
plotcolors <- c("#B84226", "#1B8445", "#1C5974")
cbbPalette <- c("#B84226","#B88F26", "#A5B435", "#1B8446",
@@ -29,7 +32,7 @@ ggtheme <- function(panelback = "white") {
}
-## ---- tidy = TRUE-------------------------------------------------------------
+## ----tidy = TRUE--------------------------------------------------------------
# Baseline data definitions
@@ -46,7 +49,7 @@ sdef <- defSurv(sdef, varname = "censorTime", scale = 80, shape = 1)
sdef
-## ---- tidy = TRUE-------------------------------------------------------------
+## ----tidy = TRUE--------------------------------------------------------------
# Baseline data definitions
@@ -60,7 +63,7 @@ head(dtSurv)
dtSurv[,round(mean(survTime),1), keyby = .(grp,x1)]
-## ---- tidy = TRUE-------------------------------------------------------------
+## ----tidy = TRUE--------------------------------------------------------------
dtSurv <- genData(300, def)
dtSurv <- genSurv(dtSurv, sdef, timeName = "obsTime",
censorName = "censorTime", eventName = "status",
@@ -72,7 +75,7 @@ head(dtSurv)
dtSurv[,round(1-mean(status),2), keyby = .(grp,x1)]
-## ---- tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 3.5, warning=FALSE----
+## ----tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 3.5, warning=FALSE----
fit <- survfit(Surv(obsTime, status) ~ x1+grp, data=dtSurv)
survminer::ggsurvplot(fit, data = dtSurv,
@@ -83,7 +86,7 @@ survminer::ggsurvplot(fit, data = dtSurv,
# panel.grid = ggplot2::element_blank())
)
-## ---- tidy = TRUE-------------------------------------------------------------
+## ----tidy = TRUE--------------------------------------------------------------
# Baseline data definitions
@@ -101,7 +104,7 @@ dtSurv <- genSurv(dtSurv, sdef, timeName = "obsTime",
coxfit <- survival::coxph(Surv(obsTime, status) ~ x1 + x2, data = dtSurv)
-## ---- echo=FALSE--------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
gtsummary::tbl_regression(coxfit)
## -----------------------------------------------------------------------------
@@ -122,7 +125,7 @@ dtSurv <- addCompRisk(dtSurv, events = c("event_1", "event_2", "censor"),
timeName = "time", censorName = "censor")
dtSurv
-## ---- tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 3.5, warning=FALSE----
+## ----tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 3.5, warning=FALSE----
fit <- survfit(Surv(time, event, type="mstate") ~ 1, data=dtSurv)
survminer::ggcompetingrisks(fit, ggtheme = ggtheme("grey94")) +
ggplot2::scale_fill_manual(values = cbbPalette)
@@ -132,7 +135,7 @@ dtSurv <- genData(101, d1)
dtSurv <- genSurv(dtSurv, dS, timeName = "time", censorName = "censor")
dtSurv
-## ---- tidy = TRUE-------------------------------------------------------------
+## ----tidy = TRUE--------------------------------------------------------------
def <- defData(varname = "x", formula = .4, dist="binary")
defS <- defSurv(varname = "death", formula = "-14.6 - 0.7*x", shape = .35)
@@ -143,7 +146,7 @@ dd <- genSurv(dd, defS, digits = 2, timeName = "time", censorName = "censor")
fit <- survfit( Surv(time, event) ~ x, data = dd )
-## ---- tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 3.5, warning=FALSE----
+## ----tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 3.5, warning=FALSE----
survminer::ggsurvplot(fit, data = dd,
ggtheme = ggtheme("grey94"),
palette = cbbPalette
@@ -152,13 +155,13 @@ survminer::ggsurvplot(fit, data = dd,
## -----------------------------------------------------------------------------
coxfit <- coxph(formula = Surv(time, event) ~ x, data = dd)
-## ---- echo=FALSE--------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
gtsummary::tbl_regression(coxfit)
## -----------------------------------------------------------------------------
cox.zph(coxfit)
-## ---- tidy = TRUE-------------------------------------------------------------
+## ----tidy = TRUE--------------------------------------------------------------
def <- defData(varname = "x", formula = .4, dist="binary")
defS <- defSurv(varname = "death", formula = "-14.6 - 1.3*x", shape = .35, transition = 0)
@@ -170,7 +173,7 @@ dd <- genSurv(dd, defS, digits = 2, timeName = "time", censorName = "censor")
fit <- survfit( Surv(time, event) ~ x, data = dd )
-## ---- tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 3.5, warning=FALSE----
+## ----tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 3.5, warning=FALSE----
survminer::ggsurvplot(fit, data = dd,
ggtheme = ggtheme("grey94"),
palette = cbbPalette
@@ -179,7 +182,7 @@ survminer::ggsurvplot(fit, data = dd,
## -----------------------------------------------------------------------------
coxfit <- survival::coxph(formula = Surv(time, event) ~ x, data = dd)
-## ---- echo=FALSE--------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
gtsummary::tbl_regression(coxfit)
## -----------------------------------------------------------------------------
@@ -191,7 +194,7 @@ dd2 <- survSplit(Surv(time, event) ~ ., data= dd, cut=c(150),
coxfit2 <- survival::coxph(Surv(tstart, time, event) ~ x:strata(tgroup), data=dd2)
-## ---- echo=FALSE--------------------------------------------------------------
+## ----echo=FALSE---------------------------------------------------------------
gtsummary::tbl_regression(coxfit2)
## -----------------------------------------------------------------------------
@@ -202,7 +205,7 @@ points <- list(c(100, 0.80), c(200, 0.10))
r <- survGetParams(points)
r
-## ---- tidy = TRUE, fig.width = 6.5, fig.height = 3.5, warning=FALSE-----------
+## ----tidy = TRUE, fig.width = 6.5, fig.height = 3.5, warning=FALSE------------
survParamPlot(f = r[1], shape = r[2], points)
## -----------------------------------------------------------------------------
@@ -213,7 +216,7 @@ defS <- defSurv(defS, varname = "censor", formula = 0, scale = exp(18.5), shape
dd <- genData(500)
dd <- genSurv(dd, defS, timeName = "time", censorName = "censor")
-## ---- tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 3.5, warning=FALSE----
+## ----tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 3.5, warning=FALSE----
fit <- survfit( Surv(time, event) ~ 1, data = dd )
survminer::ggsurvplot(fit, data = dd,
diff --git a/inst/doc/survival.Rmd b/inst/doc/survival.Rmd
index c59f1b6..25b48ab 100644
--- a/inst/doc/survival.Rmd
+++ b/inst/doc/survival.Rmd
@@ -7,6 +7,9 @@ vignette: >
\usepackage[utf8]{inputenc}
---
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
```{r, echo = FALSE, message = FALSE}
diff --git a/inst/doc/survival.html b/inst/doc/survival.html
index 43d4e27..01b8caa 100644
--- a/inst/doc/survival.html
+++ b/inst/doc/survival.html
@@ -382,31 +382,31 @@ Generating standard survival data with censoring
Here is an example showing how to generate data with covariates. In
this case the scale and shape parameters will vary by group
membership.
-# Baseline data definitions
-
-<- defData(varname = "x1", formula = 0.5, dist = "binary")
- def <- defData(def, varname = "grp", formula = 0.5, dist = "binary")
- def
-# Survival data definitions
-
-set.seed(282716)
-
-<- defSurv(varname = "survTime", formula = "1.5*x1", scale = "grp*50 + (1-grp)*25",
- sdef shape = "grp*1 + (1-grp)*1.5")
- <- defSurv(sdef, varname = "censorTime", scale = 80, shape = 1)
- sdef
- sdef
+# Baseline data definitions
+
+def <- defData(varname = "x1", formula = 0.5, dist = "binary")
+def <- defData(def, varname = "grp", formula = 0.5, dist = "binary")
+
+# Survival data definitions
+
+set.seed(282716)
+
+sdef <- defSurv(varname = "survTime", formula = "1.5*x1", scale = "grp*50 + (1-grp)*25",
+ shape = "grp*1 + (1-grp)*1.5")
+sdef <- defSurv(sdef, varname = "censorTime", scale = 80, shape = 1)
+
+sdef
## varname formula scale shape transition
## 1: censorTime 0 80 1 0
## 2: survTime 1.5*x1 grp*50 + (1-grp)*25 grp*1 + (1-grp)*1.5 0
The data are generated with calls to genData
and
genSurv
:
-# Baseline data definitions
-
-<- genData(300, def)
- dtSurv <- genSurv(dtSurv, sdef)
- dtSurv
-head(dtSurv)
+# Baseline data definitions
+
+dtSurv <- genData(300, def)
+dtSurv <- genSurv(dtSurv, sdef)
+
+head(dtSurv)
## id x1 grp censorTime survTime
## 1: 1 0 0 42.9 9.88
## 2: 2 0 1 77.2 17.34
@@ -414,9 +414,9 @@ Generating standard survival data with censoring
## 4: 4 1 1 181.9 16.47
## 5: 5 1 0 210.9 108.28
## 6: 6 0 1 34.1 8.12
-# A comparison of survival by group and x1
-
-round(mean(survTime), 1), keyby = .(grp, x1)] dtSurv[,
+# A comparison of survival by group and x1
+
+dtSurv[, round(mean(survTime), 1), keyby = .(grp, x1)]
## grp x1 V1
## 1: 0 0 149.2
## 2: 0 1 23.4
@@ -425,11 +425,11 @@ Generating standard survival data with censoring
Observed survival times and censoring indicators can be generated
using the competing risk functionality and specifying a censoring
variable:
-<- genData(300, def)
- dtSurv <- genSurv(dtSurv, sdef, timeName = "obsTime", censorName = "censorTime",
- dtSurv eventName = "status", keepEvents = TRUE)
-
-head(dtSurv)
+dtSurv <- genData(300, def)
+dtSurv <- genSurv(dtSurv, sdef, timeName = "obsTime", censorName = "censorTime",
+ eventName = "status", keepEvents = TRUE)
+
+head(dtSurv)
## id x1 grp censorTime survTime obsTime status type
## 1: 1 0 1 92.4 49.071 49.071 1 survTime
## 2: 2 1 0 45.8 25.639 25.639 1 survTime
@@ -437,9 +437,9 @@ Generating standard survival data with censoring
## 4: 4 0 0 12.7 13.325 12.663 0 censorTime
## 5: 5 0 0 26.5 323.764 26.531 0 censorTime
## 6: 6 1 0 23.5 0.157 0.157 1 survTime
-# estimate proportion of censoring by x1 and group
-
-round(1 - mean(status), 2), keyby = .(grp, x1)] dtSurv[,
+# estimate proportion of censoring by x1 and group
+
+dtSurv[, round(1 - mean(status), 2), keyby = .(grp, x1)]
## grp x1 V1
## 1: 0 0 0.71
## 2: 0 1 0.10
@@ -449,21 +449,21 @@ Generating standard survival data with censoring
Here is a survival analysis (using a Cox proportional hazard model)
of a slightly simplified data set with two baseline covariates only:
-# Baseline data definitions
-
-<- defData(varname = "x1", formula = 0.5, dist = "binary")
- def <- defData(def, varname = "x2", formula = 0.5, dist = "binary")
- def
-# Survival data definitions
-
-<- defSurv(varname = "survTime", formula = "1.5*x1 - .8*x2", scale = 50, shape = 1/2)
- sdef <- defSurv(sdef, varname = "censorTime", scale = 80, shape = 1)
- sdef
-<- genData(300, def)
- dtSurv <- genSurv(dtSurv, sdef, timeName = "obsTime", censorName = "censorTime",
- dtSurv eventName = "status")
-
-<- survival::coxph(Surv(obsTime, status) ~ x1 + x2, data = dtSurv) coxfit
+# Baseline data definitions
+
+def <- defData(varname = "x1", formula = 0.5, dist = "binary")
+def <- defData(def, varname = "x2", formula = 0.5, dist = "binary")
+
+# Survival data definitions
+
+sdef <- defSurv(varname = "survTime", formula = "1.5*x1 - .8*x2", scale = 50, shape = 1/2)
+sdef <- defSurv(sdef, varname = "censorTime", scale = 80, shape = 1)
+
+dtSurv <- genData(300, def)
+dtSurv <- genSurv(dtSurv, sdef, timeName = "obsTime", censorName = "censorTime",
+ eventName = "status")
+
+coxfit <- survival::coxph(Surv(obsTime, status) ~ x1 + x2, data = dtSurv)
The 95% confidence intervals of the parameter estimates include the
values used to generate the data:
@@ -899,64 +899,64 @@ Competing risks
addCompRisk
that will generate the competing risk
information using an existing data set. The example here will take that
approach.
-<- defData(varname = "x1", formula = .5, dist = "binary")
- d1 <- defData(d1, "x2", .5, dist = "binary")
- d1
-<- defSurv(varname = "event_1", formula = "-10 - 0.6*x1 + 0.4*x2", shape = 0.3)
- dS <- defSurv(dS, "event_2", "-6.5 + 0.3*x1 - 0.5*x2", shape = 0.5)
- dS <- defSurv(dS, "censor", "-7", shape = 0.55)
- dS
-<- genData(1001, d1)
- dtSurv <- genSurv(dtSurv, dS)
- dtSurv
- dtSurv
+d1 <- defData(varname = "x1", formula = .5, dist = "binary")
+d1 <- defData(d1, "x2", .5, dist = "binary")
+
+dS <- defSurv(varname = "event_1", formula = "-10 - 0.6*x1 + 0.4*x2", shape = 0.3)
+dS <- defSurv(dS, "event_2", "-6.5 + 0.3*x1 - 0.5*x2", shape = 0.5)
+dS <- defSurv(dS, "censor", "-7", shape = 0.55)
+
+dtSurv <- genData(1001, d1)
+dtSurv <- genSurv(dtSurv, dS)
+
+dtSurv
## id x1 x2 censor event_1 event_2
-## 1: 1 0 0 81.38 32.80 21.85
-## 2: 2 0 0 44.71 11.22 28.56
-## 3: 3 0 1 5.61 22.91 52.99
-## 4: 4 1 1 14.85 22.18 4.72
-## 5: 5 0 0 43.83 13.58 36.07
+## 1: 1 0 1 39.66 17.8 23.62
+## 2: 2 1 0 101.14 24.1 32.41
+## 3: 3 1 0 13.18 40.5 36.50
+## 4: 4 0 0 26.97 18.1 45.12
+## 5: 5 0 1 15.32 14.2 37.96
## ---
-## 997: 997 1 1 14.61 14.76 26.14
-## 998: 998 1 1 30.19 6.42 16.13
-## 999: 999 1 0 7.54 26.09 20.33
-## 1000: 1000 1 0 19.50 27.08 6.43
-## 1001: 1001 1 0 31.74 11.95 22.59
-<- addCompRisk(dtSurv, events = c("event_1", "event_2", "censor"),
- dtSurv timeName = "time", censorName = "censor")
- dtSurv
+## 997: 997 0 0 7.04 16.5 28.58
+## 998: 998 0 0 25.29 19.8 51.29
+## 999: 999 0 0 45.01 14.9 29.27
+## 1000: 1000 1 0 11.11 15.4 7.78
+## 1001: 1001 1 0 69.18 24.3 18.32
+dtSurv <- addCompRisk(dtSurv, events = c("event_1", "event_2", "censor"),
+ timeName = "time", censorName = "censor")
+dtSurv
## id x1 x2 time event type
-## 1: 1 0 0 21.85 2 event_2
-## 2: 2 0 0 11.22 1 event_1
-## 3: 3 0 1 5.61 0 censor
-## 4: 4 1 1 4.72 2 event_2
-## 5: 5 0 0 13.58 1 event_1
+## 1: 1 0 1 17.78 1 event_1
+## 2: 2 1 0 24.09 1 event_1
+## 3: 3 1 0 13.18 0 censor
+## 4: 4 0 0 18.08 1 event_1
+## 5: 5 0 1 14.17 1 event_1
## ---
-## 997: 997 1 1 14.61 0 censor
-## 998: 998 1 1 6.42 1 event_1
-## 999: 999 1 0 7.54 0 censor
-## 1000: 1000 1 0 6.43 2 event_2
-## 1001: 1001 1 0 11.95 1 event_1
+## 997: 997 0 0 7.04 0 censor
+## 998: 998 0 0 19.84 1 event_1
+## 999: 999 0 0 14.89 1 event_1
+## 1000: 1000 1 0 7.78 2 event_2
+## 1001: 1001 1 0 18.32 2 event_2
The competing risk data can be plotted using the cumulative incidence
functions (rather than the survival curves):
-
+
The data generation can all be done in two (instead of three)
steps:
-<- genData(101, d1)
- dtSurv <- genSurv(dtSurv, dS, timeName = "time", censorName = "censor")
- dtSurv dtSurv
-## id x1 x2 time event type
-## 1: 1 0 1 4.83 1 event_1
-## 2: 2 0 1 22.42 1 event_1
-## 3: 3 1 0 9.16 1 event_1
-## 4: 4 1 0 25.13 1 event_1
-## 5: 5 1 0 10.51 1 event_1
-## ---
-## 97: 97 0 0 13.88 2 event_2
-## 98: 98 0 1 19.53 1 event_1
-## 99: 99 0 0 15.10 0 censor
-## 100: 100 0 1 10.99 2 event_2
-## 101: 101 1 1 13.08 1 event_1
+dtSurv <- genData(101, d1)
+dtSurv <- genSurv(dtSurv, dS, timeName = "time", censorName = "censor")
+dtSurv
+## id x1 x2 time event type
+## 1: 1 0 1 11.9 1 event_1
+## 2: 2 0 0 20.4 1 event_1
+## 3: 3 1 1 14.5 2 event_2
+## 4: 4 1 0 12.2 2 event_2
+## 5: 5 1 1 16.6 1 event_1
+## ---
+## 97: 97 1 1 17.9 2 event_2
+## 98: 98 1 0 15.8 2 event_2
+## 99: 99 1 1 20.2 1 event_1
+## 100: 100 0 0 20.2 1 event_1
+## 101: 101 1 0 14.9 1 event_1
Introducing non-proportional hazards
@@ -985,33 +985,33 @@ Introducing non-proportional hazards
Constant/proportional hazard ratio
To start, here is an example assuming a constant log hazard ratio of
-0.7:
-<- defData(varname = "x", formula = 0.4, dist = "binary")
- def
-<- defSurv(varname = "death", formula = "-14.6 - 0.7*x", shape = 0.35)
- defS <- defSurv(defS, varname = "censor", scale = exp(13), shape = 0.5)
- defS
-<- genData(500, def)
- dd <- genSurv(dd, defS, digits = 2, timeName = "time", censorName = "censor")
- dd
-<- survfit(Surv(time, event) ~ x, data = dd) fit
-
+def <- defData(varname = "x", formula = 0.4, dist = "binary")
+
+defS <- defSurv(varname = "death", formula = "-14.6 - 0.7*x", shape = 0.35)
+defS <- defSurv(defS, varname = "censor", scale = exp(13), shape = 0.5)
+
+dd <- genData(500, def)
+dd <- genSurv(dd, defS, digits = 2, timeName = "time", censorName = "censor")
+
+fit <- survfit(Surv(time, event) ~ x, data = dd)
+
The Cox proportional hazards model recovers the correct log hazards
rate:
-<- coxph(formula = Surv(time, event) ~ x, data = dd) coxfit
-
-
@@ -1399,8 +1399,8 @@ Introducing non-proportional hazards
x
--0.72
--0.92, -0.52
+-0.69
+-0.88, -0.50
<0.001
@@ -1416,49 +1416,49 @@ Introducing non-proportional hazards
0.05\), then we would conclude that the assumption of
proportional hazards is not warranted. In this case \(p = 0.22\), so the model is apparently
reasonable:
-cox.zph(coxfit)
+
## chisq df p
-## x 1.51 1 0.22
-## GLOBAL 1.51 1 0.22
+## x 2.61 1 0.11
+## GLOBAL 2.61 1 0.11
Non-constant/non-proportional hazard ratio
In this next case, the risk of death when \(x=1\) is lower at all time points compared
to when \(x=0\), but the relative risk
(or hazard ratio) changes at 150 days:
-<- defData(varname = "x", formula = 0.4, dist = "binary")
- def
-<- defSurv(varname = "death", formula = "-14.6 - 1.3*x", shape = 0.35, transition = 0)
- defS <- defSurv(defS, varname = "death", formula = "-14.6 - 0.4*x", shape = 0.35,
- defS transition = 150)
- <- defSurv(defS, varname = "censor", scale = exp(13), shape = 0.5)
- defS
-<- genData(500, def)
- dd <- genSurv(dd, defS, digits = 2, timeName = "time", censorName = "censor")
- dd
-<- survfit(Surv(time, event) ~ x, data = dd) fit
+def <- defData(varname = "x", formula = 0.4, dist = "binary")
+
+defS <- defSurv(varname = "death", formula = "-14.6 - 1.3*x", shape = 0.35, transition = 0)
+defS <- defSurv(defS, varname = "death", formula = "-14.6 - 0.4*x", shape = 0.35,
+ transition = 150)
+defS <- defSurv(defS, varname = "censor", scale = exp(13), shape = 0.5)
+
+dd <- genData(500, def)
+dd <- genSurv(dd, defS, digits = 2, timeName = "time", censorName = "censor")
+
+fit <- survfit(Surv(time, event) ~ x, data = dd)
The survival curve for the sample with \(x=1\) has a slightly different shape under
this data generation process compared to the previous example under a
constant hazard ratio assumption; there is more separation early on
(prior to day 150), and then the gap is closed at a quicker rate.
-
+
If we ignore the possibility that there might be a different
relationship over time, the Cox proportional hazards model gives an
estimate of the log hazard ratio quite close to -0.70:
-<- survival::coxph(formula = Surv(time, event) ~ x, data = dd) coxfit
-
-
@@ -1846,8 +1846,8 @@ Introducing non-proportional hazards
x
--0.71
--0.90, -0.52
+-0.72
+-0.91, -0.52
<0.001
@@ -1861,33 +1861,33 @@ Introducing non-proportional hazards
However, further inspection of the proportionality assumption should
make us question the appropriateness of the model. Since \(p<0.05\), we would be wise to see if we
can improve on the model.
-cox.zph(coxfit)
+
## chisq df p
-## x 7.4 1 0.0065
-## GLOBAL 7.4 1 0.0065
+## x 9.26 1 0.0023
+## GLOBAL 9.26 1 0.0023
We might be able to see from the plot where proportionality diverges,
in which case we can split the data set into two parts at the identified
time point. (In many cases, the transition point or points won’t be so
obvious, in which case the investigation might be more involved.) By
splitting the data at day 150, we get the desired estimates:
-<- survSplit(Surv(time, event) ~ ., data= dd, cut=c(150),
- dd2 episode= "tgroup", id="newid")
-
-<- survival::coxph(Surv(tstart, time, event) ~ x:strata(tgroup), data=dd2) coxfit2
-
-
@@ -2275,17 +2275,17 @@ Introducing non-proportional hazards
x * strata(tgroup)
-
-
-
+
+
+
x * tgroup=1
-1.3
--1.6, -0.93
+-1.7, -0.94
<0.001
x * tgroup=2
--0.38
--0.62, -0.13
-0.003
+-0.40
+-0.65, -0.16
+0.001
@@ -2297,10 +2297,10 @@ Introducing non-proportional hazards
And the diagnostic test of proportionality confirms the
appropriateness of the model:
-cox.zph(coxfit2)
-## chisq df p
-## x:strata(tgroup) 2.44 2 0.3
-## GLOBAL 2.44 2 0.3
+
+## chisq df p
+## x:strata(tgroup) 0.57 2 0.75
+## GLOBAL 0.57 2 0.75
Generating parameters for survival distribution
@@ -2318,20 +2318,20 @@ Generating parameters for survival distribution
example, if we would like to find the parameters for a distribution
where 80% survive until day 100, and 10% survive until day 200 (any
number of points may be provided):
-<- list(c(100, 0.80), c(200, 0.10))
- points <- survGetParams(points)
- r r
+
## [1] -17.007 0.297
We can visualize the curve that is defined by these parameters:
-survParamPlot(f = r[1], shape = r[2], points)
+
And we can generate data based on these parameters:
-<- defSurv(varname = "death", formula = -17, scale = 1, shape = 0.3)
- defS <- defSurv(defS, varname = "censor", formula = 0, scale = exp(18.5), shape = 0.3)
- defS
-<- genData(500)
- dd <- genSurv(dd, defS, timeName = "time", censorName = "censor") dd
-
+
+
diff --git a/inst/doc/treat_and_exposure.R b/inst/doc/treat_and_exposure.R
index b22ba46..db7ee6e 100644
--- a/inst/doc/treat_and_exposure.R
+++ b/inst/doc/treat_and_exposure.R
@@ -1,4 +1,7 @@
-## ---- echo = FALSE, message=FALSE---------------------------------------------
+## ----chunkname, echo=-1-------------------------------------------------------
+data.table::setDTthreads(2)
+
+## ----echo = FALSE, message=FALSE----------------------------------------------
library(simstudy)
library(ggplot2)
@@ -66,18 +69,18 @@ def <- defData(def, varname = "baseDBP", dist = "normal",
dtstudy <- genData(330, def)
-## ---- tidy = TRUE-------------------------------------------------------------
+## ----tidy = TRUE--------------------------------------------------------------
study1 <- trtAssign(dtstudy , n=3, balanced = TRUE, strata = c("male","over65"), grpName = "rxGrp")
study1
-## ---- tidy = TRUE-------------------------------------------------------------
+## ----tidy = TRUE--------------------------------------------------------------
study2 <- trtAssign(dtstudy , n=3, balanced = TRUE, grpName = "rxGrp")
-## ---- tidy = TRUE-------------------------------------------------------------
+## ----tidy = TRUE--------------------------------------------------------------
study3 <- trtAssign(dtstudy , n=3, balanced = FALSE, grpName = "rxGrp")
-## ---- tidy = TRUE, echo = FALSE, fig.width = 4, fig.height = 6----------------
+## ----tidy = TRUE, echo = FALSE, fig.width = 4, fig.height = 6-----------------
p1 <- splotfunc(study1, "Balanced within strata")
p1a <- aplotfunc(study1, "")
@@ -105,11 +108,11 @@ dtstudy
## -----------------------------------------------------------------------------
dtstudy[, .(n = .N, avg = round(mean(y), 1)), keyby = .(male, over65, rx)]
-## ---- tidy = TRUE-------------------------------------------------------------
+## ----tidy = TRUE--------------------------------------------------------------
formula1 <- c("-2 + 2*male - .5*over65", "-1 + 2*male + .5*over65")
dtExp <- trtObserve(dtstudy, formulas = formula1, logit.link = TRUE, grpName = "exposure")
-## ---- tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 2.5------------
+## ----tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 2.5-------------
dtplot1 <- dtExp[,.N,keyby=.(male,exposure)]
p1 <- ggplot(data = dtplot1, aes(x=factor(male), y=N)) +
geom_bar(aes(fill=factor(exposure)), alpha = .8, stat="identity", position = "dodge") +
@@ -137,12 +140,12 @@ p2 <- ggplot(data = dtplot2, aes(x=factor(over65), y=N)) +
grid.arrange(p1,p2,nrow=1)
-## ---- tidy = TRUE-------------------------------------------------------------
+## ----tidy = TRUE--------------------------------------------------------------
formula2 <- c(.35, .45)
dtExp2 <- trtObserve(dtstudy, formulas = formula2, logit.link = FALSE, grpName = "exposure")
-## ---- tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 2.5------------
+## ----tidy = TRUE, echo = FALSE, fig.width = 6.5, fig.height = 2.5-------------
dtplot1a <- dtExp2[,.N,keyby=.(male,exposure)]
p1a <- ggplot(data = dtplot1a, aes(x=factor(male), y=N)) +
geom_bar(aes(fill=factor(exposure)), alpha = .8, stat="identity", position = "dodge") +
diff --git a/inst/doc/treat_and_exposure.Rmd b/inst/doc/treat_and_exposure.Rmd
index 3420bf9..f19b3e0 100644
--- a/inst/doc/treat_and_exposure.Rmd
+++ b/inst/doc/treat_and_exposure.Rmd
@@ -7,6 +7,10 @@ vignette: >
\usepackage[utf8]{inputenc}
---
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
+
```{r, echo = FALSE, message=FALSE}
library(simstudy)
diff --git a/inst/doc/treat_and_exposure.html b/inst/doc/treat_and_exposure.html
index 3b7efb6..0fb95e1 100644
--- a/inst/doc/treat_and_exposure.html
+++ b/inst/doc/treat_and_exposure.html
@@ -355,37 +355,37 @@ Assigned treatment
across treatment groups is not guaranteed, particularly with small
sample sizes.
First, create the data definition:
-<- defData(varname = "male", dist = "binary",
- def formula = .5 , id="cid")
- <- defData(def, varname = "over65", dist = "binary",
- def formula = "-1.7 + .8*male", link="logit")
- <- defData(def, varname = "baseDBP", dist = "normal",
- def formula = 70, variance = 40)
-
-<- genData(330, def) dtstudy
+def <- defData(varname = "male", dist = "binary",
+ formula = .5 , id="cid")
+def <- defData(def, varname = "over65", dist = "binary",
+ formula = "-1.7 + .8*male", link="logit")
+def <- defData(def, varname = "baseDBP", dist = "normal",
+ formula = 70, variance = 40)
+
+dtstudy <- genData(330, def)
Balanced treatment assignment, stratified by gender and age
category (not blood pressure)
-<- trtAssign(dtstudy, n = 3, balanced = TRUE, strata = c("male", "over65"),
- study1 grpName = "rxGrp")
-
- study1
+study1 <- trtAssign(dtstudy, n = 3, balanced = TRUE, strata = c("male", "over65"),
+ grpName = "rxGrp")
+
+study1
## cid male over65 baseDBP rxGrp
-## 1: 1 1 0 74.6 1
-## 2: 2 0 0 66.9 2
-## 3: 3 1 1 64.8 2
-## 4: 4 0 0 63.8 2
-## 5: 5 1 0 55.2 3
+## 1: 1 1 1 64.9 2
+## 2: 2 1 0 67.0 2
+## 3: 3 0 1 75.0 3
+## 4: 4 0 0 70.9 2
+## 5: 5 1 0 68.6 1
## ---
-## 326: 326 0 0 68.5 3
-## 327: 327 0 0 57.5 3
-## 328: 328 0 0 70.8 3
-## 329: 329 1 0 69.4 2
-## 330: 330 0 0 67.9 2
+## 326: 326 1 0 71.0 3
+## 327: 327 0 0 64.1 2
+## 328: 328 1 0 65.4 3
+## 329: 329 1 0 69.1 3
+## 330: 330 0 0 75.9 2
Balanced treatment assignment (without stratification)
-<- trtAssign(dtstudy, n = 3, balanced = TRUE, grpName = "rxGrp") study2
+
Random (unbalanced) treatment assignment
-<- trtAssign(dtstudy, n = 3, balanced = FALSE, grpName = "rxGrp") study3
-Comparison of three treatment assignment mechanisms
+
+Comparison of three treatment assignment mechanisms
Assigned treatment using trtAssign distribution in
@@ -395,41 +395,41 @@ Assigned treatment using trtAssign distribution in
randomization is stratified by gender and age, and the
outcome y is effected by both of these factors as well as the
treatment assignment variable rx.
-<- defData(varname = "male", dist = "binary",
- def formula = .5 , id="cid")
- <- defData(def, varname = "over65", dist = "binary",
- def formula = "-1.7 + .8*male", link="logit")
- <- defData(def, varname = "rx", dist = "trtAssign",
- def formula = "1;1", variance = "male;over65")
- <- defData(def, varname = "y", dist = "normal",
- def formula = "20 + 5*male + 10*over65 + 10*rx", variance = 40)
-
-<- genData(330, def)
- dtstudy dtstudy
+def <- defData(varname = "male", dist = "binary",
+ formula = .5 , id="cid")
+def <- defData(def, varname = "over65", dist = "binary",
+ formula = "-1.7 + .8*male", link="logit")
+def <- defData(def, varname = "rx", dist = "trtAssign",
+ formula = "1;1", variance = "male;over65")
+def <- defData(def, varname = "y", dist = "normal",
+ formula = "20 + 5*male + 10*over65 + 10*rx", variance = 40)
+
+dtstudy <- genData(330, def)
+dtstudy
## cid male over65 rx y
-## 1: 1 1 0 0 38.0
-## 2: 2 1 1 0 32.9
-## 3: 3 0 1 0 38.7
-## 4: 4 0 0 1 14.4
-## 5: 5 0 0 1 30.7
+## 1: 1 1 0 0 26.9
+## 2: 2 1 0 1 36.6
+## 3: 3 1 0 1 44.6
+## 4: 4 1 0 1 37.2
+## 5: 5 1 0 0 31.2
## ---
-## 326: 326 1 1 0 31.2
-## 327: 327 1 1 0 36.8
-## 328: 328 1 0 1 38.6
-## 329: 329 1 0 1 33.1
-## 330: 330 0 1 0 21.5
+## 326: 326 0 0 0 28.4
+## 327: 327 0 0 0 25.1
+## 328: 328 1 0 0 29.8
+## 329: 329 0 0 1 28.2
+## 330: 330 1 0 0 22.3
Here are the counts and average outcomes for each gender,
age, and treatment combination:
-n = .N, avg = round(mean(y), 1)), keyby = .(male, over65, rx)] dtstudy[, .(
+
## male over65 rx n avg
-## 1: 0 0 0 65 21.0
-## 2: 0 0 1 66 29.0
-## 3: 0 1 0 11 28.9
-## 4: 0 1 1 12 39.1
-## 5: 1 0 0 59 23.8
-## 6: 1 0 1 59 34.9
-## 7: 1 1 0 29 34.1
-## 8: 1 1 1 29 46.6
+## 1: 0 0 0 70 19.6
+## 2: 0 0 1 70 30.2
+## 3: 0 1 0 12 29.2
+## 4: 0 1 1 11 42.4
+## 5: 1 0 0 59 24.2
+## 6: 1 0 1 58 35.1
+## 7: 1 1 0 25 35.3
+## 8: 1 1 1 25 46.0
Observed treatment
@@ -439,19 +439,19 @@ Observed treatment
exposure to a specific level can depend on covariates already in the
data set. In this case, there are three exposure groups that vary by
gender and age:
-<- c("-2 + 2*male - .5*over65", "-1 + 2*male + .5*over65")
- formula1 <- trtObserve(dtstudy, formulas = formula1, logit.link = TRUE, grpName = "exposure") dtExp
+formula1 <- c("-2 + 2*male - .5*over65", "-1 + 2*male + .5*over65")
+dtExp <- trtObserve(dtstudy, formulas = formula1, logit.link = TRUE, grpName = "exposure")
Here are the exposure distributions by gender and age:
-
+
Here is a second case of three exposures where the exposure is
independent of any covariates. Note that specifying the formula as
c(.35, .45)
is the same as specifying it is
c(.35, .45, .20)
. Also, when referring to probabilities,
the identity link is used:
-<- c(0.35, 0.45)
- formula2
-<- trtObserve(dtstudy, formulas = formula2, logit.link = FALSE, grpName = "exposure") dtExp2
-
+
+
Stepped-wedge design
@@ -474,16 +474,16 @@ Stepped-wedge design
We need to define the cluster-level variables (i.e. the cluster
effect and the cluster size) as well as the individual specific outcome.
In this case each cluster will have 15 individuals per period, and \(\sigma^2_b = 0.20\). In addition, \(\sigma^2_e = 1.75\).
-library(simstudy)
-library(ggplot2)
-
-<- defData(varname = "ceffect", formula = 0, variance = 0.20,
- defc dist = "normal", id = "cluster")
- <- defData(defc, "m", formula = 15, dist = "nonrandom")
- defc
-<- defDataAdd(varname = "Y",
- defa formula = "0 + ceffect + 0.1*period + trt*1.5",
- variance = 1.75, dist = "normal")
+library(simstudy)
+library(ggplot2)
+
+defc <- defData(varname = "ceffect", formula = 0, variance = 0.20,
+ dist = "normal", id = "cluster")
+defc <- defData(defc, "m", formula = 15, dist = "nonrandom")
+
+defa <- defDataAdd(varname = "Y",
+ formula = "0 + ceffect + 0.1*period + trt*1.5",
+ variance = 1.75, dist = "normal")
In this case, there will be 30 clusters and 24 time periods. With 15
individuals per cluster per period, there will be 360 observations for
each cluster, and 10,800 in total. (There is no reason the cluster sizes
@@ -504,17 +504,17 @@
Stepped-wedge design
8, 12, 16, and 20.
Once the treatment assignments are made, the individual records are
created and the outcome data are generated in the last step.
-set.seed(608477)
-
-<- genData(30, defc)
- dc <- addPeriods(dc, 24, "cluster", timevarName = "t")
- dp <- trtStepWedge(dp, "cluster", nWaves = 5, lenWaves = 4,
- dp startPer = 4, grpName = "trt")
-
-<- genCluster(dp, cLevelVar = "timeID", "m", "id")
- dd <- addColumns(defa, dd)
- dd
- dd
+set.seed(608477)
+
+dc <- genData(30, defc)
+dp <- addPeriods(dc, 24, "cluster", timevarName = "t")
+dp <- trtStepWedge(dp, "cluster", nWaves = 5, lenWaves = 4,
+ startPer = 4, grpName = "trt")
+
+dd <- genCluster(dp, cLevelVar = "timeID", "m", "id")
+dd <- addColumns(defa, dd)
+
+dd
## cluster period ceffect m timeID startTrt trt id Y
## 1: 1 0 0.6278 15 1 4 0 1 1.524
## 2: 1 0 0.6278 15 1 4 0 2 0.986
@@ -527,16 +527,16 @@ Stepped-wedge design
## 10798: 30 23 -0.0983 15 720 20 1 10798 4.118
## 10799: 30 23 -0.0983 15 720 20 1 10799 4.569
## 10800: 30 23 -0.0983 15 720 20 1 10800 3.656
-<- dd[, .(Y = mean(Y)), keyby = .(cluster, period, trt, startTrt)]
- dSum
-ggplot(data = dSum,
-aes(x = period, y = Y, group = interaction(cluster, trt))) +
- geom_line(aes(color = factor(trt))) +
- facet_grid(factor(startTrt, labels = c(1 : 5)) ~ .) +
- scale_x_continuous(breaks = seq(0, 23, by = 4), name = "week") +
- scale_color_manual(values = c("#b8cce4", "#4e81ba")) +
- theme(panel.grid = element_blank(),
- legend.position = "none")
+dSum <- dd[, .(Y = mean(Y)), keyby = .(cluster, period, trt, startTrt)]
+
+ggplot(data = dSum,
+ aes(x = period, y = Y, group = interaction(cluster, trt))) +
+ geom_line(aes(color = factor(trt))) +
+ facet_grid(factor(startTrt, labels = c(1 : 5)) ~ .) +
+ scale_x_continuous(breaks = seq(0, 23, by = 4), name = "week") +
+ scale_color_manual(values = c("#b8cce4", "#4e81ba")) +
+ theme(panel.grid = element_blank(),
+ legend.position = "none")
diff --git a/man/addCompRisk.Rd b/man/addCompRisk.Rd
index 5c7f69a..219672c 100644
--- a/man/addCompRisk.Rd
+++ b/man/addCompRisk.Rd
@@ -2,7 +2,7 @@
% Please edit documentation in R/utility.R
\name{addCompRisk}
\alias{addCompRisk}
-\title{Generating single competing risk survival varible}
+\title{Generating single competing risk survival variable}
\usage{
addCompRisk(
dtName,
@@ -46,7 +46,7 @@ to FALSE.}
An updated data table
}
\description{
-Generating single competing risk survival varible
+Generating single competing risk survival variable
}
\examples{
d1 <- defData(varname = "x1", formula = .5, dist = "binary")
diff --git a/man/addCorGen.Rd b/man/addCorGen.Rd
index 4b6d557..512823b 100644
--- a/man/addCorGen.Rd
+++ b/man/addCorGen.Rd
@@ -86,7 +86,6 @@ addCorGen(
# Long example
def <- defData(varname = "xbase", formula = 5, variance = .4, dist = "gamma", id = "cid")
-def <- defData(def, "nperiods", formula = 3, dist = "noZeroPoisson")
def2 <- defDataAdd(
varname = "p", formula = "-3+.2*period + .3*xbase",
@@ -98,6 +97,11 @@ dt <- genData(100, def)
dtLong <- addPeriods(dt, idvars = "cid", nPeriods = 3)
dtLong <- addColumns(def2, dtLong)
+addCorGen(
+ dtOld = dtLong, idvar = "cid", nvars = NULL, rho = .7, corstr = "cs",
+ dist = "binary", param1 = "p"
+)
+
}
\references{
Emrich LJ, Piedmonte MR. A Method for Generating High-Dimensional
diff --git a/man/genCorFlex.Rd b/man/genCorFlex.Rd
index 0684b1d..6f04606 100644
--- a/man/genCorFlex.Rd
+++ b/man/genCorFlex.Rd
@@ -35,6 +35,7 @@ data.table with added column(s) of correlated data
Create multivariate (correlated) data - for general distributions
}
\examples{
+\dontrun{
def <- defData(varname = "xNorm", formula = 0, variance = 4, dist = "normal")
def <- defData(def, varname = "xGamma1", formula = 15, variance = 2, dist = "gamma")
def <- defData(def, varname = "xBin", formula = 0.5, dist = "binary")
@@ -52,4 +53,5 @@ cor(dt[, -"id"], method = "kendall")
var(dt[, -"id"])
apply(dt[, -"id"], 2, mean)
}
+}
\concept{correlated}
diff --git a/man/logisticCoefs.Rd b/man/logisticCoefs.Rd
new file mode 100644
index 0000000..ef031da
--- /dev/null
+++ b/man/logisticCoefs.Rd
@@ -0,0 +1,86 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utility.R
+\name{logisticCoefs}
+\alias{logisticCoefs}
+\title{Determine intercept, treatment/exposure and covariate coefficients that can
+be used for binary data generation with a logit link and a set of covariates}
+\usage{
+logisticCoefs(
+ defCovar,
+ coefs,
+ popPrev,
+ rr = NULL,
+ rd = NULL,
+ auc = NULL,
+ tolerance = 0.001,
+ sampleSize = 1e+05,
+ trtName = "A"
+)
+}
+\arguments{
+\item{defCovar}{A definition table for the covariates in the underlying
+population. This table specifies the distribution of the covariates.}
+
+\item{coefs}{A vector of coefficients that reflect the relationship between
+each of the covariates and the log-odds of the outcome.}
+
+\item{popPrev}{The target population prevalence of the outcome.
+A value between 0 and 1.}
+
+\item{rr}{The target risk ratio, which must be a value between 0 and
+1/popPrev. Defaults to NULL.}
+
+\item{rd}{The target risk difference, which must be between
+-(popPrev) and (1 - popPrev). Defaults to NULL}
+
+\item{auc}{The target AUC, which must be a value between 0.5 and 1.0 .
+Defaults to NULL.}
+
+\item{tolerance}{The minimum stopping distance between the adjusted low and high
+endpoints. Defaults to 0.001.}
+
+\item{sampleSize}{The number of units to generate for the bisection algorithm.
+The default is 1e+05. To get a reliable estimate, the value
+should be no smaller than the default. Larger values can be used, though
+computing time will increase.}
+
+\item{trtName}{If either a risk ratio or risk difference is the target statistic,
+a treatment/exposure variable name can be provided. Defaults to "A".}
+}
+\value{
+A vector of parameters including the intercept and covariate
+coefficients for the logistic model data generating process.
+}
+\description{
+This is an implementation of an iterative bisection procedure
+that can be used to determine coefficient values for a target population
+prevalence as well as a target risk ratio, risk difference, or AUC. These
+coefficients can be used in a subsequent data generation process to simulate
+data with these desired characteristics.
+}
+\details{
+If no specific target statistic is specified, then only the intercept
+is returned along with the original coefficients. Only one target statistic (risk ratio, risk
+difference or AUC) can be specified with a single function call; in all three cases, a target
+prevalence is still required.
+}
+\examples{
+\dontrun{
+d1 <- defData(varname = "x1", formula = 0, variance = 1)
+d1 <- defData(d1, varname = "b1", formula = 0.5, dist = "binary")
+
+coefs <- log(c(1.2, 0.8))
+
+logisticCoefs(d1, coefs, popPrev = 0.20)
+logisticCoefs(d1, coefs, popPrev = 0.20, rr = 1.50, trtName = "rx")
+logisticCoefs(d1, coefs, popPrev = 0.20, rd = 0.30, trtName = "rx")
+logisticCoefs(d1, coefs, popPrev = 0.20, auc = 0.80)
+}
+}
+\references{
+Austin, Peter C. "The iterative bisection procedure: a useful
+tool for determining parameter values in data-generating processes in
+Monte Carlo simulations." BMC Medical Research Methodology 23,
+no. 1 (2023): 1-10.
+}
+\concept{utility}
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 9d0e480..945ca99 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -97,6 +97,46 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
+// getBeta0
+double getBeta0(NumericVector lvec, double popPrev, double tolerance);
+RcppExport SEXP _simstudy_getBeta0(SEXP lvecSEXP, SEXP popPrevSEXP, SEXP toleranceSEXP) {
+BEGIN_RCPP
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
+ Rcpp::traits::input_parameter< NumericVector >::type lvec(lvecSEXP);
+ Rcpp::traits::input_parameter< double >::type popPrev(popPrevSEXP);
+ Rcpp::traits::input_parameter< double >::type tolerance(toleranceSEXP);
+ rcpp_result_gen = Rcpp::wrap(getBeta0(lvec, popPrev, tolerance));
+ return rcpp_result_gen;
+END_RCPP
+}
+// estAUC
+double estAUC(NumericMatrix dmatrix, NumericVector y);
+RcppExport SEXP _simstudy_estAUC(SEXP dmatrixSEXP, SEXP ySEXP) {
+BEGIN_RCPP
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
+ Rcpp::traits::input_parameter< NumericMatrix >::type dmatrix(dmatrixSEXP);
+ Rcpp::traits::input_parameter< NumericVector >::type y(ySEXP);
+ rcpp_result_gen = Rcpp::wrap(estAUC(dmatrix, y));
+ return rcpp_result_gen;
+END_RCPP
+}
+// getBeta_auc
+NumericVector getBeta_auc(NumericMatrix covmat, NumericVector coefs, double auc, double popPrev, double tolerance);
+RcppExport SEXP _simstudy_getBeta_auc(SEXP covmatSEXP, SEXP coefsSEXP, SEXP aucSEXP, SEXP popPrevSEXP, SEXP toleranceSEXP) {
+BEGIN_RCPP
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
+ Rcpp::traits::input_parameter< NumericMatrix >::type covmat(covmatSEXP);
+ Rcpp::traits::input_parameter< NumericVector >::type coefs(coefsSEXP);
+ Rcpp::traits::input_parameter< double >::type auc(aucSEXP);
+ Rcpp::traits::input_parameter< double >::type popPrev(popPrevSEXP);
+ Rcpp::traits::input_parameter< double >::type tolerance(toleranceSEXP);
+ rcpp_result_gen = Rcpp::wrap(getBeta_auc(covmat, coefs, auc, popPrev, tolerance));
+ return rcpp_result_gen;
+END_RCPP
+}
static const R_CallMethodDef CallEntries[] = {
{"_simstudy_matMultinom", (DL_FUNC) &_simstudy_matMultinom, 1},
@@ -106,6 +146,9 @@ static const R_CallMethodDef CallEntries[] = {
{"_simstudy_checkBoundsBin", (DL_FUNC) &_simstudy_checkBoundsBin, 3},
{"_simstudy_findRhoBin", (DL_FUNC) &_simstudy_findRhoBin, 3},
{"_simstudy_getRhoMat", (DL_FUNC) &_simstudy_getRhoMat, 3},
+ {"_simstudy_getBeta0", (DL_FUNC) &_simstudy_getBeta0, 3},
+ {"_simstudy_estAUC", (DL_FUNC) &_simstudy_estAUC, 2},
+ {"_simstudy_getBeta_auc", (DL_FUNC) &_simstudy_getBeta_auc, 5},
{NULL, NULL, 0}
};
diff --git a/src/srcRcpp.cpp b/src/srcRcpp.cpp
index 44073c6..d234179 100644
--- a/src/srcRcpp.cpp
+++ b/src/srcRcpp.cpp
@@ -1,9 +1,9 @@
#include <Rcpp.h>
+#include <pbv.h>
+#include <math.h>
// [[Rcpp::depends(pbv)]]
-#include <pbv.h>
-#include <math.h>
using namespace std;
using namespace Rcpp;
@@ -206,3 +206,169 @@ NumericMatrix getRhoMat(int N, NumericVector P, NumericMatrix TCORR) {
}
+// [[Rcpp::export]]
+double getBeta0(NumericVector lvec, double popPrev, double tolerance) {
+
+ double intLow = -10;
+ double intHigh = 10;
+ double B0;
+ double PREV;
+ NumericVector ps(lvec.length());
+ NumericVector nvec(lvec.length());
+
+ while(abs(intHigh - intLow) > tolerance){
+
+ B0 = (intLow + intHigh)/2;
+ for (int i = 0; i < lvec.length(); i++) {
+ nvec(i) = lvec(i) + B0;
+ }
+ ps = Rcpp::plogis(nvec);
+ PREV = mean(ps);
+
+ if (PREV < popPrev) {
+ intLow = B0;
+ } else {
+ intHigh = B0;
+ }
+
+ }
+
+ B0 = (intLow + intHigh)/2;
+
+ return(B0);
+
+}
+
+
+
+// [[Rcpp::export]]
+double estAUC(NumericMatrix dmatrix, NumericVector y) {
+
+ int aucN = 1000000;
+
+ int N0 = y.length() - sum(y);
+ int N1 = sum(y);
+
+ NumericVector P1(N1);
+ NumericVector P0(N0);
+
+ // Obtaining namespace of fastglm package
+ Environment pkg = Environment::namespace_env("fastglm");
+ Function f = pkg["fastglm"];
+ Function p = pkg["predict.fastglm"];
+
+  // The model should include Named("family", "binomial"), but that occasionally
+  // throws a warning. Using the Gaussian family works very slightly less well,
+  // but there are no warnings.
+
+ List fit = f(Named("x", dmatrix), Named("y", y));
+ NumericVector pred = p(Named("object", fit), Named("newdata", dmatrix));
+
+ int i1 = 0;
+ int i0 = 0;
+
+ for (int i = 0; i < y.length(); i++) {
+ if ( y(i) == 1) {
+ P1[i1] = pred(i);
+ i1++;
+ } else {
+ P0[i0] = pred(i);
+ i0++;
+ }
+ }
+
+ NumericVector s1 = P1[floor(Rcpp::runif(aucN, 0, N1))];
+ NumericVector s0 = P0[floor(Rcpp::runif(aucN, 0, N0))];
+
+ double n_greater = 0;
+
+ for (int i = 0; i < aucN; i++) {
+ if (s1(i) > s0(i)) n_greater++;
+ }
+
+ double AUC = n_greater/aucN;
+
+ return(AUC);
+
+}
+
+// [[Rcpp::export]]
+NumericVector getBeta_auc(NumericMatrix covmat, NumericVector coefs, double auc,
+ double popPrev, double tolerance) {
+
+ Environment base("package:base");
+ Function mat_Mult = base["%*%"];
+
+ int N = covmat.nrow();
+
+ double intLow = 0;
+ double intHigh = 40;
+
+ double alpha;
+ double B0;
+ double aStat;
+ NumericVector ps(N);
+ NumericVector lvec(N);
+ NumericVector avec(coefs.length());
+ NumericVector avecint(coefs.length() + 1);
+
+ NumericVector onevec(N);
+ NumericVector y(N);
+
+ NumericVector results(2);
+
+ for (int i = 0; i < N; i++) {
+ onevec(i) = 1;
+ }
+
+ NumericMatrix dmatrix = cbind(onevec, covmat);
+
+ lvec = mat_Mult(covmat, coefs);
+ B0 = getBeta0(lvec, popPrev, tolerance);
+ avecint(0) = B0;
+
+ aStat = 0;
+
+ while(abs(aStat - auc) > tolerance) {
+
+ alpha = (intLow + intHigh)/2;
+
+ for (int i = 0; i < coefs.length(); i++) {
+ avecint(i+1) = alpha * coefs(i);
+ }
+
+ lvec = mat_Mult(dmatrix, avecint);
+ ps = Rcpp::plogis(lvec);
+
+  for (int i=0; i<N; i++) {
+    y(i) = as<double>(rbinom(1, 1, ps(i)));
+ }
+
+ aStat = estAUC(dmatrix, y);
+
+ // Rcout << auc << " " << alpha << " " << aStat << "\n";
+
+ if (aStat < auc) {
+ intLow = alpha;
+ } else {
+ intHigh = alpha;
+ }
+
+ }
+
+ alpha = (intLow + intHigh)/2;
+
+ for (int i = 0; i < coefs.length(); i++) {
+ avec(i) = alpha * coefs(i);
+ }
+
+ lvec = mat_Mult(covmat, avec);
+ B0 = getBeta0(lvec, popPrev, tolerance);
+
+ results(0) = B0;
+ results(1) = alpha;
+
+ return(results);
+
+}
+
diff --git a/tests/testthat.R b/tests/testthat.R
index db8aa28..eb06433 100644
--- a/tests/testthat.R
+++ b/tests/testthat.R
@@ -2,5 +2,5 @@ library(testthat)
library(hedgehog)
library(simstudy)
-
+data.table::setDTthreads(2) # added to solve CRAN issue
test_check("simstudy")
diff --git a/tests/testthat/test-add_data.R b/tests/testthat/test-add_data.R
index d25af7d..c96b58a 100644
--- a/tests/testthat/test-add_data.R
+++ b/tests/testthat/test-add_data.R
@@ -1,11 +1,13 @@
# addCondition ----
test_that("addCondition throws errors.", {
+ skip_on_cran()
expect_error(addCondition(), class = "simstudy::missingArgument")
expect_error(addCondition("a"), class = "simstudy::missingArgument")
expect_error(addCondition(data.frame(), data.frame(), "a"), class = "simstudy::wrongClass")
})
test_that("addCondition works.", {
+ skip_on_cran()
def <- defData(varname = "x", formula = "1;10", dist = "uniformInt")
defC <- defCondition(condition = "x >= 5", formula = "x + 5", dist = "nonrandom")
defC <- defCondition(defC, condition = "x < 5", formula = "10", dist = "nonrandom")
@@ -20,12 +22,14 @@ test_that("addCondition works.", {
# addColumns ----
test_that("addColumns throws errors.", {
+ skip_on_cran()
expect_error(addColumns(), class = "simstudy::missingArgument")
expect_error(addColumns("a"), class = "simstudy::missingArgument")
expect_error(addColumns(data.frame(), data.frame()), class = "simstudy::wrongClass")
})
test_that("addColumns works.", {
+ skip_on_cran()
def <- defData(varname = "x", formula = "1;10", dist = "uniformInt")
dt <- genData(100, def)
def2 <- defDataAdd(varname = "y", formula = "2.3 * (1/x)", dist = "normal")
@@ -34,6 +38,7 @@ test_that("addColumns works.", {
})
test_that("defRepeatAdd works", {
+ skip_on_cran()
expect_silent(
defRepeatAdd(nVars = 4, prefix = "g", formula = "1/3;1/3;1/3", variance = 0, dist = "categorical")
)
@@ -47,6 +52,7 @@ test_that("defRepeatAdd works", {
})
test_that("defRepeatAdd throws errors correctly.", {
+ skip_on_cran()
expect_error(defRepeatAdd(prefix = "b", formula = 5, variance = 3, dist = "normal"),
class = "simstudy::missingArgument"
)
@@ -60,6 +66,7 @@ test_that("defRepeatAdd throws errors correctly.", {
# addMarkov ----
test_that("addMarkov throws errors.", {
+ skip_on_cran()
d0 <- defData(varname = "xx", formula = 2)
d0 <- defData(d0, varname = "xy", formula = 5)
dd <- genData(n = 10, dt = d0)
@@ -94,6 +101,7 @@ test_that("addMarkov throws errors.", {
# addSynthetic ----
test_that("addSynthetic throws errors.", {
+ skip_on_cran()
### Create fake "real" data set
@@ -125,6 +133,7 @@ test_that("addSynthetic throws errors.", {
})
test_that("addSynthetic works.", {
+ skip_on_cran()
### Create fake 'external' data set 'A'
diff --git a/tests/testthat/test-conditions.R b/tests/testthat/test-conditions.R
index d969f76..91c1ca9 100644
--- a/tests/testthat/test-conditions.R
+++ b/tests/testthat/test-conditions.R
@@ -1,4 +1,5 @@
test_that("conditions have correct class.", {
+ skip_on_cran()
expect_error(stop(condition(c("error", "custom_Error"), "This is a custom error")),
class = c("error", "custom_Error")
)
@@ -11,6 +12,7 @@ test_that("conditions have correct class.", {
})
test_that("pluralization works.", {
+ skip_on_cran()
expect_error(argMissingError("arg1"), "argument is missing",
class = "simstudy::missingArgument"
)
diff --git a/tests/testthat/test-define_data.R b/tests/testthat/test-define_data.R
index 088dc71..717c819 100644
--- a/tests/testthat/test-define_data.R
+++ b/tests/testthat/test-define_data.R
@@ -5,6 +5,8 @@ freeze_eval <- names(.GlobalEnv)
# defData ----
test_that("defData throws errors", {
+ skip_on_cran()
+
expect_error(defData(dist = "nonrandom", formula = 7, id = "idnum"), class = "simstudy::missingArgument")
expect_error(defData(varname = "xnr", dist = "nonrandom", id = "idnum"), class = "simstudy::missingArgument")
})
@@ -13,30 +15,32 @@ test_that("defData throws errors", {
# .evalDef ----
-test_that("checks combine in .evalDef correctly", {
- skip_on_cran()
-
- # this generates 20 previously defined varnames.
- gen_defVars <- gen.and_then(gen.int(20), gen_varnames)
-
- gen_evalDef_call <-
- gen.and_then(gen_defVars, function(defVars) {
- generate(for (i in gen_dist) {
- list(
- newvar = defVars[1],
- newform = get(reg[name == i]$formula)(defVars[-1]),
- newdist = i,
- variance = get(reg[name == i]$variance)(defVars[-1]),
- link = get(reg[name == i]$link),
- defVars = defVars[-1]
- )
- })
- })
-
- forall(gen_evalDef_call, function(args) expect_silent(do.call(.evalDef, args)))
-})
+# test_that("checks combine in .evalDef correctly", {
+# skip_on_cran()
+#
+# # this generates 20 previously defined varnames.
+# gen_defVars <- gen.and_then(gen.int(20), gen_varnames)
+#
+# gen_evalDef_call <-
+# gen.and_then(gen_defVars, function(defVars) {
+# generate(for (i in gen_dist) {
+# list(
+# newvar = defVars[1],
+# newform = get(reg[name == i]$formula)(defVars[-1]),
+# newdist = i,
+# variance = get(reg[name == i]$variance)(defVars[-1]),
+# link = get(reg[name == i]$link),
+# defVars = defVars[-1]
+# )
+# })
+# })
+#
+# forall(gen_evalDef_call, function(args) expect_silent(do.call(.evalDef, args)))
+# })
test_that(".evalDef throws errors correctly.", {
+ skip_on_cran()
+
expect_error(.evalDef(newvar = 1, "1 + 2", "normal", 0, "identiy", ""), class = "simstudy::wrongType")
expect_error(.evalDef(newvar = c("a", "b"), "1 + 2", "normal", 0, "identiy", ""), class = "simstudy::lengthMismatch")
expect_error(.evalDef(newvar = "varname", "1 + 2", "not valid", 0, "identiy", ""), class = "simstudy::optionInvalid")
@@ -47,23 +51,26 @@ test_that(".evalDef throws errors correctly.", {
})
# .isValidArithmeticFormula ----
-test_that("g.a.e. formula checked correctly.", {
- skip_on_cran()
- gen_gae <-
- gen.and_then(gen_varnames(8), function(ns) {
- gen.map(function(y) {
- list(
- defVars = ns, formula = y
- )
- }, gen_formula(ns))
- })
-
- forall(gen_gae, function(x) {
- expect_silent(.isValidArithmeticFormula(x$formula, x$defVars))
- })
-})
+# test_that("g.a.e. formula checked correctly.", {
+# skip_on_cran()
+#
+# gen_gae <-
+# gen.and_then(gen_varnames(8), function(ns) {
+# gen.map(function(y) {
+# list(
+# defVars = ns, formula = y
+# )
+# }, gen_formula(ns))
+# })
+#
+# forall(gen_gae, function(x) {
+# expect_silent(.isValidArithmeticFormula(x$formula, x$defVars))
+# })
+# })
test_that(".isValidArithmeticFormula throws errors correctly.", {
+ skip_on_cran()
+
expect_error(.isValidArithmeticFormula(""), class = "simstudy::noValue")
expect_error(.isValidArithmeticFormula("a;3"), class = "simstudy::valueError")
expect_error(.isValidArithmeticFormula("1+3-"), class = "simstudy::valueError")
@@ -74,6 +81,7 @@ test_that(".isValidArithmeticFormula throws errors correctly.", {
# .checkMixture ----
test_that("'mixture' formula checked correctly", {
skip_on_cran()
+
gen_mix_vars <- gen.choice(gen_dotdot_num, gen_varname, gen.element(-100:100))
gen_mix_vForm <- gen.sized(function(n) {
gen.and_then(gen.c(gen_mix_vars, of = n), function(p) {
@@ -89,6 +97,8 @@ test_that("'mixture' formula checked correctly", {
})
test_that(".checkMixture throws errors.", {
+ skip_on_cran()
+
expect_error(.checkMixture("nr | .5 + a "), "same amount")
expect_error(.checkMixture("nr | be"), "Probabilities can only be")
})
@@ -96,34 +106,40 @@ test_that(".checkMixture throws errors.", {
# .checkCategorical ----
test_that("'categorical' formula checked correctly", {
skip_on_cran()
+
forall(gen_cat_probs, function(f) {
expect_silent(.checkCategorical(genCatFormula(f)))
})
})
test_that(".checkCategorical throws errors.", {
+ skip_on_cran()
+
expect_error(.checkCategorical("1"), "two numeric")
})
# .checkUniform ----
-test_that("'uniform' formula checked correctly", {
- skip_on_cran()
- forall(
- gen.and_then(gen_varnames(10), function(names) {
- generate(for (x in list(
- min = gen_formula(names),
- max = gen_formula(names)
- )) {
- paste0(x$min, ";", x$max)
- })
- }),
- function(r) {
- expect_silent(.checkUniform(r))
- }
- )
-})
+# test_that("'uniform' formula checked correctly", {
+# skip_on_cran()
+#
+# forall(
+# gen.and_then(gen_varnames(10), function(names) {
+# generate(for (x in list(
+# min = gen_formula(names),
+# max = gen_formula(names)
+# )) {
+# paste0(x$min, ";", x$max)
+# })
+# }),
+# function(r) {
+# expect_silent(.checkUniform(r))
+# }
+# )
+# })
test_that(".checkUniform throws errors.", {
+ skip_on_cran()
+
expect_error(.checkUniform(""), "format")
expect_error(.checkUniform("1;2;3"), "format")
})
@@ -132,6 +148,8 @@ test_that(".checkUniform throws errors.", {
# .isIdLog ----
# .isIdLogit ----
test_that("'link' checked as expected", {
+ skip_on_cran()
+
expect_silent(.isIdLog("identity"))
expect_silent(.isIdLog("log"))
expect_silent(.isIdLogit("identity"))
@@ -148,6 +166,8 @@ test_that("'link' checked as expected", {
# .isDotArr ----
# .splitFormula ----
test_that("utility functions work", {
+ skip_on_cran()
+
names <- c("..as", "..bs", "..cs[4]", "..ds[x]")
res <- c("as", "bs", "cs[4]", "ds[x]")
@@ -161,6 +181,8 @@ test_that("utility functions work", {
})
test_that("defRepeat works.", {
+ skip_on_cran()
+
expect_silent(
defRepeat(nVars = 4, prefix = "g", formula = "1/3;1/3;1/3", variance = 0, dist = "categorical")
)
@@ -172,6 +194,8 @@ test_that("defRepeat works.", {
})
test_that("defRepeat throws errors correctly.", {
+ skip_on_cran()
+
expect_error(defRepeat(prefix = "b", formula = 5, variance = 3, dist = "normal"),
class = "simstudy::missingArgument"
)
diff --git a/tests/testthat/test-glue.R b/tests/testthat/test-glue.R
index 5978295..8ae2088 100644
--- a/tests/testthat/test-glue.R
+++ b/tests/testthat/test-glue.R
@@ -1,4 +1,5 @@
test_that("Blocks are collapsed as expected.", {
+ skip_on_cran()
nums <- 1:3
num <- 23
expect_equal(
@@ -14,6 +15,7 @@ test_that("Blocks are collapsed as expected.", {
})
test_that("numbers are formated as expected.", {
+ skip_on_cran()
nums <- c(1.23, 0.556, 1 / 3)
ints <- c(1, 2, 3)
expect_equal(glueFmt("{nums:.2f}"), as.character(round(nums, 2)))
@@ -22,6 +24,7 @@ test_that("numbers are formated as expected.", {
})
test_that("numbers are collapsed and formated correctly.", {
+ skip_on_cran()
ints <- c(1, 2, 3)
expect_equal(glueFmtC("{ints:02d}"), "01, 02 and 03")
expect_equal(glueFmtC("{2:.1f}"), "2.0")
diff --git a/tests/testthat/test-group_data.R b/tests/testthat/test-group_data.R
index f96e7f4..28a56aa 100644
--- a/tests/testthat/test-group_data.R
+++ b/tests/testthat/test-group_data.R
@@ -1,5 +1,6 @@
# .addStrataCode ----
test_that("strata codes are added as expected.", {
+ skip_on_cran()
def <- defData(varname = "male", dist = "binary", formula = .5, id = "cid")
def <- defData(def, varname = "over65", dist = "binary", formula = "-1.7 + .8*male", link = "logit")
def <- defData(def, varname = "baseDBP", dist = "normal", formula = 70, variance = 40)
@@ -14,6 +15,7 @@ test_that("strata codes are added as expected.", {
# .stratSamp ----
test_that("stratified samples are drawn correctly.", {
+ skip_on_cran()
expect_length(.stratSamp(1, 2), 1)
expect_length(.stratSamp(2, 4), 2)
expect_length(.stratSamp(50, 3), 50)
diff --git a/tests/testthat/test-internal_utility.R b/tests/testthat/test-internal_utility.R
index 75842f5..b7e2747 100644
--- a/tests/testthat/test-internal_utility.R
+++ b/tests/testthat/test-internal_utility.R
@@ -1,5 +1,6 @@
# .parseDotVars ----
test_that("dotVars are parsed correctly.", {
+ skip_on_cran()
extVar1 <- 23
extVar2 <- 42
res <- list(..extVar1 = 23, ..extVar2 = 42)
@@ -12,6 +13,7 @@ test_that("dotVars are parsed correctly.", {
})
test_that("variables from different environments are parsed correctly.", {
+ skip_on_cran()
extVar3 <- 7
env1 <- new.env()
env2 <- new.env(parent = env1)
@@ -27,6 +29,7 @@ test_that("variables from different environments are parsed correctly.", {
# .evalWith ----
test_that("evalWith throws errors.", {
+ skip_on_cran()
df <- data.frame()
ext <- list(formula2parse = 2)
@@ -35,6 +38,7 @@ test_that("evalWith throws errors.", {
})
test_that("evalWith output length is correct.", {
+ skip_on_cran()
df <- data.frame(a = rep.int(5, 5))
ext <- list(..ev = 2)
@@ -43,6 +47,7 @@ test_that("evalWith output length is correct.", {
})
test_that("evalWith output is Matrix.", {
+ skip_on_cran()
df <- data.frame(a = rep.int(5, 5))
ext <- list(..ev = 2)
@@ -74,11 +79,13 @@ test_that("probabilities (matrix) are adjusted as documented.", {
# .getDists ----
test_that("number of Dists is up to date.", {
+ skip_on_cran()
expect_length(.getDists(), 16)
})
# .isFormulaScalar ----
test_that("isFormularScalar works correctly.", {
+ skip_on_cran()
expect_true(.isFormulaScalar("5 + 3"))
expect_true(.isFormulaScalar(5 + 3))
@@ -89,6 +96,7 @@ test_that("isFormularScalar works correctly.", {
# .isValidVarName ----
test_that("var names are validated correctly.", {
+ skip_on_cran()
validNames <- c("var1", "name", "name2", "var1")
wrongNames <- c("...", "..1", "..5")
@@ -103,6 +111,7 @@ test_that("var names are validated correctly.", {
# .isError ----
test_that("errors are detected correctly.", {
+ skip_on_cran()
err <- try(nonVar + 4, silent = TRUE)
noErr <- try(3 + 5, silent = TRUE)
@@ -114,6 +123,7 @@ test_that("errors are detected correctly.", {
# .hasValue ----
test_that("hasValue works.", {
+ skip_on_cran()
expect_true(.hasValue("value"))
expect_true((function(x) .hasValue(x))(5))
expect_true((function(x) .hasValue(x))(NA))
@@ -125,6 +135,7 @@ test_that("hasValue works.", {
# .log2Prob ----
test_that("log odds are converted correctly.", {
+ skip_on_cran()
prob <- 0.2
logOdds <- log(0.25)
diff --git a/tests/testthat/test-missing_data.R b/tests/testthat/test-missing_data.R
index 878a2eb..8637e5a 100644
--- a/tests/testthat/test-missing_data.R
+++ b/tests/testthat/test-missing_data.R
@@ -31,7 +31,7 @@ test_that("genMiss works", {
def1 <- defData(def1, "x2", dist = "normal", formula = "20*m + 20*u", variance = 2)
def1 <- defData(def1, "x3", dist = "normal", formula = "20*m + 20*u", variance = 2)
- dtAct1 <- genData(5000, def1)
+ dtAct1 <- genData(10000, def1)
hardProbForm <- runif(1)
form1val0 <- runif(1)
diff --git a/tests/testthat/test-survival.R b/tests/testthat/test-survival.R
index 11bb162..467e217 100644
--- a/tests/testthat/test-survival.R
+++ b/tests/testthat/test-survival.R
@@ -1,8 +1,10 @@
test_that("defSurv kicks out transition error", {
+ skip_on_cran()
expect_error(defSurv(varname = "censor", formula = "-7", shape = 0.55, transition = 150))
})
test_that("genSurv runs OK", {
+ skip_on_cran()
dS <- defSurv(varname = "event_1", formula = "-10", shape = 0.3)
dS <- defSurv(dS, "event_2", "-6.5", shape = 0.4)
dS <- defSurv(dS, "event_3", "-7", shape = 0.5)
@@ -28,6 +30,7 @@ test_that("genSurv runs OK", {
})
test_that("genSurv throws errors", {
+ skip_on_cran()
dS <- defSurv(varname = "event_1", formula = "-10", shape = 0.3)
dS <- defSurv(dS, "event_2", "-6.5", shape = 0.4)
dS <- defSurv(dS, "event_3", "-7", shape = 0.5)
@@ -38,6 +41,7 @@ test_that("genSurv throws errors", {
})
test_that("addCmpRisk works", {
+ skip_on_cran()
dS <- defSurv(varname = "event_1", formula = "-10", shape = 0.3)
dS <- defSurv(dS, "event_2", "-6.5", shape = 0.4)
diff --git a/tests/testthat/test-utility.R b/tests/testthat/test-utility.R
index faaee48..76ac259 100644
--- a/tests/testthat/test-utility.R
+++ b/tests/testthat/test-utility.R
@@ -44,6 +44,7 @@ test_that("probabilities (vector) are adjusted as documented.", {
})
test_that("genCatFormula throws errors.", {
+ skip_on_cran()
expect_error(genCatFormula(), "Need to specify")
expect_error(genCatFormula(1, 2, 3, n = 5), "or n, not both")
expect_error(genCatFormula(1.1), "must be less than 1")
@@ -53,16 +54,19 @@ test_that("genCatFormula throws errors.", {
# betaGetShapes ----
test_that("betaGetShapes throws errors.", {
+ skip_on_cran()
expect_error(betaGetShapes(1, 12), class = "simstudy::valueError")
expect_error(betaGetShapes(.5, -5), class = "simstudy::valueError")
})
test_that("betaGetShapes works.", {
+ skip_on_cran()
expect_equal(betaGetShapes(.4, 5), list(shape1 = .4 * 5, shape2 = (1 - .4) * 5))
})
# genMixFormula ----
test_that("genMixFormula throws errors.", {
+ skip_on_cran()
expect_error(genMixFormula(), class = "simstudy::missingArgument")
expect_error(genMixFormula("a", varLength = 3), class = "simstudy::valueError")
expect_error(genMixFormula("..a", varLength = "b"), class = "simstudy::wrongType")
@@ -72,6 +76,7 @@ test_that("genMixFormula throws errors.", {
})
test_that("genMixFormula works.", {
+ skip_on_cran()
expect_equal(genMixFormula("a"), "a | 1")
expect_equal(genMixFormula(c("a", "..b"), c(.3, .7)), "a | 0.3 + ..b | 0.7")
expect_equal(
@@ -83,6 +88,7 @@ test_that("genMixFormula works.", {
# survGetParams ----
test_that("survGetParams throws errors.", {
+ skip_on_cran()
expect_error(survGetParams(), class = "simstudy::missingArgument")
expect_error(survGetParams(c(100, .5)), class = "simstudy::wrongClass")
points <- list(c(280, 0.85), c(165, .45))
@@ -96,6 +102,7 @@ test_that("survGetParams throws errors.", {
})
test_that("survGetParam works.", {
+ skip_on_cran()
points <- list(c(50, 0.90), c(100, 0.10))
expect_equal(survGetParams(points), c(-19.658, 0.225), tolerance = .001)
points <- list(c(60, 0.90), c(100, .75), c(200, .25), c(250, .10))
@@ -105,12 +112,14 @@ test_that("survGetParam works.", {
# plotSurv ----
test_that("survParamPlot throws errors.", {
+ skip_on_cran()
expect_error(survParamPlot(), class = "simstudy::missingArgument")
expect_error(survParamPlot(formula = -10), class = "simstudy::missingArgument")
expect_error(survParamPlot(formula = 4, shape = -1), class = "simstudy::wrongSign")
})
test_that("survParamPlot works.", {
+ skip_on_cran()
expect_is(survParamPlot(formula = -4, shape = 1), class = "ggplot")
points <- list(c(100, .8), c(200, .5))
@@ -122,3 +131,105 @@ test_that("survParamPlot works.", {
class = "ggplot"
)
})
+
+# logisticCoefs
+
+# test_that("logisticCoefs works.", {
+#
+# skip_on_cran()
+#
+# d1 <- defData(varname = "x1", formula = 0, variance = 1)
+# d1 <- defData(d1, varname = "b1", formula = 0.5, dist = "binary")
+#
+# coefs <- log(runif(2, min = .8, max = 1.2))
+#
+# ### Prevalence
+#
+# d1a <- defData(d1, varname = "y",
+# formula = "t(..B) %*% c(1, x1, b1)",
+# dist = "binary", link = "logit"
+# )
+#
+# tPop <- round(runif(1, .2, .5), 2)
+# B <- logisticCoefs(defCovar = d1, coefs = coefs, popPrev = tPop)
+#
+# dd <- genData(100000, d1a)
+# expect_equal(dd[, mean(y)], tPop, tolerance = .025)
+#
+# #### Comparisons
+#
+# d1a <- defData(d1, varname = "rx", formula = "1;1", dist = "trtAssign")
+# d1a <- defData(d1a, varname = "y",
+# formula = "t(..B) %*% c(1, rx, x1, b1)",
+# dist = "binary", link = "logit"
+# )
+#
+# ### Risk ratio
+#
+# rr <- runif(1, .1, 1/tPop)
+# B <- logisticCoefs(d1, coefs, popPrev = tPop, rr = rr, trtName = "rx")
+#
+# dd <- genData(100000, d1a)
+# expect_equal(dd[rx==0, mean(y)], tPop, tolerance = .025)
+# expect_equal(dd[rx==1, mean(y)]/dd[rx==0, mean(y)], rr, tolerance = 0.025)
+#
+# ### risk difference
+#
+# rd <- runif(1, -tPop, 1 - tPop)
+# B <- logisticCoefs(d1, coefs, popPrev = tPop, rd = rd, trtName = "rx")
+#
+# dd <- genData(100000, d1a)
+# expect_equal(dd[rx==0, mean(y)], tPop, tolerance = .025)
+# expect_equal(dd[rx==1, mean(y)] - dd[rx==0, mean(y)], rd, tolerance = 0.025)
+#
+# ### AUC
+#
+# d1a <- defData(d1, varname = "y",
+# formula = "t(..B) %*% c(1, x1, b1)",
+# dist = "binary", link = "logit"
+# )
+#
+# auc <- runif(1, 0.6, 0.95)
+# B <- logisticCoefs(d1, coefs, popPrev = tPop, auc = auc)
+#
+# dx <- genData(500000, d1a)
+# expect_equal(dx[, mean(y)], tPop, tolerance = .025)
+#
+# form <- paste("y ~", paste(d1[, varname], collapse = " + "))
+#
+# fit <- stats::glm(stats::as.formula(form), data = dx)
+# dx[, py := stats::predict(fit)]
+#
+# Y1 <- dx[y == 1, sample(py, 1000000, replace = TRUE)]
+# Y0 <- dx[y == 0, sample(py, 1000000, replace = TRUE)]
+# aStat <- mean(Y1 > Y0)
+#
+# expect_equal(aStat, auc, tolerance = 0.025)
+#
+# })
+
+test_that("logisticCoefs throws errors.", {
+
+ skip_on_cran()
+
+ d1 <- defData(varname = "x1", formula = 0, variance = 1)
+ d1 <- defData(d1, varname = "b1", formula = 0.5, dist = "binary")
+
+ coefs <- log(runif(2, min = .8, max = 1.2))
+ coefs2 <- log(runif(1, min = .8, max = 1.2))
+ coefs3 <- c("a", "b")
+
+ expect_error(logisticCoefs(d1, coefs), class = "simstudy::missingArgument")
+ expect_error(logisticCoefs(coef = coefs, popPrev = .5), class = "simstudy::missingArgument")
+ expect_error(logisticCoefs(defCovar = d1, popPrev = .5), class = "simstudy::missingArgument")
+ expect_error(logisticCoefs(d1, coefs, popPrev = .5, rr = 1.1, rd = .4), class = "simpleError")
+ expect_error(logisticCoefs(d1, coefs=coefs2, popPrev = .5), class = "simstudy::lengthMismatch" )
+ expect_error(logisticCoefs(d1, coefs=coefs, popPrev = .5, rr = -1), class = "simstudy::minError" )
+ expect_error(logisticCoefs(d1, coefs=coefs, popPrev = .5, rr = 2.1), class = "simpleError" )
+ expect_error(logisticCoefs(d1, coefs=coefs, popPrev = .5, rd = .6), class = "simstudy::valueError" )
+ expect_error(logisticCoefs(d1, coefs=coefs, popPrev = .5, rd = -.7), class = "simstudy::valueError" )
+ expect_error(logisticCoefs(d1, coefs=coefs, popPrev = .5, auc = .4), class = "simstudy::valueError" )
+ expect_error(logisticCoefs(d1, coefs=coefs, popPrev = 1.2), class = "simstudy::valueError" )
+ expect_error(logisticCoefs(d1, coefs=coefs3, popPrev = .4), class = "simstudy::wrongType" )
+
+})
diff --git a/vignettes/clustered.Rmd b/vignettes/clustered.Rmd
index 7e9ad7a..144f02a 100644
--- a/vignettes/clustered.Rmd
+++ b/vignettes/clustered.Rmd
@@ -7,6 +7,10 @@ vignette: >
\usepackage[utf8]{inputenc}
---
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
+
```{r, echo = FALSE, message = FALSE}
library(simstudy)
library(ggplot2)
diff --git a/vignettes/corelationmat.Rmd b/vignettes/corelationmat.Rmd
index 816af70..60f2e07 100644
--- a/vignettes/corelationmat.Rmd
+++ b/vignettes/corelationmat.Rmd
@@ -7,6 +7,9 @@ vignette: >
\usepackage[utf8]{inputenc}
---
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
```{r, echo = FALSE, message = FALSE}
library(simstudy)
diff --git a/vignettes/correlated.Rmd b/vignettes/correlated.Rmd
index 9fd6ffe..5e0b48e 100644
--- a/vignettes/correlated.Rmd
+++ b/vignettes/correlated.Rmd
@@ -7,6 +7,10 @@ vignette: >
\usepackage[utf8]{inputenc}
---
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
+
```{r, echo = FALSE, message = FALSE}
library(simstudy)
library(ggplot2)
diff --git a/vignettes/double_dot_extension.Rmd b/vignettes/double_dot_extension.Rmd
index 32a1c82..0ce7df5 100644
--- a/vignettes/double_dot_extension.Rmd
+++ b/vignettes/double_dot_extension.Rmd
@@ -7,6 +7,10 @@ vignette: >
\usepackage[utf8]{inputenc}
---
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
+
```{r, echo = FALSE, message = FALSE}
options(digits = 3)
@@ -196,7 +200,7 @@ d1 <- defData(d1, varname = "outcome", formula = "..effect[a, b]", dist="nonrand
```
```{r}
-dx <- genData(8, d1)
+dx <- genData(1000, d1)
dx
```
@@ -215,7 +219,7 @@ dsum <- dx[, .(outcome=mean(outcome)), keyby = .(a, b)]
ggplot(data = dx, aes(x = factor(a), y = outcome)) +
geom_jitter(aes(color = factor(b)), width = .2, alpha = .4, size = .2) +
geom_point(data = dsum, size = 2, aes(color = factor(b))) +
- geom_line(data = dsum, size = 1, aes(color = factor(b), group = factor(b))) +
+ geom_line(data = dsum, linewidth = 1, aes(color = factor(b), group = factor(b))) +
scale_color_manual(values = cbbPalette, name = " b") +
theme(panel.grid = element_blank()) +
xlab ("a")
diff --git a/vignettes/logisticCoefs.Rmd b/vignettes/logisticCoefs.Rmd
new file mode 100644
index 0000000..7e8c843
--- /dev/null
+++ b/vignettes/logisticCoefs.Rmd
@@ -0,0 +1,248 @@
+---
+title: "Targeted logistic model coefficients"
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{Logistic Model Simulation}
+ %\VignetteEngine{knitr::rmarkdown}
+ \usepackage[utf8]{inputenc}
+---
+
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
+
+```{r, echo = FALSE, message = FALSE}
+library(simstudy)
+library(ggplot2)
+library(scales)
+library(grid)
+library(gridExtra)
+library(data.table)
+
+options(digits = 2)
+```
+
+In `simstudy`, there are at least two ways to define a binary data generating process. The first is to operate on the scale of the proportion or probability using the *identity* link. This allows users to define a data generating process that reflects assumptions about risk ratios and risk differences when comparing two groups defined by an exposure or treatment. However, this process can become challenging when introducing other covariates, because it can be difficult to constrain the probabilities so that they fall between 0 and 1.
+
+The second approach works on the log-odds scale using a *logit* link, and is much more amenable to accommodating covariates. Unfortunately, this comes at the price of no longer being able to easily generate specific risk ratios and risk differences, because all parameters are log-odds ratios. The overall (marginal) prevalence of an outcome in a population will vary depending on the distribution of covariates in that population, and the strengths (both absolute and relative) of the association of those covariates with the outcome. That is, the coefficients of a logistic model (including the intercept) determine the prevalence. The same is true regarding the risk ratio and risk difference (if there is one particular exposure or treatment of interest) and the AUC.
+
+Here we start with the simplest case where we have a target marginal proportion or prevalence, and then illustrate data generation with three other target statistics: **risk ratios**, **risk differences**, and **AUCs**.
+
+### Prevalence
+
+In this first example, we start with one set of assumptions for four covariates $x_1, x_2 \sim N(0, 1)$, $b_1 \sim Bin(0.3)$, and $b_2 \sim Bin(0.7)$, and generate the outcome *y* with the following data generating process:
+
+$$ \text{logit}(y) = 0.15x_1 + 0.25x_2 + 0.10b_1 + 0.30b_2$$
+
+
+```{r}
+library(simstudy)
+library(ggplot2)
+library(data.table)
+
+coefs1 <- c(0.15, 0.25, 0.10, 0.30)
+
+d1 <- defData(varname = "x1", formula = 0, variance = 1)
+d1 <- defData(d1, varname = "x2", formula = 0, variance = 1)
+d1 <- defData(d1, varname = "b1", formula = 0.3, dist = "binary")
+d1 <- defData(d1, varname = "b2", formula = 0.7, dist = "binary")
+
+d1a <- defData(d1, varname = "y",
+ formula = "t(..coefs1) %*% c(x1, x2, b1, b2)",
+ dist = "binary", link = "logit")
+
+set.seed(48392)
+
+dd <- genData(500000, d1a)
+dd
+```
+
+The overall proportion of $y=1$ in this case is
+
+```{r}
+dd[, mean(y)]
+```
+
+If we have a desired marginal proportion of 0.40, then we can add an intercept of -0.66 to the data generating process:
+
+$$ \text{logit}(y) = -0.66 + 0.15x_1 + 0.25x_2 + 0.10b_1 + 0.30b_2$$
+
+The simulation now gives us the desired target:
+
+```{r}
+d1a <- defData(d1, varname = "y",
+ formula = "t(c(-0.66, ..coefs1)) %*% c(1, x1, x2, b1, b2)",
+ dist = "binary", link = "logit")
+
+genData(500000, d1a)[, mean(y)]
+```
+
+If we change the distribution of the covariates, so that $x_1 \sim N(1, 1)$, $x_2 \sim N(3, 1)$, $b_1 \sim Bin(0.5)$, and $b_2 \sim Bin(0.8)$, and the strength of the association of these covariates with the outcome so that
+
+$$ \text{logit}(y) = 0.20x_1 + 0.35x_2 + 0.20b_1 + 0.45b_2,$$
+
+the marginal proportion/prevalence (assuming no intercept term) also changes, going from 0.56 to 0.84:
+
+```{r}
+coefs2 <- c(0.20, 0.35, 0.20, 0.45)
+
+d2 <- defData(varname = "x1", formula = 1, variance = 1)
+d2 <- defData(d2, varname = "x2", formula = 3, variance = 1)
+d2 <- defData(d2, varname = "b1", formula = 0.5, dist = "binary")
+d2 <- defData(d2, varname = "b2", formula = 0.8, dist = "binary")
+
+d2a <- defData(d2, varname = "y",
+ formula = "t(..coefs2) %*% c(x1, x2, b1, b2)",
+ dist = "binary", link = "logit")
+
+genData(500000, d2a)[, mean(y)]
+```
+
+But under this new distribution, adding an intercept of -2.13 yields the desired target.
+
+$$ \text{logit}(y) = -2.13 + 0.20x_1 + 0.35x_2 + 0.20b_1 + 0.45b_2 $$
+
+
+
+```{r}
+d2a <- defData(d2, varname = "y",
+ formula = "t(c(-2.13, ..coefs2)) %*% c(1, x1, x2, b1, b2)",
+ dist = "binary", link = "logit")
+
+genData(500000, d2a)[, mean(y)]
+```
+
+#### Finding the intercept
+
+Where did those two intercepts come from? The [paper](https://bmcmedresmethodol.biomedcentral.com/articles/10.1186/s12874-023-01836-5){target="_blank"} by Peter Austin describes an iterative bisection procedure that takes a distribution of covariates and a set of coefficients to identify the intercept coefficient that yields the target marginal proportion or prevalence.
+
+The general idea of the algorithm is to try out a series of different intercepts in an intelligent way that ends up at the right spot. (If you want the details for the algorithm, take a look at the [paper](https://bmcmedresmethodol.biomedcentral.com/articles/10.1186/s12874-023-01836-5){target="_blank"}.) The starting search range is pre-defined (we've used -10 to 10 for the intercept), and we start with a value of 0 for the initial intercept and simulate a large data set (the paper uses 1 million observations, but 100,000 seems to work just fine) and record the population prevalence. If we've overshot the target prevalence, we turn our attention to the range between -10 and 0, taking the average, which is -5. Otherwise, we focus on the range between 0 and 10. We iterate this way, choosing the range we need to focus on and setting the intercept at the mid-point (hence the name *bisection*). The algorithm will converge pretty quickly on the value of the intercept that gives the target population prevalence for the underlying covariate distribution and coefficient assumptions.
+
+In the current implementation in `simstudy`, the intercept is provided by a simple call to `logisticCoefs`. Here are the calls for the two sets of definitions in definition tables *d1* and *d2*.
+
+```{r}
+logisticCoefs(defCovar = d1, coefs = coefs1, popPrev = 0.40)
+logisticCoefs(defCovar = d2, coefs = coefs2, popPrev = 0.40)
+```
+
+### Risk ratios
+
+Just as the prevalence depends on the distribution of covariates and their association with the outcome, risk ratios comparing the outcome probabilities for two groups also depend on the additional covariates. The marginal risk ratio comparing treatment ($A = 1$) to control ($A = 0$) (given the distribution of covariates) is
+
+$$RR = \frac{P(y=1 | A = 1)}{P(y=1 | A = 0)}$$
+In the data generation process we use a log-odds ratio of -0.40 (odds ratio of approximately 0.67) in both cases, but we get different risk ratios (0.82 vs. 0.93), depending on the covariates (defined in *d1* and *d2*).
+
+```{r}
+d1a <- defData(d1, varname = "rx", formula = "1;1", dist = "trtAssign")
+d1a <- defData(d1a, varname = "y",
+ formula = "t(c(-0.40, ..coefs1)) %*% c(rx, x1, x2, b1, b2)",
+ dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+dd[rx==1, mean(y)]/dd[rx==0, mean(y)]
+```
+
+```{r}
+d2a <- defData(d2, varname = "rx", formula = "1;1", dist = "trtAssign")
+d2a <- defData(d2a, varname = "y",
+ formula = "t(c(-0.40, ..coefs2)) %*% c(rx, x1, x2, b1, b2)",
+ dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d2a)
+dd[rx==1, mean(y)]/dd[rx==0, mean(y)]
+```
+
+By specifying both a population prevalence and a target risk ratio in the call to `logisticCoefs`, we can get the necessary parameters. When specifying the target risk ratio, it is required to be between 0 and 1/popPrev. A risk ratio cannot be negative, and the probability of the outcome under treatment cannot exceed 1 (which will happen if the risk ratio is greater than 1/popPrev).
+
+```{r}
+C1 <- logisticCoefs(d1, coefs1, popPrev = 0.40, rr = 0.85)
+C1
+```
+
+If we use $C_1$ in the data generation process, we will get a data set with the desired target prevalence and risk ratio:
+
+```{r}
+d1a <- defData(d1, varname = "rx", formula = "1;1", dist = "trtAssign")
+d1a <- defData(d1a, varname = "y",
+ formula = "t(..C1) %*% c(1, rx, x1, x2, b1, b2)",
+ dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+```
+
+Here are the prevalence and risk ratio:
+
+```{r}
+dd[rx==0, mean(y)]
+dd[rx==1, mean(y)]/dd[rx==0, mean(y)]
+```
+
+You can do the same for the second set of assumptions.
+
+### Risk differences
+
+Risk differences have the same set of issues, and are handled in the same way. The risk difference is defined as
+
+$$ RD = P(y=1 | A = 1) - P(y=1 | A = 0)$$
+
+To get the coefficients related to a population prevalence of 0.40 and risk difference of -0.15 (so that the proportion in the exposure arm is 0.25), we use the *rd* argument:
+
+```{r}
+C1 <- logisticCoefs(d1, coefs1, popPrev = 0.40, rd = -0.15)
+C1
+```
+
+Again, using $C_1$ in the data generation process, we will get a data set with the desired target prevalence and risk difference:
+
+```{r}
+d1a <- defData(d1, varname = "rx", formula = "1;1", dist = "trtAssign")
+d1a <- defData(d1a, varname = "y",
+ formula = "t(..C1) %*% c(1, rx, x1, x2, b1, b2)",
+ dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+
+dd[rx==0, mean(y)]
+dd[rx==1, mean(y)] - dd[rx==0, mean(y)]
+```
+
+### AUC
+
+The AUC is another commonly used statistic to evaluate a logistic model. We can use `logisticCoefs` to find the parameters that will allow us to generate data from a model with a specific AUC. To get the coefficients related to a population prevalence of 0.40 and an AUC of 0.85, we use the *auc* argument (which must be between 0.5 and 1):
+
+```{r}
+C1 <- logisticCoefs(d1, coefs1, popPrev = 0.40, auc = 0.85)
+C1
+```
+
+Again, using $C_1$ in the data generation process, we will get a data set with the desired target prevalence and the AUC (calculated here using the `lrm` function in the `rms` package):
+
+```{r}
+d1a <- defData(d1, varname = "y",
+ formula = "t(..C1) %*% c(1, x1, x2, b1, b2)",
+ dist = "binary", link = "logit"
+)
+
+dd <- genData(500000, d1a)
+
+dd[, mean(y)]
+
+fit <- rms::lrm(y ~ x1 + x2 + b1 + b2, data = dd)
+fit$stats["C"]
+
+```
+
+
+
+References:
+
+Austin, Peter C. "The iterative bisection procedure: a useful
+tool for determining parameter values in data-generating processes in
+Monte Carlo simulations." BMC Medical Research Methodology 23,
+no. 1 (2023): 1-10.
+
+
\ No newline at end of file
diff --git a/vignettes/longitudinal.Rmd b/vignettes/longitudinal.Rmd
index 084c2f5..7032300 100644
--- a/vignettes/longitudinal.Rmd
+++ b/vignettes/longitudinal.Rmd
@@ -7,6 +7,10 @@ vignette: >
\usepackage[utf8]{inputenc}
---
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
+
```{r, echo = FALSE, message = FALSE}
library(simstudy)
library(ggplot2)
diff --git a/vignettes/missing.Rmd b/vignettes/missing.Rmd
index 4aa2f74..ce4f5d8 100644
--- a/vignettes/missing.Rmd
+++ b/vignettes/missing.Rmd
@@ -7,6 +7,10 @@ vignette: >
\usepackage[utf8]{inputenc}
---
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
+
```{r, echo = FALSE, message = FALSE}
library(simstudy)
library(ggplot2)
diff --git a/vignettes/ordinal.Rmd b/vignettes/ordinal.Rmd
index 162d5f6..ba365ff 100644
--- a/vignettes/ordinal.Rmd
+++ b/vignettes/ordinal.Rmd
@@ -7,6 +7,10 @@ vignette: >
\usepackage[utf8]{inputenc}
---
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
+
```{r, echo = FALSE, message = FALSE}
library(simstudy)
library(ggplot2)
diff --git a/vignettes/simstudy.Rmd b/vignettes/simstudy.Rmd
index 654a157..a97541b 100644
--- a/vignettes/simstudy.Rmd
+++ b/vignettes/simstudy.Rmd
@@ -7,6 +7,10 @@ vignette: >
\usepackage[utf8]{inputenc}
---
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
+
```{r, echo = FALSE, message = FALSE}
library(simstudy)
diff --git a/vignettes/spline.Rmd b/vignettes/spline.Rmd
index 5a18e33..8d07ae4 100644
--- a/vignettes/spline.Rmd
+++ b/vignettes/spline.Rmd
@@ -7,6 +7,10 @@ vignette: >
\usepackage[utf8]{inputenc}
---
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
+
```{r, echo = FALSE, message = FALSE}
library(simstudy)
library(ggplot2)
diff --git a/vignettes/survival.Rmd b/vignettes/survival.Rmd
index c59f1b6..25b48ab 100644
--- a/vignettes/survival.Rmd
+++ b/vignettes/survival.Rmd
@@ -7,6 +7,9 @@ vignette: >
\usepackage[utf8]{inputenc}
---
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
```{r, echo = FALSE, message = FALSE}
diff --git a/vignettes/treat_and_exposure.Rmd b/vignettes/treat_and_exposure.Rmd
index 3420bf9..f19b3e0 100644
--- a/vignettes/treat_and_exposure.Rmd
+++ b/vignettes/treat_and_exposure.Rmd
@@ -7,6 +7,10 @@ vignette: >
\usepackage[utf8]{inputenc}
---
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
+
```{r, echo = FALSE, message=FALSE}
library(simstudy)