version 0.1.0

cran · Mar 18, 2020 · 0fc5009 · 0fc5009
commit 0fc5009
Show file tree

Hide file tree

Showing 36 changed files with 1,542 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,23 @@
+Package: cfdecomp
+Type: Package
+Title: Counterfactual Decomposition: MC Integration of the G-Formula
+Version: 0.1.0
+Authors@R: c(
+    person("Maarten", "Bijlsma", role=c("aut", "cre"), email=  "maarten.bijlsma@gmail.com"),
+    person("Nikkil", "Sudharsanan", role=c("aut")),
+    person("Peng", "Li", role=c("ctb"))
+    )
+Maintainer: Maarten Bijlsma <maarten.bijlsma@gmail.com>
+Description: Provides a set of functions for counterfactual decomposition (cfdecomp). The functions available in this package decompose differences in an outcome attributable to a mediating variable (or sets of mediating variables) between groups based on counterfactual (causal inference) theory. By using Monte Carlo (MC) integration (simulations based on empirical estimates from multivariable models) we provide added flexibility compared to existing (analytical) approaches, at the cost of computational power or time. The added flexibility means that we can decompose difference between groups in any outcome or and with any mediator (any variable type and distribution). See Sudharsanan & Bijlsma (2019) <doi:10.4054/MPIDR-WP-2019-004> for more information.
+Depends: R (>= 3.5.0)
+License: GPL-3
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 7.0.2
+NeedsCompilation: no
+Packaged: 2020-03-10 14:30:12 UTC; bijlsma
+Author: Maarten Bijlsma [aut, cre],
+  Nikkil Sudharsanan [aut],
+  Peng Li [ctb]
+Repository: CRAN
+Date/Publication: 2020-03-18 13:40:02 UTC
diff --git a/MD5 b/MD5
@@ -0,0 +1,35 @@
+f6f9a781c753eb20e9411ec7a308e4b2 *DESCRIPTION
+eef28e582a1e2fbe695d599bfc2330fd *NAMESPACE
+ee882d7bd6abc7c00a353eb47774a380 *NEWS.md
+d4b39d67a5235dd293ddeef7e62aae16 *R/cfd.example.data.R
+0a1e0e4804ee07a41ca282d6bfe9582d *R/cfd.mean.R
+a1848fb893db45320191e4b43f2162b8 *R/cfd.quantile.R
+cfa33a54ceeab2e31a6274e3eaefda86 *R/cfd.semipar.mean.R
+d13281ce1c66b59fac640721e6fe6189 *R/cfd.semipar.quantile.R
+d658025abccaddc47e34201e43dff375 *R/conv.mean.R
+5274a8ead61f4f42923aa64d71e00a6a *README.md
+0c88d36cb420add065bb8d532b3f3169 *data/cfd.example.data.rda
+a64fa4a2b5280a79150a6c0c64159940 *man/cfd.example.data.Rd
+9166e7bc9090b0fa135926eab0d16710 *man/cfd.mean.Rd
+27831b9092dcf2ba1a672d6b3906a81c *man/cfd.quantile.Rd
+4cd301ed860b2001026dde00737725d2 *man/cfd.semipar.mean.Rd
+2342900d1143f43b4064ea56701cff2d *man/cfd.semipar.quantile.Rd
+6a6abeb9971d4484c6b91efcbcf91dda *man/conv.mean.Rd
+e4e2441cf579ab55d8931e3e5c97913d *man/figures/README-example-1.png
+30b62360cb1ca242763d35ba0561a78d *man/figures/README-example-10.png
+c655181e67bdc2372dfb2092ce2dff8a *man/figures/README-example-11.png
+3adf56e5631e4fd9e0c2d6571c43f012 *man/figures/README-example-12.png
+d65ad07144fd97559432ea3e977b9dfa *man/figures/README-example-13.png
+93592795f73e39e089802cb0b46520a8 *man/figures/README-example-14.png
+511310b0d64348e68da0c0f2fa0eac33 *man/figures/README-example-15.png
+dc64b70c02d9cf978f0167189543bd21 *man/figures/README-example-16.png
+09fb8f2bc166048a89277d59b052835f *man/figures/README-example-17.png
+4b40fbdbb597346c144af717c1632994 *man/figures/README-example-18.png
+c2720f7429bc2151d4f24e1b8bf261cf *man/figures/README-example-2.png
+3e829ca0dc06d0c2044ccd5df68ade9d *man/figures/README-example-3.png
+3e9f560073b44d6847ee5e2cc3661bf7 *man/figures/README-example-4.png
+6ac25130f31f497c0bdfa8bb2f990d3b *man/figures/README-example-5.png
+e10ed9cacc4f171ba9529bdd85fb5adb *man/figures/README-example-6.png
+76c166804ac8d428a57f3e58fdaba872 *man/figures/README-example-7.png
+63975c2bc601b43491a85ea041c7fe41 *man/figures/README-example-8.png
+a85c5e594bbdbbb5d0a67c27d876ae60 *man/figures/README-example-9.png
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,8 @@
+# Generated by roxygen2: do not edit by hand
+
+export(cfd.mean)
+export(cfd.quantile)
+export(cfd.semipar.mean)
+export(cfd.semipar.quantile)
+export(conv.mean)
+import(stats)
diff --git a/NEWS.md b/NEWS.md
@@ -0,0 +1,6 @@
+
+# cfdecomp v0.1.0
+
+This is a first release of the cfdecomp package, which has so far only been available on github under the name cfdecomp.
+
+See DESCRIPTION for more information.
diff --git a/R/cfd.example.data.R b/R/cfd.example.data.R
@@ -0,0 +1,14 @@
+#' @title Example Data for the cfdecomp package
+#'
+#' @description A dataframe with artificially generated data, intended to be used in a demonstration of the functions in this package.
+#'
+#' @format a dataframe with 6 columns, which are:
+#' \describe{
+#' \item{SES}{a factor variable with 3 levels, denoting three groups to be compared. SES stands for socio-economic status.}
+#' \item{age}{a continuous variable going from 40 to 65. Age could stand for age in years.}
+#' \item{med.gauss}{a mediator that is a continuous variable and normally distributed.}
+#' \item{med.binom}{a mediator that is binomial.}
+#' \item{med.pois}{a mediator that is a count variable and poisson distributed.}
+#' \item{out.gauss}{an outcome that is a continuous variable and normally distributed.}
+#' }
+"cfd.example.data"
diff --git a/R/cfd.mean.R b/R/cfd.mean.R
@@ -0,0 +1,191 @@
+#'
+#' @title Mean Decomposition: parametric version
+#'
+#' @description Decompose the mean difference in outcome Y between groups.
+#'
+#' @param formula.y the \code{\link{formula}} for the multivariable model (see \code{\link{glm}}) for the outcome Y.
+#' @param formula.m the \code{\link{formula}} for the multivariable model (see \code{\link{glm}}) for the mediator M.
+#' @param mediator the column name of the mediator M.
+#' @param group column name of a factor variable containing the group identifier.
+#' @param data a data frame containing the variables in the model.
+#' @param family.y a description of the error distribution to be used in the model, see \code{\link{family}} for details. For the outcome variable any member of the \code{glm} family can be used.
+#' @param family.m a description of the error distribution to be used in the model, see \code{\link{family}} for details. For the mediator, currently \code{gaussian}, \code{binomial} and \code{poisson} are supported.
+#' @param bs.size the number of bootstrap iterations to be performed.
+#' @param mc.size the number of Monte Carlo iterations to be performed (more = more MC error reduction).
+#' @param alpha the alpha level used to construct confidence intervals (0.05 = 95 percent confidence interval).
+#' @param sample.resid if the \code{mediator} is Gaussian, should the simulation sample from the residuals of the linear regression model of the mediator to approximate the empirical distribution of the mediator in the simulation (Monte Carlo integration) (if so, set to \code{TRUE}), or should it sample from a Gaussian distribution with the standard deviation of the mediator? If the true distribution of the continuous mediator is not very Gaussian, the former may be preferred.
+#' @param print.iteration print the bootstrap iteration
+#'
+#' @return \code{out_nc_m} returns the mean level of the mediator under the natural course, which is a value that should be close to the empirically observed value of the mediator for each group. \code{out_nc_quantile} provides the \code{alpha/2} and \code{1-alpha/2} bootstrap quantiles for this mean (AKA bootstrap percentile confidence intervals). \code{out_nc_y} and \code{out_nc_quantile_y} provide the corresponding values, but then for the outcome variable Y. Similarly, \code{out_cf_m}, \code{out_cf_quantile_m},\code{out_cf_y}, and \code{out_cf_quantile_y} provide the corresponding values for the counterfactual scenario where the mediators of the groups are equalized. \code{mediation} returns the proportion mediated by setting the intervened on mediator to be equal in level to the reference group and \code{mediation_quantile} returns the 1-alpha confidence interval. \code{mc_conv_info_m} and \code{mc_conv_info_y} provide information that can help determine the number of Monte Carlo and Bootstrap iterations needed to achieve stability. See the \code{Examples} for more information.
+#' @export
+#'
+#' @examples
+#' set.seed(100)
+#' # the decomposition functions in our package are computationally intensive
+#' # to make the example run quick, I perform it on a subsample (n=250) of the data:
+#' cfd.example.sample <- cfd.example.data[sample(250),]
+#' mean.results.1 <- cfd.mean(formula.y='out.gauss ~ SES + med.gauss + med.binom + age',
+#'                            formula.m='med.gauss ~ SES + age',
+#'                            mediator='med.gauss',
+#'                            group='SES',
+#'                            data=cfd.example.sample,
+#'                            family.y='gaussian',
+#'                            family.m='gaussian',
+#'                            bs.size=50,
+#'                            mc.size=10,
+#'                            alpha=0.05)
+#' # also note that normally we would recommend an bs.size of 250+
+#' # and an mc.size of 50+
+#' # let's interpret the output of this function:
+#' mean(mean.results.1$out_nc_y[,2] - mean.results.1$out_nc_y[,1])
+#' # and after giving the gaussian mediator of SES group 2 the distribution of the one in group 1
+#' # the difference becomes:
+#' mean(mean.results.1$out_cf_y[,2] - mean.results.1$out_nc_y[,1])
+#' # so the % of the outcome Y that is due to differences between the two SES groups
+#' # in the gaussian mediator is
+#' mean(1-(mean.results.1$out_cf_y[,2] - mean.results.1$out_nc_y[,1]) /
+#' (mean.results.1$out_nc_y[,2] - mean.results.1$out_nc_y[,1]))
+#' # we can also get this number, and the one from the comparison of the other SES group
+#' # with group 1, straight from the object
+#' mean.results.1$mediation
+#' # and we can get the 1-alpha CI for each:
+#' mean.results.1$mediation_quantile
+#' # see README.md for a more detailed description of the functions in this package.
+#' @import stats
+#'
+#'
+cfd.mean <- function(formula.y,formula.m,mediator,group,
+                     data,
+                     family.y = 'binomial',
+                     family.m = 'binomial',
+                     bs.size = 1000,
+                     mc.size=50,
+                     alpha=0.05,
+                     sample.resid=FALSE,
+                     print.iteration=FALSE) {
+
+  ## envir check
+  call <- match.call()
+  if (missing(data))
+    data <- environment(formula.y)
+  mf <- match.call(expand.dots = FALSE)
+
+  ## get initial fit of outcome y
+  ini_fit.y <- glm(formula = formula.y,
+                   data=data, family=family.y)
+  ini_fit.y$model_matrix <- model.matrix(as.formula(formula.y), data = data)
+  fam.y <- family(ini_fit.y)
+
+  ## get initial fit of mediator m
+  ini_fit.m <- glm(formula = formula.m,
+                   data=data, family=family.m)
+  ini_fit.m$model_matrix <- model.matrix(as.formula(formula.m), data = data)
+  fam.m <- family(ini_fit.m)
+
+  ## loop over bootstrap samples
+  out_nc_m <- matrix(NA,nrow=bs.size,ncol=nlevels(data[,group]),dimnames = list(1:bs.size,levels(data[,group])))
+  out_cf_m <- matrix(NA,nrow=bs.size,ncol=nlevels(data[,group]),dimnames = list(1:bs.size,levels(data[,group])))
+  out_nc_y <- matrix(NA,nrow=bs.size,ncol=nlevels(data[,group]),dimnames = list(1:bs.size,levels(data[,group])))
+  out_cf_y <- matrix(NA,nrow=bs.size,ncol=nlevels(data[,group]),dimnames = list(1:bs.size,levels(data[,group])))
+
+  ## temp matrix to save mc results
+  temp_nc_m <- matrix(NA, nrow=mc.size, ncol=nlevels(data[,group]))
+  temp_cf_m <- matrix(NA, nrow=mc.size, ncol=nlevels(data[,group]))
+  temp_nc_y <- matrix(NA, nrow=mc.size, ncol=nlevels(data[,group]))
+  temp_cf_y <- matrix(NA, nrow=mc.size, ncol=nlevels(data[,group]))
+
+  for(i in 1:bs.size) {
+    if(print.iteration==TRUE){print(i)}
+
+    ## resampling
+    resample_ind <- sample(nrow(data), replace=TRUE)
+    data_bs <- data[resample_ind,]
+
+    ## refit models for y and m
+    bs_fit.y <- update(ini_fit.y, data = data_bs)
+    coef_bs.y <- coef(bs_fit.y)
+
+    bs_fit.m <- update(ini_fit.m, data = data_bs)
+    coef_bs.m <- coef(bs_fit.m)
+    if(family.m[1]=='gaussian' & sample.resid==FALSE) {sd.ref.m <- sd(bs_fit.m$residuals[data_bs[,group]==levels(data_bs[,group])[1]])}
+    if(family.m[1]=='gaussian' & sample.resid==TRUE) {resid.ref.m <- bs_fit.m$residuals[data_bs[,group]==levels(data_bs[,group])[1]]}
+
+    # save group identifier in an additional column since
+    # we overwrite it to create counterfactuals
+    data_bs$truegroup <- data_bs[,group]
+
+    ## start Monte Carlo loop
+    ## ! The parametric version does not equalize mediator values over strata
+    for(ii in 1:mc.size){
+
+      # save and overwrite data_mc since we create counterfactuals later
+      data_mc <- data_bs
+
+      ## natural course simulation ##
+      # simulate mediator
+      pred_mean_m <- predict(bs_fit.m,data_mc,type='response')
+      n <- length(pred_mean_m)
+      if(family.m[1]=='poisson') {data_mc[,mediator] <- rpois(n,pred_mean_m)} else if
+      (family.m[1]=='binomial') {data_mc[,mediator] <- rbinom(n,1,pred_mean_m)} else if
+      (family.m[1]=='gaussian' & sample.resid==FALSE) {data_mc[,mediator] <- rnorm(n,pred_mean_m,sd=sd.ref.m)} else if
+      (family.m[1]=='gaussian' & sample.resid==TRUE) {data_mc[,mediator] <- pred_mean_m + sample(resid.ref.m,n,replace=TRUE)}
+      temp_nc_m[ii,] <-  tapply(data_mc[,mediator], list(data_mc[,group]),mean,na.rm=TRUE)
+
+      # simulate outome
+      # ! Note that this function predicts means only because it compares means
+      # ! Hence the entire distribution of values (of the outcome) is not re-created
+      # ! If that is desired, see the function for the quantiles instead.
+      pred_mc_nc <- predict(bs_fit.y,data_mc,type='response')
+      temp_nc_y[ii,] <-  tapply(pred_mc_nc, list(data_mc[,group]),mean,na.rm=TRUE)
+
+      ## counterfactual simulation ##
+      # equalize distribution of mediator
+      # this is done by assigning everyone the reference group and then (re)predicting
+      data_mc[,group][which(data_mc[,group] %in% levels(data_mc[,group])[-1])] <- levels(data_mc[,group])[1]
+      pred_mean_m <- predict(bs_fit.m,data_mc,type='response')
+      if(family.m[1]=='poisson') {data_mc[,mediator] <- rpois(n,pred_mean_m)} else if
+      (family.m[1]=='binomial') {data_mc[,mediator] <- rbinom(n,1,pred_mean_m)} else if
+      (family.m[1]=='gaussian' & sample.resid==FALSE) {data_mc[,mediator] <- rnorm(n,pred_mean_m,sd=sd.ref.m)} else if
+      (family.m[1]=='gaussian' & sample.resid==TRUE) {data_mc[,mediator] <- pred_mean_m + sample(resid.ref.m,n,replace=TRUE)}
+      temp_cf_m[ii,] <-  tapply(data_mc[,mediator], list(data_mc$truegroup),mean,na.rm=TRUE)
+
+      # set group identifier back to true levels
+      data_mc[,group] <- data_mc$truegroup
+      # simulate outcome
+      pred_mc_cf <- predict(bs_fit.y,data_mc,type='response')
+      temp_cf_y[ii,] <-  tapply(pred_mc_cf, list(data_mc$truegroup),mean,na.rm=TRUE)
+
+    }
+
+    # for thef first bootstrap iteration, also save mc information
+    # to be used for convergence information
+    if(i==1) {
+      mc_conv_info_m <- apply(temp_nc_m,2,conv.mean)
+      mc_conv_info_y <- apply(temp_nc_y,2,conv.mean)
+    }
+
+    ## save results for this bootstrap iteration
+    out_nc_m[i,] <- apply(temp_nc_m,2,mean,na.rm=TRUE)
+    out_cf_m[i,] <- apply(temp_cf_m,2,mean,na.rm=TRUE)
+    out_nc_y[i,] <- apply(temp_nc_y,2,mean,na.rm=TRUE)
+    out_cf_y[i,] <- apply(temp_cf_y,2,mean,na.rm=TRUE)
+
+  }
+  return(list(out_nc_m=out_nc_m,
+              out_cf_m=out_cf_m,
+              out_nc_quantile_m=apply(out_nc_m,2,quantile,c(alpha/2,0.5,1-alpha/2)),
+              out_cf_quantile_m=apply(out_cf_m,2,quantile,c(alpha/2,0.5,1-alpha/2)),
+
+              out_nc_y=out_nc_y,
+              out_cf_y=out_cf_y,
+              out_nc_quantile_y=apply(out_nc_y,2,quantile,c(alpha/2,0.5,1-alpha/2)),
+              out_cf_quantile_y=apply(out_cf_y,2,quantile,c(alpha/2,0.5,1-alpha/2)),
+
+              mediation=apply(1-(out_cf_y - out_nc_y[,1]) / (out_nc_y - out_nc_y[,1]),2,mean)[-1],
+              mediation_quantile=apply(1-(out_cf_y - out_nc_y[,1]) / (out_nc_y - out_nc_y[,1]),2,quantile,probs=c(alpha/2,1-alpha/2),na.rm=TRUE)[,-1],
+
+              mc_conv_info_m=mc_conv_info_m,
+              mc_conv_info_y=mc_conv_info_y
+  )
+  )
+}