version 1.0.0

cran · Nov 8, 2022 · 0177557 · 0177557
commit 0177557
Show file tree

Hide file tree

Showing 39 changed files with 3,164 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,24 @@
+Package: optrefine
+Title: Optimally Refine Strata
+Version: 1.0.0
+Authors@R: 
+    person("Katherine", "Brumberg", , "kbrum@wharton.upenn.edu", role = c("aut", "cre"),
+           comment = c(ORCID = "0000-0002-5193-6250"))
+Description: Splits initial strata into refined strata that optimize covariate balance. For more information, please email the author for a copy of the accompanying manuscript. To solve the linear program, the 'Gurobi' commercial optimization software is recommended, but not required. The 'gurobi' R package can be installed following the instructions at <https://www.gurobi.com/documentation/9.1/refman/ins_the_r_package.html>.
+URL: https://github.com/kkbrum/optrefine,
+        https://kkbrum.github.io/optrefine/,
+        https://www.gurobi.com/documentation/9.1/refman/ins_the_r_package.html
+BugReports: https://github.com/kkbrum/optrefine/issues
+License: GPL (>= 3)
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 7.2.1
+Depends: R (>= 2.10), MASS, Rglpk, sampling, ggplot2
+Suggests: covr, gurobi, testthat (>= 3.0.0)
+Config/testthat/edition: 3
+NeedsCompilation: no
+Packaged: 2022-11-07 21:31:34 UTC; katherine
+Author: Katherine Brumberg [aut, cre] (<https://orcid.org/0000-0002-5193-6250>)
+Maintainer: Katherine Brumberg <kbrum@wharton.upenn.edu>
+Repository: CRAN
+Date/Publication: 2022-11-08 14:20:08 UTC
diff --git a/MD5 b/MD5
@@ -0,0 +1,38 @@
+1008c86f87f1cec8489d3621ffb84a0b *DESCRIPTION
+8fb9529835f8f530d0475b23c06fde16 *NAMESPACE
+745d8da0e01e6eb832f37a6401ffa41c *NEWS.md
+d36c1f69a84e2dc2b8396dceb074a056 *R/best_split.R
+ac87b09049dcc3a95849952f67492d50 *R/calc_smds.R
+e435e6b633e5dc5bb8822d7927b9b518 *R/data.R
+916a3b2236caf42e29db5981958e2d5f *R/plot.strat.R
+0897c5867eb877de28dc55d73ea883a2 *R/print.strat.R
+a402effda4b4550380e95d57a0e6f10f *R/prop_strat.R
+f187a75e17d28b2417852c02a0c9e198 *R/rand_pvals.R
+d54e57cbc65f12c2df1c8a32c4568c0f *R/refine.R
+033c3d830a9ba8467a215eff540e898d *R/split_stratum.R
+5deb71c328e61bab62bf2c6268ece6b9 *R/strat.R
+4a8c621c5d1869b340fcc229c6157ee1 *README.md
+c055510b95ea37125a153dd198e94f10 *data/rhc_X.rda
+490380f001dd2b8f071631d722c121bb *man/best_split.Rd
+c6789311b1ce886a17750f3ed1b94c1a *man/calc_smds.Rd
+194589f407262971d46a2a6d818e7851 *man/new_strat.Rd
+bccacc1bedbb368d34b2c3bb7730e19a *man/plot.strat.Rd
+86c2349e601409a289abbe144a38c0d9 *man/print.strat.Rd
+20b714e0ee6f267b55387dbcc7babb3b *man/prop_strat.Rd
+4ae39dc84751eafdafe804c1920e221c *man/rand_pvals.Rd
+a42526b0bcf091e4e720fe08c0477e43 *man/refine.Rd
+d06871afcd45d60be2be4b609000a12c *man/rhc_X.Rd
+6943b1cca35a0a32deeb832336a7e104 *man/split_stratum.Rd
+aeb5396c85e22aea95e68e48b08e0ce1 *man/strat.Rd
+333ac8228f933b9716ae395a74610942 *man/table_rand_pvals.Rd
+f25c919c6fadb0560a218fba5505dbf9 *man/validate_strat.Rd
+c821b7a00f4eabf22d695f1aa808c123 *tests/testthat.R
+dc2e1ff0e3a24aefa3df6e1fcfdf8d78 *tests/testthat/test-best_split.R
+6101428dbf7884370862bd992f969f63 *tests/testthat/test-calc_smds.R
+035968beace58e1b884ee789f7bf90e6 *tests/testthat/test-plot.R
+5c4836e1537ba854c33f06fc7d23947c *tests/testthat/test-print.R
+ca92e980837ce86f954bd3ec8ee308ef *tests/testthat/test-prop_strat.R
+209d6646b949f662d5ff9dc16ce26d20 *tests/testthat/test-rand_pvals.R
+2b58ce5caf2e99e031a834bac9541e26 *tests/testthat/test-refine.R
+5df7e229f1e70f6631903c98183dbf3d *tests/testthat/test-split_stratum.R
+43ef1c7a8282079f99513045cb670436 *tests/testthat/test-strat.R
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,17 @@
+# Generated by roxygen2: do not edit by hand
+
+S3method(plot,strat)
+S3method(print,strat)
+export(best_split)
+export(calc_smds)
+export(prop_strat)
+export(rand_pvals)
+export(refine)
+export(split_stratum)
+export(strat)
+export(table_rand_pvals)
+import(MASS)
+import(Rglpk)
+import(ggplot2)
+import(sampling)
+import(stats)
diff --git a/NEWS.md b/NEWS.md
@@ -0,0 +1,3 @@
+# optrefine 1.0.0
+
+* Added a `NEWS.md` file to track changes to the package.
diff --git a/R/best_split.R b/R/best_split.R
@@ -0,0 +1,113 @@
+#' Find the best split for a stratum
+#'
+#' Runs \code{\link{split_stratum}()} many times and selects the best result.
+#'
+#' @inheritParams split_stratum
+#' @param nc_list a list of choices for the `nc` parameter in \code{\link{split_stratum}()}.
+#' Each element is a vector with entries corresponding to the number of control
+#' units that should be placed in each new stratum
+#' @param nt_list a list of choices for the `nt` parameter in \code{\link{split_stratum}()}.
+#' Each element is a vector with entries corresponding to the number of treated
+#' units that should be placed in each new stratum
+#' @param min_split a numeric specifying the minimum number of each control and treated units
+#'  to be tolerated in a stratum. Any combination of elements
+#' from `nc_list` and `nt_list` that violate this are skipped
+#'
+#' @return A list containing the following elements:
+#' \itemize{
+#'     \item{valuesIP, valuesLP: }{matrices containing integer and linear programming
+#' scaled objective values for each sample size tried, with rows corresponding to the
+#' elements of `nc_list` and columns corresponding to the elements of `nt_list`}
+#'     \item{besti, bestj: }{indices of the best sample sizes in `nc_list` and in `nt_list`, respectively}
+#'     \item{n_smds: }{number of standardized mean differences contributing to the objective values
+#' (multiply the scaled objective values by this number to get the true objective values)}
+#'     \item{n_fracs: }{number of units with fractional LP solutions in the best split}
+#'     \item{rand_c_prop, rand_t_prop: }{proportions of the control and treated units in each
+#'     stratum that were selected with randomness for the best split}
+#'     \item{pr: }{linear programming solution for the best split,
+#'     with rows corresponding to the strata and columns to the units}
+#'     \item{selection: }{vector of selected strata for each unit
+#'     in the initial stratum to be split for the best split}
+#' }
+#'
+#' @export
+#' @examples
+#'
+#' # Generate a small data set
+#' set.seed(25)
+#' samp <- sample(1:nrow(rhc_X), 1000)
+#' cov_samp <- sample(1:26, 10)
+#'
+#' # Create some strata
+#' ps <- prop_strat(z = rhc_X[samp, "z"],
+#'                  X = rhc_X[samp, cov_samp], nstrata = 5)
+#'
+#' # Save the sample sizes
+#' tab <- table(ps$z, ps$base_strata)
+#'
+#' # Choose the best sample sizes among the options provided
+#' best_split(z = ps$z, X = ps$X, strata = ps$base_strata, ist = 1,
+#'            nc_list = list(c(floor(tab[1, 1] * 0.25), ceiling(tab[1, 1] * 0.75)),
+#'                           c(floor(tab[1, 1] * 0.4), ceiling(tab[1, 1] * 0.6))),
+#'            nt_list = list(c(floor(tab[2, 1] * 0.3), ceiling(tab[2, 1] * 0.7))),
+#'            min_split = 5)
+
+
+best_split <- function(z, X, strata, ist, nc_list, nt_list,
+                       wMax = 5, wEach = 1, solver = "Rglpk",
+                       integer = FALSE, min_split = 10, threads = threads){
+
+  stopifnot(length(ist) == 1)
+  if (!all(lapply(nc_list, sum) == sum(1-z[strata == ist]))) {
+    stop("Not all control sample size options sum to the number of controls", call. = FALSE)
+  }
+  if (!all(lapply(nt_list, sum) == sum(z[strata == ist]))) {
+    stop("Not all treated sample size options sum to the number of treated units", call. = FALSE)
+  }
+  o <- vector(mode="list", length = length(nc_list) * length(nt_list))
+  vIP <- matrix(NA, length(nc_list), length(nt_list),
+                dimnames = list("nc_option" = 1:length(nc_list), "nt_option" = 1:length(nt_list)))
+  vLP <- vIP
+  bestv <- Inf
+  besti <- NA
+  bestj <- NA
+  k <- 0
+  allBAD <- TRUE
+
+  for (i in 1:length(nc_list)) {
+    for (j in 1:length(nt_list)) {
+      nc <- nc_list[[i]]
+      nt <- nt_list[[j]]
+      k <- k+1
+      empty_st <- any((nc == 0) & (nt == 0))
+      if (is.null(min_split) || empty_st || (min(nc) >= min_split & min(nt) >= min_split)) {
+        allBAD <- FALSE # At least one split was possible
+        ok <- split_stratum(z = z, X = X, strata = strata,
+                            ist = ist, nc = nc, nt = nt,
+                            wMax = wMax, wEach = wEach,
+                            solver = solver, integer = integer,
+                            threads = threads)
+        o[[k]] <- ok
+        vIP[i,j] <- ok$valueIP
+        vLP[i,j] <- ok$valueLP
+        if (ok$valueIP < bestv) {
+          bestv <- ok$valueIP
+          besti <- i
+          bestj <- j
+          bestk <- k
+        }
+      }
+    }
+  }
+
+  if (allBAD){
+    stop("Error in call to best_split(). No pair of elements from nc_list and nt_list satisfied the min_split requirement.", call. = FALSE)
+  }
+  k <- bestk
+
+  return(list(valuesIP=vIP, valuesLP=vLP, besti=besti, bestj=bestj,
+              n_smds = o[[k]]$n_smds, n_fracs = o[[k]]$n_fracs,
+              rand_c_prop = o[[k]]$rand_c_prop, rand_t_prop = o[[k]]$rand_t_prop,
+              pr = o[[k]]$pr, selection=o[[k]]$selection))
+}
+
diff --git a/R/calc_smds.R b/R/calc_smds.R
@@ -0,0 +1,94 @@
+#' Calculate standardized mean differences for initial and refined strata
+#'
+#' Summarizes initial and/or refined strata in terms of standardized mean differences (SMDs).
+#'
+#' @inheritParams refine
+#' @param base_strata optional initial stratification for which to calculate SMDs;
+#' only used if `object` is not supplied
+#' @param refined_strata optional refined stratification for which to calculate SMDs;
+#' only used if `object` is not supplied
+#' @param abs boolean whether to return absolute standardized mean differences or raw values.
+#' Default is TRUE for absolute values
+#'
+#' @return List with two elements, "base" and "refined", each containing
+#' a matrix of standardized mean differences for each stratum (row) and covariate (column).
+#'
+#' @export
+#'
+#' @examples
+#' # Choose 500 patients and 5 covariates to work with for the example
+#' set.seed(15)
+#' samp <- sample(1:nrow(rhc_X), 500)
+#' cov_samp <- sample(1:26, 5)
+#'
+#' # Let it create propensity score strata for you and then refine them
+#' ref <- refine(X = rhc_X[samp, cov_samp], z = rhc_X[samp, "z"])
+#'
+#' # Look at covariate balance for propensity score and refined strata
+#' calc_smds(object = ref)
+#'
+
+calc_smds <- function(object = NULL, z = NULL, X = NULL,
+                            base_strata = NULL, refined_strata = NULL, abs = TRUE) {
+
+  if (is.null(object)) {
+    object <- strat(z = z, X = X, base_strata = base_strata, refined_strata = refined_strata)
+  }
+
+  if(!is.null(object$base_strata)) {
+    st <- object$base_strata
+    if(!is.factor(st)) {
+      st <- as.factor(st)
+    }
+    ust <- levels(st)
+    S <- length(ust)
+    base_smds <- matrix(NA, nrow = S, ncol = ncol(object$details$X_std),
+                        dimnames = list(ust, colnames(object$details$X_std)))
+    for (i in 1:S) {
+      if (sum(st == ust[i]) > 0) {
+        ist <- ust[i]
+        mean_t <- colMeans(object$details$X_std[st == ist & object$z == 1, , drop = FALSE])
+        mean_c <- colMeans(object$details$X_std[st == ist & object$z == 0, , drop = FALSE])
+        base_smds[i, ] <- mean_t-mean_c
+      } else {
+        base_smds[i, ] <- NA
+      }
+    }
+  } else {
+    base_smds <- NULL
+  }
+
+  if (!is.null(object$refined_strata)) {
+    st <- object$refined_strata
+    if(!is.factor(st)) {
+      st <- as.factor(st)
+    }
+    ust <- levels(st)
+    S <- length(ust)
+    refined_smds <- matrix(NA, nrow = S, ncol = ncol(object$details$X_std),
+                           dimnames = list(ust, colnames(object$details$X_std)))
+    for (i in 1:S) {
+      if (sum(st == ust[i], na.rm = TRUE) > 0) {
+        ist <- ust[i]
+        mean_t <- colMeans(object$details$X_std[st == ist & object$z == 1, , drop = FALSE])
+        mean_c <- colMeans(object$details$X_std[st == ist & object$z == 0, , drop = FALSE])
+        refined_smds[i, ] <- mean_t-mean_c
+      } else {
+        refined_smds[i, ] <- NA
+      }
+    }
+  } else {
+    refined_smds <- NULL
+  }
+
+  if (abs) {
+    if (!is.null(base_smds)) {
+      base_smds <- abs(base_smds)
+    }
+    if (!is.null(refined_smds)) {
+      refined_smds <- abs(refined_smds)
+    }
+  }
+
+  return(list("base" = base_smds, "refined" = refined_smds))
+}
diff --git a/R/data.R b/R/data.R
@@ -0,0 +1,51 @@
+#' Right Heart Catheterization Data
+#'
+#' The data in the example are from Frank Harrell's \code{Hmisc} package.
+#' The data there are very similar to the
+#' data in Connors et al. (1996), but do not exactly reproduce analyses from
+#' that article. So, we employ the version of that analysis in the
+#' documentation for Ruoqi Yu's \code{RBestMatch} package,
+#' which attempts to be close to the analysis in Connors et al.
+#' In Yu's version, the propensity score (her
+#' `pr`) is built using 76 covariates, and the focus of attention is on
+#' 26 "priority" covariates (her `X`) and the propensity score
+#' that were emphasized in the Connors et al. article, including those in that
+#' article's Table 3.
+#'
+#' @format Matrix with 5,735 rows and 28 columns:
+#' \describe{
+#'   \item{aps1}{APACHE score}
+#'   \item{surv2md1}{Support model estimate of the prob. of surviving 2 months}
+#'   \item{age}{Age}
+#'   \item{NumComorbid}{Number of comorbidities}
+#'   \item{adld3p_impute}{ADL with missing data imputed}
+#'   \item{adld3p_na}{ADL missing}
+#'   \item{das2d3pc}{DASI (Duke Activity Status Index)}
+#'   \item{temp1}{Temperature}
+#'   \item{hrt1}{Heart rate}
+#'   \item{meanbp1}{Mean blood pressure}
+#'   \item{resp1}{Respiratory rate}
+#'   \item{wblc1}{WBC}
+#'   \item{pafi1}{PaO2/FIO2 ratio}
+#'   \item{paco21}{PaCo2}
+#'   \item{ph1}{PH}
+#'   \item{crea1}{Creatinine}
+#'   \item{alb1}{Albumin}
+#'   \item{scoma1}{Glasgow Coma Score}
+#'   \item{cat1_copd}{Primary disease category COPD}
+#'   \item{cat1_mosfsep}{Primary disease category MOSF w sepsis}
+#'   \item{cat1_mosfmal}{Primary disease category MOSF w malignancy}
+#'   \item{cat1_chf}{Primary disease category CHF}
+#'   \item{cat1_coma}{Primary disease category coma}
+#'   \item{cat1_cirr}{Primary disease category cirrhosis}
+#'   \item{cat1_lung}{Primary disease category lung cancer}
+#'   \item{cat1_colon}{Primary disease category colon cancer}
+#'   \item{pr}{Propensity score using 76 covariates}
+#'   \item{z}{Treatment indicator}
+#' }
+#'
+#' @references Connors et al. (1996): The effectiveness of RHC in the initial
+#' care of critically ill patients. J American Medical Association 276:889-897.
+#' @references \url{https://hbiostat.org/data/}.
+
+"rhc_X"