Skip to content

Commit

Permalink
version 0.0.1
Browse files Browse the repository at this point in the history
  • Loading branch information
Harvey Klyne authored and cran-robot committed Sep 18, 2023
0 parents commit 73834a2
Show file tree
Hide file tree
Showing 47 changed files with 2,888 additions and 0 deletions.
21 changes: 21 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
Package: drape
Title: Doubly Robust Average Partial Effects
Version: 0.0.1
Authors@R: c(
person("Harvey", "Klyne", role = c("aut", "cre", "cph"), email = "hklyne@g.harvard.edu"))
Description: Doubly robust average partial effect estimation. This implementation contains methods for adding additional smoothness to plug-in regression procedures and for estimating score functions using smoothing splines. Details of the method can be found in Harvey Klyne and Rajen D. Shah (2023) <arXiv:2308.09207>.
License: MIT + file LICENSE
Encoding: UTF-8
BugReports: https://github.com/harveyklyne/drape/issues
Suggests: DoubleML, glmnet, graphics, hdi, knitr, Matrix, mlr3,
paradox, partykit, rjson, rmarkdown, testthat (>= 3.0.0),
xgboost
VignetteBuilder: knitr
Config/testthat/edition: 3
RoxygenNote: 7.1.2
NeedsCompilation: no
Packaged: 2023-09-17 00:36:38 UTC; Harvey
Author: Harvey Klyne [aut, cre, cph]
Maintainer: Harvey Klyne <hklyne@g.harvard.edu>
Repository: CRAN
Date/Publication: 2023-09-18 13:30:04 UTC
2 changes: 2 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
YEAR: 2023
COPYRIGHT HOLDER: drape authors
46 changes: 46 additions & 0 deletions MD5
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
279745416be714cba51dca4ed767255a *DESCRIPTION
833f66fc5437a9bfb6a440f0321a7bac *LICENSE
1875c5f9aad88810e4a8e6ebf8e47c69 *NAMESPACE
ec0019438fbd8a7effb88075153e5a73 *NEWS.md
1d9a29ee062e3d7cb7263f259adf8e29 *R/average_partial_effect.R
d2ccdab3f3958c865c23219560d67f00 *R/other_methods.R
3c195beb43319c7cd0a7a4341bc1aed8 *R/regression_functions.R
4550369a81a8bc4c0e38c663a3e8aee9 *R/resmooth.R
3a86cafbbde47f34f337335ee043d232 *R/score.R
7e8c75904a7730345ce58a30f5548b80 *R/simulation_settings.R
3204bb6b4e5c69af5fb0a80f00a8721d *README.md
731de9963fb32f455f89f48a43053693 *build/vignette.rds
0f2835c57f623fd7c44755d8bc72a72d *inst/doc/drape.R
e7dccb040c2078202ca075cdaef6c63c *inst/doc/drape.Rmd
4f57b1c556616e9b95ba4ed826aeb46c *inst/doc/drape.html
9ff4dfd7b52ef17fbc731d451a68954b *man/MC_sums.Rd
bc1e645361eef0debeefa38202388ee8 *man/basis_poly.Rd
74bf167f2265e75fa7ef72021624956a *man/compare.Rd
6a61cdd60c5b8b22c31467694ca5cea3 *man/compare_evaluate.Rd
6bd754eb69f0ca8df05b17348d186fd8 *man/compare_lm.Rd
996803563357e45c6f162d0432febc93 *man/compare_partially_linear.Rd
0f7738bb5d74bcc3aed7834f0f3508a0 *man/compare_rothenhausler.Rd
ebccdae8ba18ac48cdf97028592a713d *man/cv_resmooth.Rd
db6cddfb12784d55512d7bc90c94f828 *man/cv_spline_score.Rd
c28c754d75ef271763eb7f2d03aeffa7 *man/drape.Rd
5c43ee3f2909633a269ef48d63bb5ebb *man/fit_lasso_poly.Rd
3ff0540aab203589c3d153ad5cd63523 *man/fit_xgboost.Rd
998f1ef1a21d301940bd41ea3048c1b5 *man/mixture_score.Rd
f4690811424df26f5a1050307077afd2 *man/new_X.Rd
54d09c567ca1cf2086a211ae3b0bb9f6 *man/ng_pseudo_response.Rd
b6cbad85983066ac042b8c8418a0e779 *man/partially_linear.Rd
1e84ba25c5578081923de7cadf4f479b *man/resmooth.Rd
796c8a4549c9275060a98449c69bdf23 *man/rmixture.Rd
bda62e8e65f1c0f41a33cdc6441fedf9 *man/rothenhausler_basis.Rd
83c375fae1383784a5b720f4199c5246 *man/rothenhausler_yu.Rd
a7665e0b1a0ddfc0a64f6a9dfc75c9c7 *man/simulate_data.Rd
decdc0cad86696290903f3bca2c2075f *man/sort_bin.Rd
4a18660032e8b7affe736715089bc87d *man/spline_score.Rd
562211694a4908da8496def73b498c17 *man/z_correlated_normal.Rd
c8a3c33282b990b98157faa30ae2de8f *tests/testthat.R
4e24568fe403fe249e261ff6eb6865ca *tests/testthat/test_average_partial_effect.R
e7ce5a471977b9ae1885b95b5a320605 *tests/testthat/test_regression_functions.R
39585d5403ab73201616833e88f8716b *tests/testthat/test_resmooth.R
f403ff692a9a199730e6bfa0670ea6d6 *tests/testthat/test_score.R
043060cb8531a08de0f0d102ae1e8d95 *tests/testthat/test_simulation_settings.R
e7dccb040c2078202ca075cdaef6c63c *vignettes/drape.Rmd
10 changes: 10 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Generated by roxygen2: do not edit by hand

export(basis_poly)
export(cv_resmooth)
export(cv_spline_score)
export(drape)
export(ng_pseudo_response)
export(resmooth)
export(simulate_data)
export(spline_score)
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# drape 0.0.1

* Added a `NEWS.md` file to track changes to the package.
239 changes: 239 additions & 0 deletions R/average_partial_effect.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
drape <- function(y, x, z,
                  response_regression,
                  predictor_regression,
                  resmooth_bw = NULL,
                  spline_df = NULL,
                  nfolds = 5L,
                  foldid = NULL,
                  verbose = FALSE) {
  #' Estimate the doubly-robust average partial effect estimate of X on Y, in the presence of Z.
  #'
  #' @param y vector of responses.
  #' @param x vector of the predictor of interest.
  #' @param z matrix of additional predictors.
  #' @param response_regression function which takes input data of the form
  #' (X,y), where X=cbind(x,z), and returns a prediction function f:X -> y and optionally a
  #' similar derivative estimation function (in this case no resmoothing is done).
  #' @param predictor_regression function which takes input data of the form
  #' (z,x), and returns a prediction function m:z -> x.
  #' @param resmooth_bw optional numeric to be used as resmoothing bandwidth,
  #' otherwise chosen via cross-validation. Only used if
  #' response_regression doesn't predict derivatives.
  #' @param spline_df optional double, a smoothing parameter for the
  #' unconditional spline score estimator, corresponding to the effective degrees
  #' of freedom for a smoothing spline. If NULL, chosen via
  #' cross-validation.
  #' @param nfolds integer, number of sample-splits. If set to
  #' one, then all data is used for both training and evaluation.
  #' @param foldid optional vector with components in 1:nfolds indicating the folds in which
  #' each observation fell. Overwrites nfolds.
  #' @param verbose boolean controlling level of information outputted.
  #'
  #' @return list containing the average partial effect estimate and the
  #' corresponding standard error estimate. If verbose=TRUE, additionally contains
  #' variables used in computations.
  #'
  #' @export
  #'
  #' @examples
  #' set.seed(0)
  #' data <- simulate_data(200, "normal", "plm")
  #' response_regression <- function(X,y){
  #'     df <- data.frame(y,X)
  #'     colnames(df) <- c("y", paste0("X", 1:10))
  #'     lm1 <- stats::lm(y~X1+sin(X2), data=df)
  #'     fit <- function(newX){
  #'         newdf <- data.frame(newX)
  #'         colnames(newdf) <- paste0("X", 1:10)
  #'         return(as.vector(stats::predict(lm1, newdata=newdf)))}
  #'     return(list("fit"=fit))
  #' }
  #' predictor_regression <- function(z,x){
  #'     df <- data.frame(x,z)
  #'     colnames(df) <- c("x", paste0("Z", 1:9))
  #'     lm1 <- stats::lm(x~Z1+Z2, data=df)
  #'     fit <- function(newz){
  #'         newdf <- data.frame(newz)
  #'         colnames(newdf) <- paste0("Z", 1:9)
  #'         return(as.vector(stats::predict(lm1, newdata=newdf)))}
  #'     return(list("fit"=fit))
  #' }
  #' drape(data$y, data$x, data$z, response_regression, predictor_regression, nfolds=2)

  if (!is.vector(y)) {
    warning("y must be a vector. Updating y <- as.vector(y).")
    y <- as.vector(y)
  }

  # Converting (x,z) to matrix form for compatibility.
  X <- cbind(x, z)
  # The predictor of interest is always the first column of X.
  d <- 1

  n <- length(y)

  if (dim(X)[1] != n) {
    stop("Must have length(y) == dim(X)[1].")
  }

  # Sanity-check nfolds BEFORE it is used to construct folds. (Previously
  # these checks ran after the first call to sample(), so an invalid nfolds
  # was used once before being corrected.)
  if (nfolds < 1) {
    warning("nfolds must be a positive integer. Setting nfolds=1.")
    nfolds <- 1L
  }
  if (round(nfolds) != nfolds) {
    warning("nfolds must be an integer. Rounding.")
    nfolds <- round(nfolds)
  }

  # BUG FIX: foldid was previously re-sampled unconditionally after this
  # branch, silently discarding any user-supplied fold assignment. A supplied
  # foldid is now respected, and determines nfolds as documented.
  if (is.null(foldid)) {
    foldid <- sample(rep(seq(nfolds), length.out = n))
  } else {
    nfolds <- max(foldid)
  }

  if (verbose) {out_all <- list("folds" = foldid)}


  # Train regression procedures: one response fit and one predictor fit per
  # training fold (cross-fitting).

  regression_list <- list()
  predictor_list <- list()

  for (fold in seq(nfolds)) {

    te <- (foldid == fold)
    # With a single fold, train and evaluate on the same observations.
    if (nfolds > 1) {tr <- !te} else {tr <- te}
    train <- list("X" = matrix(X[tr, ], ncol = ncol(X)), "y" = y[tr])

    regression_list[[fold]] <- response_regression(train$X, train$y)
    predictor_list[[fold]] <- predictor_regression(train$X[, -d], train$X[, d])

  }

  # See if regression method produces usable derivative estimates on all folds;
  # if not, derivatives are obtained by resmoothing the fitted regression.
  do_resmooth <- !all(sapply(regression_list,
                             function(x) {"deriv_fit" %in% names(x)}))

  # Choose resmoothing bandwidth via cross-validation if not supplied.
  # (&& rather than &: both operands are scalars in an if() condition.)
  if (do_resmooth && is.null(resmooth_bw)) {
    cv_resmooth_output <- cv_resmooth(X = X,
                                      y = y,
                                      d = 1,
                                      regression = regression_list,
                                      tol = 2,
                                      prefit = TRUE,
                                      foldid = foldid)
    resmooth_bw <- cv_resmooth_output$bw_opt[1]
    if (verbose) {out_all["cv_resmooth_output"] = list(cv_resmooth_output)}
  }

  # Choose spline df (using in-sample residuals from whole data set).
  if (is.null(spline_df)) {
    m_fit <- predictor_regression(X[, -d], X[, d])$fit

    # Variance estimation: conditional variance of x given z modelled by a
    # regression tree on the squared residuals.
    resid <- X[, d] - m_fit(X[, -d])
    tree <- partykit::ctree(resid^2 ~ .,
                            data = data.frame("resid" = resid, "z" = X[, -d]))
    sigma <- sqrt(unname(stats::predict(tree,
                                        newdata = data.frame("z" = X[, -d]))))
    eps <- resid / sigma
    cv_spline_score_output <- cv_spline_score(x = eps)
    spline_df <- cv_spline_score_output$df_1se # Or df_min

    if (verbose) {out_all["cv_spline_score_output"] = list(cv_spline_score_output)}
  }

  # Evaluate f (regression), df (its partial derivative in x), and the
  # conditional score of x given z, on each held-out fold.
  f_pred <- rep(NA, n)
  df_pred <- rep(NA, n)
  score_pred <- rep(NA, n)
  for (fold in seq(nfolds)) {

    if (verbose) {out_fold = list()}

    te <- (foldid == fold)
    if (nfolds > 1) {tr <- !te} else {tr <- te}
    train <- list("X" = matrix(X[tr, ], ncol = ncol(X)), "y" = y[tr])
    test <- list("X" = matrix(X[te, ], ncol = ncol(X)), "y" = y[te])


    # # # f and df evaluation
    f_fit <- regression_list[[fold]]$fit

    if ("deriv_fit" %in% names(regression_list[[fold]])) {
      # Regression supplies its own derivative estimate: no resmoothing.
      f_pred[te] <- f_fit(test$X)
      df_fit <- regression_list[[fold]]$deriv_fit
      df_pred[te] <- df_fit(test$X)
    } else {
      # Differentiate a Gaussian-kernel-smoothed version of the fit.
      smooth_list <- resmooth(fit = f_fit,
                              X = test$X,
                              d = d,
                              bw = resmooth_bw)
      if (verbose) {out_fold["smooth_list"] <- list(smooth_list)}
      f_pred[te] <- smooth_list$pred[[1]]
      df_pred[te] <- smooth_list$deriv[[1]]
    }

    # # # Score evaluation
    m_fit <- predictor_list[[fold]]$fit

    # Variance estimation (tree fit on training residuals only).
    resid_train <- train$X[, d] - m_fit(train$X[, -d])
    tree <- partykit::ctree(resid_train^2 ~ .,
                            data = data.frame("resid_train" = resid_train,
                                              "z" = train$X[, -d]))
    if (verbose) {out_fold["tree"] <- list(tree)}
    sigma_train <- sqrt(unname(stats::predict(tree,
                                              newdata = data.frame("z" = train$X[, -d]))))

    # Use d (not a hard-coded 1) for consistency with the rest of the function.
    resid_test <- test$X[, d] - m_fit(test$X[, -d])
    sigma_test <- sqrt(unname(stats::predict(tree,
                                             newdata = data.frame("z" = test$X[, -d]))))

    # If any variance estimates are close to zero, assume homogeneous (more stable).
    if (any(sigma_test < 0.01)) {
      warning(paste0("Minimal heterogeneous variance equals ",
                     round(min(sigma_test), 4),
                     ", reducing to homogeneous case for stability."))
      sigma_train <- rep(sqrt(mean(resid_train^2)), length(train$y))
      sigma_test <- rep(sqrt(mean(resid_test^2)), length(test$y))
    }

    # Score estimation on standardised training residuals.
    eps_train <- resid_train / sigma_train
    rho <- spline_score(eps_train, df = spline_df)$rho
    if (verbose) {out_fold["rho"] <- list(rho)}

    eps_test <- resid_test / sigma_test
    rho_test <- rho(eps_test)

    # If any score estimates are large, assume Gaussian (more stable).
    if (any(abs(rho_test) > 10)) {
      warning(paste0("Maximal absolute spline score prediction equals ",
                     round(max(abs(rho_test)), 2),
                     ", reducing to Gaussian case for stability."))
      rho_test <- - eps_test
    }

    score_pred[te] <- rho_test / sigma_test # rho_P(x,z)=1/sigma(z) * rho_\varepsilon( (x-m(z))/sigma(z) )

    if (verbose) {
      str <- paste0("fold", fold)
      out_all[[str]] <- out_fold
    }

  } # End of fold

  # Doubly-robust influence-function evaluations; estimate is their mean,
  # standard error the usual sd/sqrt(n).
  evaluations <- df_pred - score_pred * (y - f_pred)
  est <- mean(evaluations)
  se <- stats::sd(evaluations) / sqrt(n)

  out <- list("est" = est,
              "se" = se)

  if (verbose) {
    out[["computations"]] <- out_all
    out[["f_pred"]] <- f_pred
    out[["df_deriv"]] <- df_pred
    out[["score_pred"]] <- score_pred
    out[["evaluations"]] <- evaluations
  }

  return(out)
}

0 comments on commit 73834a2

Please sign in to comment.