Skip to content

Commit

Permalink
version 0.0.1
Browse files Browse the repository at this point in the history
  • Loading branch information
Harvey Klyne authored and cran-robot committed Sep 18, 2023
0 parents commit 73834a2
Show file tree
Hide file tree
Showing 47 changed files with 2,888 additions and 0 deletions.
21 changes: 21 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
Package: drape
Title: Doubly Robust Average Partial Effects
Version: 0.0.1
Authors@R: c(
person("Harvey", "Klyne", role = c("aut", "cre", "cph"), email = "hklyne@g.harvard.edu"))
Description: Doubly robust average partial effect estimation. This implementation contains methods for adding additional smoothness to plug-in regression procedures and for estimating score functions using smoothing splines. Details of the method can be found in Harvey Klyne and Rajen D. Shah (2023) <arXiv:2308.09207>.
License: MIT + file LICENSE
Encoding: UTF-8
BugReports: https://github.com/harveyklyne/drape/issues
Suggests: DoubleML, glmnet, graphics, hdi, knitr, Matrix, mlr3,
paradox, partykit, rjson, rmarkdown, testthat (>= 3.0.0),
xgboost
VignetteBuilder: knitr
Config/testthat/edition: 3
RoxygenNote: 7.1.2
NeedsCompilation: no
Packaged: 2023-09-17 00:36:38 UTC; Harvey
Author: Harvey Klyne [aut, cre, cph]
Maintainer: Harvey Klyne <hklyne@g.harvard.edu>
Repository: CRAN
Date/Publication: 2023-09-18 13:30:04 UTC
2 changes: 2 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
YEAR: 2023
COPYRIGHT HOLDER: drape authors
46 changes: 46 additions & 0 deletions MD5
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
279745416be714cba51dca4ed767255a *DESCRIPTION
833f66fc5437a9bfb6a440f0321a7bac *LICENSE
1875c5f9aad88810e4a8e6ebf8e47c69 *NAMESPACE
ec0019438fbd8a7effb88075153e5a73 *NEWS.md
1d9a29ee062e3d7cb7263f259adf8e29 *R/average_partial_effect.R
d2ccdab3f3958c865c23219560d67f00 *R/other_methods.R
3c195beb43319c7cd0a7a4341bc1aed8 *R/regression_functions.R
4550369a81a8bc4c0e38c663a3e8aee9 *R/resmooth.R
3a86cafbbde47f34f337335ee043d232 *R/score.R
7e8c75904a7730345ce58a30f5548b80 *R/simulation_settings.R
3204bb6b4e5c69af5fb0a80f00a8721d *README.md
731de9963fb32f455f89f48a43053693 *build/vignette.rds
0f2835c57f623fd7c44755d8bc72a72d *inst/doc/drape.R
e7dccb040c2078202ca075cdaef6c63c *inst/doc/drape.Rmd
4f57b1c556616e9b95ba4ed826aeb46c *inst/doc/drape.html
9ff4dfd7b52ef17fbc731d451a68954b *man/MC_sums.Rd
bc1e645361eef0debeefa38202388ee8 *man/basis_poly.Rd
74bf167f2265e75fa7ef72021624956a *man/compare.Rd
6a61cdd60c5b8b22c31467694ca5cea3 *man/compare_evaluate.Rd
6bd754eb69f0ca8df05b17348d186fd8 *man/compare_lm.Rd
996803563357e45c6f162d0432febc93 *man/compare_partially_linear.Rd
0f7738bb5d74bcc3aed7834f0f3508a0 *man/compare_rothenhausler.Rd
ebccdae8ba18ac48cdf97028592a713d *man/cv_resmooth.Rd
db6cddfb12784d55512d7bc90c94f828 *man/cv_spline_score.Rd
c28c754d75ef271763eb7f2d03aeffa7 *man/drape.Rd
5c43ee3f2909633a269ef48d63bb5ebb *man/fit_lasso_poly.Rd
3ff0540aab203589c3d153ad5cd63523 *man/fit_xgboost.Rd
998f1ef1a21d301940bd41ea3048c1b5 *man/mixture_score.Rd
f4690811424df26f5a1050307077afd2 *man/new_X.Rd
54d09c567ca1cf2086a211ae3b0bb9f6 *man/ng_pseudo_response.Rd
b6cbad85983066ac042b8c8418a0e779 *man/partially_linear.Rd
1e84ba25c5578081923de7cadf4f479b *man/resmooth.Rd
796c8a4549c9275060a98449c69bdf23 *man/rmixture.Rd
bda62e8e65f1c0f41a33cdc6441fedf9 *man/rothenhausler_basis.Rd
83c375fae1383784a5b720f4199c5246 *man/rothenhausler_yu.Rd
a7665e0b1a0ddfc0a64f6a9dfc75c9c7 *man/simulate_data.Rd
decdc0cad86696290903f3bca2c2075f *man/sort_bin.Rd
4a18660032e8b7affe736715089bc87d *man/spline_score.Rd
562211694a4908da8496def73b498c17 *man/z_correlated_normal.Rd
c8a3c33282b990b98157faa30ae2de8f *tests/testthat.R
4e24568fe403fe249e261ff6eb6865ca *tests/testthat/test_average_partial_effect.R
e7ce5a471977b9ae1885b95b5a320605 *tests/testthat/test_regression_functions.R
39585d5403ab73201616833e88f8716b *tests/testthat/test_resmooth.R
f403ff692a9a199730e6bfa0670ea6d6 *tests/testthat/test_score.R
043060cb8531a08de0f0d102ae1e8d95 *tests/testthat/test_simulation_settings.R
e7dccb040c2078202ca075cdaef6c63c *vignettes/drape.Rmd
10 changes: 10 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Generated by roxygen2: do not edit by hand

export(basis_poly)
export(cv_resmooth)
export(cv_spline_score)
export(drape)
export(ng_pseudo_response)
export(resmooth)
export(simulate_data)
export(spline_score)
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# drape 0.0.1

* Added a `NEWS.md` file to track changes to the package.
239 changes: 239 additions & 0 deletions R/average_partial_effect.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
drape <- function(y, x, z,
                  response_regression,
                  predictor_regression,
                  resmooth_bw = NULL,
                  spline_df = NULL,
                  nfolds = 5L,
                  foldid = NULL,
                  verbose = FALSE) {
  #' Estimate the doubly-robust average partial effect estimate of X on Y, in the presence of Z.
  #'
  #' @param y vector of responses.
  #' @param x vector of the predictor of interest.
  #' @param z matrix of additional predictors.
  #' @param response_regression function which takes input data of the form
  #' (X,y), where X=cbind(x,z), and returns a prediction function f:X -> y and optionally a
  #' similar derivative estimation function (in this case no resmoothing is done).
  #' @param predictor_regression function which takes input data of the form
  #' (z,x), and returns a prediction function m:z -> x.
  #' @param resmooth_bw optional numeric to be used as resmoothing bandwidth,
  #' otherwise chosen via cross-validation. Only used if
  #' response_regression doesn't predict derivatives.
  #' @param spline_df optional double, a smoothing parameter for the
  #' unconditional spline score estimator, corresponding to the effective degrees
  #' of freedom for a smoothing spline. If NULL, chosen via
  #' cross-validation.
  #' @param nfolds integer, number of sample-splits. If set to
  #' one, then all data is used for both training and evaluation.
  #' @param foldid optional vector with components in 1:nfolds indicating the folds in which
  #' each observation fell. Overwrites nfolds.
  #' @param verbose boolean controlling level of information outputted.
  #'
  #' @return list containing the average partial effect estimate and the
  #' corresponding standard error estimate. If verbose=TRUE, additionally contains
  #' variables used in computations.
  #'
  #' @export
  #'
  #' @examples
  #' set.seed(0)
  #' data <- simulate_data(200, "normal", "plm")
  #' response_regression <- function(X,y){
  #'     df <- data.frame(y,X)
  #'     colnames(df) <- c("y", paste0("X", 1:10))
  #'     lm1 <- stats::lm(y~X1+sin(X2), data=df)
  #'     fit <- function(newX){
  #'         newdf <- data.frame(newX)
  #'         colnames(newdf) <- paste0("X", 1:10)
  #'         return(as.vector(stats::predict(lm1, newdata=newdf)))}
  #'     return(list("fit"=fit))
  #' }
  #' predictor_regression <- function(z,x){
  #'     df <- data.frame(x,z)
  #'     colnames(df) <- c("x", paste0("Z", 1:9))
  #'     lm1 <- stats::lm(x~Z1+Z2, data=df)
  #'     fit <- function(newz){
  #'         newdf <- data.frame(newz)
  #'         colnames(newdf) <- paste0("Z", 1:9)
  #'         return(as.vector(stats::predict(lm1, newdata=newdf)))}
  #'     return(list("fit"=fit))
  #' }
  #' drape(data$y, data$x, data$z, response_regression, predictor_regression, nfolds=2)

  if (!is.vector(y)) {
    warning("y must be a vector. Updating y <- as.vector(y).")
    y <- as.vector(y)
  }

  # Converting (x,z) to matrix form for compatibility.
  X <- cbind(x, z)
  # The predictor of interest is always the first column of X.
  d <- 1

  n <- length(y)

  if (dim(X)[1] != n) {
    stop("Must have length(y) == dim(X)[1].")
  }

  # Sanity-check nfolds BEFORE it is used to construct folds. (Previously
  # these checks ran after the first call to sample(), so an invalid nfolds
  # was used once before being corrected.)
  if (nfolds < 1) {
    warning("nfolds must be a positive integer. Setting nfolds=1.")
    nfolds <- 1L
  }
  if (round(nfolds) != nfolds) {
    warning("nfolds must be an integer. Rounding.")
    nfolds <- round(nfolds)
  }

  # BUG FIX: foldid was previously re-sampled unconditionally after this
  # branch, silently discarding any user-supplied fold assignment. A supplied
  # foldid is now respected, and determines nfolds as documented.
  if (is.null(foldid)) {
    foldid <- sample(rep(seq(nfolds), length.out = n))
  } else {
    nfolds <- max(foldid)
  }

  if (verbose) {out_all <- list("folds" = foldid)}


  # Train regression procedures: one response fit and one predictor fit per
  # training fold (cross-fitting).

  regression_list <- list()
  predictor_list <- list()

  for (fold in seq(nfolds)) {

    te <- (foldid == fold)
    # With a single fold, train and evaluate on the same observations.
    if (nfolds > 1) {tr <- !te} else {tr <- te}
    train <- list("X" = matrix(X[tr, ], ncol = ncol(X)), "y" = y[tr])

    regression_list[[fold]] <- response_regression(train$X, train$y)
    predictor_list[[fold]] <- predictor_regression(train$X[, -d], train$X[, d])

  }

  # See if regression method produces usable derivative estimates on all folds;
  # if not, derivatives are obtained by resmoothing the fitted regression.
  do_resmooth <- !all(sapply(regression_list,
                             function(x) {"deriv_fit" %in% names(x)}))

  # Choose resmoothing bandwidth via cross-validation if not supplied.
  # (&& rather than &: both operands are scalars in an if() condition.)
  if (do_resmooth && is.null(resmooth_bw)) {
    cv_resmooth_output <- cv_resmooth(X = X,
                                      y = y,
                                      d = 1,
                                      regression = regression_list,
                                      tol = 2,
                                      prefit = TRUE,
                                      foldid = foldid)
    resmooth_bw <- cv_resmooth_output$bw_opt[1]
    if (verbose) {out_all["cv_resmooth_output"] = list(cv_resmooth_output)}
  }

  # Choose spline df (using in-sample residuals from whole data set).
  if (is.null(spline_df)) {
    m_fit <- predictor_regression(X[, -d], X[, d])$fit

    # Variance estimation: conditional variance of x given z modelled by a
    # regression tree on the squared residuals.
    resid <- X[, d] - m_fit(X[, -d])
    tree <- partykit::ctree(resid^2 ~ .,
                            data = data.frame("resid" = resid, "z" = X[, -d]))
    sigma <- sqrt(unname(stats::predict(tree,
                                        newdata = data.frame("z" = X[, -d]))))
    eps <- resid / sigma
    cv_spline_score_output <- cv_spline_score(x = eps)
    spline_df <- cv_spline_score_output$df_1se # Or df_min

    if (verbose) {out_all["cv_spline_score_output"] = list(cv_spline_score_output)}
  }

  # Evaluate f (regression), df (its partial derivative in x), and the
  # conditional score of x given z, on each held-out fold.
  f_pred <- rep(NA, n)
  df_pred <- rep(NA, n)
  score_pred <- rep(NA, n)
  for (fold in seq(nfolds)) {

    if (verbose) {out_fold = list()}

    te <- (foldid == fold)
    if (nfolds > 1) {tr <- !te} else {tr <- te}
    train <- list("X" = matrix(X[tr, ], ncol = ncol(X)), "y" = y[tr])
    test <- list("X" = matrix(X[te, ], ncol = ncol(X)), "y" = y[te])


    # # # f and df evaluation
    f_fit <- regression_list[[fold]]$fit

    if ("deriv_fit" %in% names(regression_list[[fold]])) {
      # Regression supplies its own derivative estimate: no resmoothing.
      f_pred[te] <- f_fit(test$X)
      df_fit <- regression_list[[fold]]$deriv_fit
      df_pred[te] <- df_fit(test$X)
    } else {
      # Differentiate a Gaussian-kernel-smoothed version of the fit.
      smooth_list <- resmooth(fit = f_fit,
                              X = test$X,
                              d = d,
                              bw = resmooth_bw)
      if (verbose) {out_fold["smooth_list"] <- list(smooth_list)}
      f_pred[te] <- smooth_list$pred[[1]]
      df_pred[te] <- smooth_list$deriv[[1]]
    }

    # # # Score evaluation
    m_fit <- predictor_list[[fold]]$fit

    # Variance estimation (tree fit on training residuals only).
    resid_train <- train$X[, d] - m_fit(train$X[, -d])
    tree <- partykit::ctree(resid_train^2 ~ .,
                            data = data.frame("resid_train" = resid_train,
                                              "z" = train$X[, -d]))
    if (verbose) {out_fold["tree"] <- list(tree)}
    sigma_train <- sqrt(unname(stats::predict(tree,
                                              newdata = data.frame("z" = train$X[, -d]))))

    # Use d (not a hard-coded 1) for consistency with the rest of the function.
    resid_test <- test$X[, d] - m_fit(test$X[, -d])
    sigma_test <- sqrt(unname(stats::predict(tree,
                                             newdata = data.frame("z" = test$X[, -d]))))

    # If any variance estimates are close to zero, assume homogeneous (more stable).
    if (any(sigma_test < 0.01)) {
      warning(paste0("Minimal heterogeneous variance equals ",
                     round(min(sigma_test), 4),
                     ", reducing to homogeneous case for stability."))
      sigma_train <- rep(sqrt(mean(resid_train^2)), length(train$y))
      sigma_test <- rep(sqrt(mean(resid_test^2)), length(test$y))
    }

    # Score estimation on standardised training residuals.
    eps_train <- resid_train / sigma_train
    rho <- spline_score(eps_train, df = spline_df)$rho
    if (verbose) {out_fold["rho"] <- list(rho)}

    eps_test <- resid_test / sigma_test
    rho_test <- rho(eps_test)

    # If any score estimates are large, assume Gaussian (more stable).
    if (any(abs(rho_test) > 10)) {
      warning(paste0("Maximal absolute spline score prediction equals ",
                     round(max(abs(rho_test)), 2),
                     ", reducing to Gaussian case for stability."))
      rho_test <- - eps_test
    }

    score_pred[te] <- rho_test / sigma_test # rho_P(x,z)=1/sigma(z) * rho_\varepsilon( (x-m(z))/sigma(z) )

    if (verbose) {
      str <- paste0("fold", fold)
      out_all[[str]] <- out_fold
    }

  } # End of fold

  # Doubly-robust influence-function evaluations; estimate is their mean,
  # standard error the usual sd/sqrt(n).
  evaluations <- df_pred - score_pred * (y - f_pred)
  est <- mean(evaluations)
  se <- stats::sd(evaluations) / sqrt(n)

  out <- list("est" = est,
              "se" = se)

  if (verbose) {
    out[["computations"]] <- out_all
    out[["f_pred"]] <- f_pred
    out[["df_deriv"]] <- df_pred
    out[["score_pred"]] <- score_pred
    out[["evaluations"]] <- evaluations
  }

  return(out)
}

0 comments on commit 73834a2

Please sign in to comment.