Skip to content

Commit

Permalink
version 1.0.0
Browse files Browse the repository at this point in the history
  • Loading branch information
kolesarm authored and cran-robot committed Mar 23, 2024
0 parents commit 9b56813
Show file tree
Hide file tree
Showing 58 changed files with 5,972 additions and 0 deletions.
38 changes: 38 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
Package: RDHonest
Title: Honest Inference in Regression Discontinuity Designs
Version: 1.0.0
Authors@R:
c(person(given = "Michal",
family = "Kolesár",
role = c("aut", "cre", "cph"),
email = "kolesarmi@googlemail.com",
comment = c(ORCID = "0000-0002-2482-7796")),
person(given= "Tim",
family = "Armstrong",
role = "ctb",
email = "timothy.armstrong@usc.edu"))
Description: Honest and nearly-optimal confidence intervals in fuzzy and sharp
regression discontinuity designs and for inference at a point based on local
linear regression. The implementation is based on Armstrong and Kolesár (2018)
<doi:10.3982/ECTA14434>, and Kolesár and Rothe (2018)
<doi:10.1257/aer.20160945>. Supports covariates, clustering, and weighting.
Depends: R (>= 4.3.0)
License: GPL-3
Encoding: UTF-8
LazyData: true
Imports: stats, Formula, withr
Suggests: spelling, ggplot2, testthat, knitr, rmarkdown, formatR
Config/testthat/edition: 3
Language: en-US
URL: https://github.com/kolesarm/RDHonest
BugReports: https://github.com/kolesarm/RDHonest/issues
RoxygenNote: 7.3.1
VignetteBuilder: knitr
NeedsCompilation: no
Packaged: 2024-03-22 00:10:30 UTC; kolesarm
Author: Michal Kolesár [aut, cre, cph]
(<https://orcid.org/0000-0002-2482-7796>),
Tim Armstrong [ctb]
Maintainer: Michal Kolesár <kolesarmi@googlemail.com>
Repository: CRAN
Date/Publication: 2024-03-22 20:00:02 UTC
57 changes: 57 additions & 0 deletions MD5
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
d2925759b43b6f25bf33220e954072a2 *DESCRIPTION
b13372954d71b647d12b8606730d8c53 *NAMESPACE
bd3c733f79f2e99c092844ecd2ad62c8 *NEWS.md
015afbcdfffb9c5d6012b9d66cafda12 *R/Cbound.R
a24e04f58c717f6c1c0a48305c919df3 *R/NPRfunctions.R
b31d30125c5785c24442eb047da32ffd *R/RDHonest.R
6ff45e367363e17b2c02c3691fd803d6 *R/RD_bme.R
bb99dc6d13229fed5c9955a993d99104 *R/RD_opt.R
6a41d0f0e0ab7d7d1607c706a3d40b1c *R/checks.R
b03c1413bf83b493b028bc25c77172b7 *R/cvb.R
f038ccf47cacdbe82ca04040f364c90b *R/documentation.R
e890fdac65a0b1f139bb1229adbf49a8 *R/kernels.R
d6701934e90df5b7cfcf9ddab81edcc0 *R/plots.R
b4e942a0de189d436c015ec07a0fba28 *R/prelim_var.R
2a6a0ad2ce8fdf41853ab58e14419b89 *R/sysdata.rda
ff4c677d1e41a2fdc44263746a2a3de2 *R/utils.R
7080893e02c49cd296d4424b9be55069 *build/partial.rdb
5e0b3c3f604090d949b3a6c74a504f5b *build/vignette.rds
6b3ec335cab7402aa786e91d6f9d6bfa *data/cghs.rda
f0567c81b282df2910c4d7cd1e2dcf96 *data/headst.rda
6838b6a056a61ff55a572c088cf00fd9 *data/lee08.rda
8a8e79d9d25075f1ebad7329cc94f8ee *data/rcp.rda
49125a52ec7640835ee69823066547e5 *data/rebp.rda
a2c3f46c3ffee8040ff755211f87866a *inst/WORDLIST
c42de70ef3bd200609204f1e3d0d908d *inst/doc/RDHonest.R
7a5b45ae730db8773be4a30fb8ae05e3 *inst/doc/RDHonest.Rmd
1f9a45641ed03d9659a916d9d3dc6168 *inst/doc/RDHonest.pdf
85daf08fe9d4a72362964e1cabc6e27b *man/CVb.Rd
8bf4b8167fde615012193992f7631b9a *man/RDHonest.Rd
a53f6886201ccdbf6cb87147856ec1a1 *man/RDHonestBME.Rd
ae60c195e1565df43c543981d91fe9cf *man/RDScatter.Rd
f4a0db0b7307c86f9563fb58f99e9499 *man/RDSmoothnessBound.Rd
843a0cb71fa8bac7e74c2b471a42258f *man/RDTEfficiencyBound.Rd
fe719141f4c4e0a55af182867f9e1bcc *man/cghs.Rd
eca050783faa7c37237742a877f5c167 *man/headst.Rd
3b30d3b653d61da288e883466882c2d9 *man/lee08.Rd
80303d476b9263f6b3d97671cad1fb95 *man/rcp.Rd
56fcffd527d36a7acb4c257060738e23 *man/rebp.Rd
cfd29e708c87a72ad33acf76570706ba *tests/spelling.R
8a0c187fd1f5bc4d456349a41111eb80 *tests/testthat.R
b5007d937f77bb59056b74a7b683bf3c *tests/testthat/test_cbound.R
782beaff44c5394b665c532cd49e4918 *tests/testthat/test_clustering.R
2eb40308400e1fc4c936aa830d64d9c7 *tests/testthat/test_covariates.R
f43ad1d6d638150586538058e560264d *tests/testthat/test_cv.R
d5b1f49e836e4cce16de56ebccc75d83 *tests/testthat/test_frd.R
cda007da6de166093c5618e254342a8e *tests/testthat/test_interface.R
a1097b9d88e33505ca51cc0fc477a1d3 *tests/testthat/test_kernels.R
7d3cfc260fdbe50a3f0b9de88318afbb *tests/testthat/test_lpp.R
79a1ef15cd2d987e94bdf8c1f12bf714 *tests/testthat/test_npr.R
3c30605df610be90035c23645d2ea7fe *tests/testthat/test_rd.R
a2f53e20545716f7bc3f32917aada1c5 *tests/testthat/test_weights.R
7a5b45ae730db8773be4a30fb8ae05e3 *vignettes/RDHonest.Rmd
7478e9ef79105fe2dee22f16146ac142 *vignettes/auto/RDHonest.el
56563d7354840f54f2b5c9da89509755 *vignettes/auto/mk_Rpackage_template.el
24bf16aa0bac0fd47f4c071c5b74a479 *vignettes/auto/np-testing-library.el
df413b8a601ffb22f468a9237fdfdf42 *vignettes/np-testing-library.bib
9bdf483a6bb9aec73d32a926b889030d *vignettes/vignette_head.tex
9 changes: 9 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Generated by roxygen2: do not edit by hand

S3method(print,RDResults)
export(CVb)
export(RDHonest)
export(RDHonestBME)
export(RDScatter)
export(RDSmoothnessBound)
export(RDTEfficiencyBound)
21 changes: 21 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# RDHonest 1.0.0

## New Features

- The function `RDHonest` computes estimates and confidence intervals for the
regression discontinuity (RD) parameter in sharp and fuzzy designs. It
supports covariates, clustering, and weighting. Confidence intervals are
honest (or bias-aware), with critical values computed using the `CVb`
function. Worst-case bias of the estimator is computed under either the Taylor
or Hölder smoothness class.
- `RDHonestBME` computes confidence intervals in sharp RD designs with discrete
  covariates under the assumption that the conditional mean lies in
the "bounded misspecification error" class of functions, as considered in
[Kolesár and Rothe (2018)](https://doi.org/10.1257/aer.20160945).
- Support for plotting the data is provided by the function `RDScatter`.
- The function `RDSmoothnessBound` computes a lower bound on the smoothness
constant `M`, used as a parameter by `RDHonest` to calculate the worst-case
  bias of the estimator.
- The function `RDTEfficiencyBound` calculates efficiency of minimax one-sided
CIs at constant functions, or efficiency of two-sided fixed-length CIs at
constant functions under second-order Taylor smoothness class.
133 changes: 133 additions & 0 deletions R/Cbound.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
#' Lower bound on smoothness constant M in sharp RD designs
#'
#' Estimate a lower bound on the smoothness constant M and provide a lower
#' confidence interval for it, using method described in supplement to
#' Kolesár and Rothe (2018).
#'
#' @param object An object of class \code{"RDResults"}, typically a result of a
#'     call to \code{\link{RDHonest}}.
#' @param s Number of support points that curvature estimates should average
#'     over.
#' @param sclass Smoothness class, either \code{"T"} for Taylor or \code{"H"}
#'     for Hölder class.
#' @param alpha determines confidence level \code{1-alpha}.
#' @param separate If \code{TRUE}, report estimates separately for data above
#'     and below cutoff. If \code{FALSE}, report pooled estimates.
#' @param multiple If \code{TRUE}, use multiple curvature estimates. If
#'     \code{FALSE}, only use a single curvature estimate using observations
#'     closest to the cutoff.
#' @return Returns a data frame with the following columns:
#'
#' \describe{
#'
#' \item{\code{estimate}}{Point estimate for lower bounds for M. }
#'
#' \item{\code{conf.low}}{Lower endpoint for a one-sided confidence interval
#' for M}
#'
#' }
#'
#' The data frame has a single row if \code{separate==FALSE}; otherwise it has
#' two rows, corresponding to smoothness bound estimates and confidence
#' intervals below and above the cutoff, respectively.
#' @references{
#'
#' \cite{Michal Kolesár and Christoph Rothe. Inference in regression
#' discontinuity designs with a discrete running variable.
#' American Economic Review, 108(8):2277—-2304,
#' August 2018. \doi{10.1257/aer.20160945}}
#'
#' }
#' @examples
#' ## Subset data to increase speed
#' r <- RDHonest(log(earnings)~yearat14, data=cghs,
#'               subset=abs(yearat14-1947)<10,
#'               cutoff=1947, M=0.04, h=3)
#' RDSmoothnessBound(r, s=2)
#' @export
RDSmoothnessBound <- function(object, s, separate=FALSE, multiple=TRUE,
                              alpha=0.05, sclass="H") {
    ## Preliminary (EHW) variance estimates; d$sigma2 feeds the variance of
    ## the curvature estimates below
    d <- PrelimVar(object$data, se.initial="EHW")

    ## Curvature estimate based on jth set of three points closest to zero
    Dk <- function(Y, X, xu, s2, j) {
        ## Partition the jth group of 3*s support points (xu is sorted,
        ## distances from cutoff) into three consecutive cells of s points
        I1 <- X >= xu[3*j*s-3*s+1] & X <= xu[3*j*s-2*s]
        I2 <- X >= xu[3*j*s-2*s+1] & X <= xu[3*j*s-s]
        I3 <- X >= xu[3*j*s- s+1] & X <= xu[3*j*s]

        ## lam chosen so lam*mean(X[I1]) + (1-lam)*mean(X[I3]) = mean(X[I2]),
        ## i.e. the middle cell is a convex combination of the outer cells
        lam <- (mean(X[I3])-mean(X[I2])) / (mean(X[I3])-mean(X[I1]))
        ## Denominator differs by smoothness class (Taylor vs Hölder)
        if (sclass=="T") {
            den <- (1-lam)*mean(X[I3]^2) + lam*mean(X[I1]^2) + mean(X[I2]^2)
        } else {
            den <- (1-lam)*mean(X[I3]^2) + lam*mean(X[I1]^2) - mean(X[I2]^2)
        }
        ## Delta is lower bound on M by lemma S2 in Kolesar and Rothe
        Del <- 2 * (lam*mean(Y[I1]) + (1-lam) * mean(Y[I3])-mean(Y[I2])) / den
        ## Variance of Delta
        VD <- 4 * (lam^2*mean(s2[I1])/sum(I1) + (1-lam)^2*mean(s2[I3]) /
                       sum(I3) + mean(s2[I2])/sum(I2)) / den^2
        ## Return estimate, sd, cell means of Y, and cell ranges of X
        ## (length 11; the ranges serve as diagnostics)
        c(Del, sqrt(VD), mean(Y[I1]), mean(Y[I2]), mean(Y[I3]), range(X[I1]),
          range(X[I2]), range(X[I3]))
    }

    ## Support points above (xp) and below (xm, in absolute value) the cutoff
    xp <- unique(d$X[d$p])
    xm <- sort(unique(abs(d$X[d$m])))

    Dpj <- function(j) Dk(d$Y[d$p], d$X[d$p], xp, d$sigma2[d$p], j)
    Dmj <- function(j) Dk(d$Y[d$m], abs(d$X[d$m]), xm, d$sigma2[d$m], j)

    ## Number of complete groups of 3*s support points on each side
    Sp <- floor(length(xp) / (3*s))
    Sm <- floor(length(xm) / (3*s))
    if (min(Sp, Sm) == 0)
        stop("Value of s is too big")

    ## With multiple=FALSE, use only the group closest to the cutoff
    if (!multiple) {
        Sp <- Sm <- 1
    }
    Dp <- vapply(seq_len(Sp), Dpj, numeric(11))
    Dm <- vapply(seq_len(Sm), Dmj, numeric(11))

    ## Critical value
    cv <- function(M, Z, sd, alpha) {
        if (ncol(Z)==1) {
            ## Single estimate: use CVb directly
            CVb(M/sd, alpha=alpha)
        } else {
            ## Multiple estimates: simulate quantile of the max statistic
            S <- Z+M*outer(rep(1, nrow(Z)), 1/sd)
            maxS <- abs(S[cbind(seq_len(nrow(Z)), max.col(S))])
            unname(stats::quantile(maxS, 1-alpha))
        }
    }

    ## Invert the max-t statistic to get a point estimate and lower CI for M
    hatM <- function(D) {
        ts <- abs(D[1, ]/D[2, ]) # sup_t statistic
        maxt <- D[, which.max(ts)]

        ## Simulation draws for the critical-value function cv()
        Z <- matrix(stats::rnorm(10000*ncol(D)), ncol=ncol(D))
        ## median unbiased point estimate and lower CI
        hatm <- lower <- 0
        if (max(ts) > cv(0, Z, D[2, ], 1/2)) {
            hatm <- FindZero(function(m) max(ts)-cv(m, Z, D[2, ], 1/2),
                             negative=FALSE)
        }
        if (max(ts) >= cv(0, Z, D[2, ], alpha)) {
            lower <- FindZero(function(m) max(ts)-cv(m, Z, D[2, ], alpha),
                              negative=FALSE)
        }
        list(estimate=hatm, conf.low=lower,
             diagnostics=c(Delta=maxt[1], sdDelta=maxt[2],
                           y1=maxt[3], y2=maxt[4], y3=maxt[5],
                           I1=maxt[6:7], I2=maxt[8:9], I3=maxt[10:11]))
    }

    ## Fix the seed so the simulated critical values are reproducible
    if (separate) {
        withr::with_seed(42, po <- hatM(Dp))
        withr::with_seed(42, ne <- hatM(Dm))
        ret <- data.frame(rbind("Below cutoff"=unlist(ne[1:2]),
                                "Above cutoff"=unlist(po[1:2])))
    } else {
        ret <- withr::with_seed(42,
                                data.frame((hatM(cbind(Dm, Dp))[1:2])))
        rownames(ret) <- c("Pooled")
    }
    ret
}
116 changes: 116 additions & 0 deletions R/NPRfunctions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
## Nonparametric Regression
##
## Calculate fuzzy or sharp RD estimate, or estimate of a conditional mean at a
## point (depending on the class of \code{d}), and its variance using local
## polynomial regression of order \code{order}.
##
## d          Data object; its class ("IP", "FRD", or other) selects the
##            design. Fields used: X, Y, w, covs, p, m, sigma2, clusterid, rho.
## h          Bandwidth; h<=0 yields all-zero kernel weights.
## kern       Kernel name passed to EqKern, or a kernel function itself.
## order      Order of the local polynomial.
## se.method  "nn" (nearest-neighbor), "EHW", or "supplied.var".
## J          Number of nearest neighbors for se.method="nn".
##
## Returns a list: estimate, se, est_w (estimation weights), sigma2
## (variance estimates), eff.obs, fs (first stage, FRD only), lm (fit).
NPReg <- function(d, h, kern="triangular", order=1, se.method="nn", J=3) {
    ## Resolve kernel name to a kernel function if needed
    if (!is.function(kern))
        kern <- EqKern(kern, boundary=FALSE, order=0)
    ## Keep only positive kernel weights
    X <- drop(d$X)
    W <- if (h<=0) 0*X else kern(X/h)*d$w # kernel weights
    ## if (!is.null(d$sigma2)) d$sigma2 <- as.matrix(d$sigma2)
    ## Polynomial design matrix 1, X, ..., X^order, with readable names
    Z <- outer(X, 0:order, "^")
    nZ <- c("(Intercept)", colnames(d$X), paste0("I(", colnames(d$X), "^",
                                                 seq_len(order), ")")[-1])
    colnames(Z) <- nZ[seq_len(order+1)]
    if (!inherits(d, "IP")) {
        ## RD case: interact the polynomial with the above-cutoff indicator
        ## so the first coefficient is the jump at the cutoff; append any
        ## covariates
        ZZ <- (X>=0)*Z
        colnames(ZZ) <- c(paste0("I(", colnames(d$X), ">0)"),
                          paste0(paste0("I(", colnames(d$X), ">0):"),
                                 colnames(Z))[-1])
        Z <- cbind(ZZ, Z, d$covs)
    }
    r0 <- stats::lm.wfit(x=Z, y=d$Y, w=W)
    class(r0) <- c(if (ncol(d$Y)>1) "mlm", "lm")
    ## Rank-deficient fit: signal failure with a degenerate result
    if (any(is.na(r0$coefficients))) {
        return(list(estimate=0, se=NA, est_w=W*0, sigma2=NA*d$Y, eff.obs=0,
                    fs=NA, lm=r0))
    }
    ## Recover the linear estimation weights for the first coefficient from
    ## the QR decomposition of the weighted fit
    wgt <- W*0
    ok <- W!=0
    wgt[ok] <- solve(qr.R(r0$qr), t(sqrt(W[W>0])*qr.Q(r0$qr)))[1, ]
    ## To compute effective observations, rescale against uniform kernel
    Wu <- d$w * (abs(X)<=h)
    q_u <- qr(sqrt(Wu) * Z)
    wgt_u <- solve(qr.R(q_u), t(sqrt(Wu) * qr.Q(q_u)))[1, ]
    eff.obs <- sum(Wu)*sum(wgt_u^2/d$w)/sum(wgt^2/d$w)

    ny <- NCOL(r0$residuals)
    ## Squared residuals, allowing for multivariate Y
    HC <- function(r) r[, rep(seq_len(ny), each=ny)] * r[, rep(seq_len(ny), ny)]

    ## For RD, compute variance separately on either side of cutoff
    ## TODO: better implement
    NN <- function(X) {
        res <- matrix(0, nrow=length(X), ncol=ny^2)
        res[ok] <-
            if (!inherits(d, "IP"))
                rbind(as.matrix(sigmaNN(X[d$m & ok], d$Y[d$m & ok, ], J,
                                        d$w[d$m & ok])),
                      as.matrix(sigmaNN(X[d$p & ok], d$Y[d$p & ok, ], J,
                                        d$w[d$p & ok])))
            else
                sigmaNN(X[ok], d$Y[ok, ], J, d$w[ok])
        res
    }

    ## Conditional variance estimates according to se.method
    hsigma2 <- switch(se.method, nn=NN(X), EHW=HC(as.matrix(r0$residuals)),
                      supplied.var=as.matrix(d$sigma2))

    ## Variance
    if (is.null(d$clusterid)) {
        V <- colSums(as.matrix(wgt^2 * hsigma2))
    } else if (se.method == "supplied.var") {
        ## Supplied variance plus a cluster correction using d$rho
        V <- colSums(as.matrix(wgt^2 * hsigma2))+
            d$rho * (sum(tapply(wgt, d$clusterid, sum)^2)-sum(wgt^2))
    } else {
        ## Cluster-robust variance: sum weighted residuals within clusters
        us <- apply(as.matrix(wgt*r0$residuals)[ok, , drop=FALSE], 2,
                    function(x) tapply(x, d$clusterid[ok], sum))
        V <- as.vector(crossprod(us))
    }
    ret <- list(estimate=r0$coefficients[1], se=sqrt(V[1]), est_w=wgt,
                sigma2=hsigma2, eff.obs=eff.obs, fs=NA, lm=r0)

    if (inherits(d, "FRD")) {
        ## Fuzzy RD: ratio of reduced-form to first-stage jump, with
        ## delta-method standard error
        ret$fs <- r0$coefficients[1, 2]
        ret$estimate <- r0$coefficients[1, 1]/r0$coefficients[1, 2]
        names(ret$estimate) <- colnames(r0$coefficients)[2]
        ret$se <- sqrt(sum(c(1, -ret$estimate, -ret$estimate,
                             ret$estimate^2) * V) / ret$fs^2)
    }
    ret
}


## Rule of thumb for choosing M
##
## Bound the second derivative via a global quartic regression, for inference
## under a second-order Hölder class. For RD designs, run a separate
## regression on each side of the cutoff and take the larger bound.
MROT <- function(d) {
    if (inherits(d, "SRD")) {
        ## Sharp RD: maximum of the one-sided bounds above/below the cutoff
        above <- MROT(data.frame(Y=d$Y[d$p], X=d$X[d$p], w=d$w[d$p]))
        below <- MROT(data.frame(Y=d$Y[d$m], X=d$X[d$m], w=d$w[d$m]))
        max(above, below)
    } else if (inherits(d, "FRD")) {
        ## Fuzzy RD: separate bounds for outcome (column 1) and treatment
        ## (column 2), each maximized over the two sides of the cutoff
        side_max <- function(col) {
            max(MROT(data.frame(Y=d$Y[d$p, col], X=d$X[d$p], w=d$w[d$p])),
                MROT(data.frame(Y=d$Y[d$m, col], X=d$X[d$m], w=d$w[d$m])))
        }
        c(MY=side_max(1), MD=side_max(2))
    } else {
        ## STEP 1: Estimate global polynomial regression
        basis <- outer(drop(d$X), 0:4, "^")
        r1 <- unname(stats::lm.wfit(y=d$Y, x=basis, w=d$w)$coefficients)
        if (length(unique(d$X)) < 4 || any(is.na(r1)))
            stop(paste0("Insufficient unique values of the running",
                        " variable to compute rule of thumb for M."))
        ## Absolute second derivative of the fitted quartic
        f2 <- function(x) abs(2*r1[3] + 6*x*r1[4] + 12*x^2*r1[5])
        ## Maximum occurs either at the endpoints of the support, or at the
        ## interior extremum -r1[4]/(4*r1[5]) when the quartic term is nonzero
        crit <- if (abs(r1[5]) <= 1e-10) Inf else -r1[4] / (4*r1[5])
        lo <- min(d$X)
        hi <- max(d$X)
        M <- max(f2(lo), f2(hi))
        if (lo < crit && hi > crit) M <- max(M, f2(crit))

        M
    }
}

0 comments on commit 9b56813

Please sign in to comment.