version 1.0

cran · Jun 14, 2016 · 425ffff · 425ffff
commit 425ffff
Show file tree

Hide file tree

Showing 13 changed files with 608 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,19 @@
+Package: perccal
+Type: Package
+Title: Implementing Double Bootstrap Linear Regression Confidence
+        Intervals Using the 'perc-cal' Method
+Version: 1.0
+Date: 2016-06-14
+Authors@R: c( person("Daniel", "McCarthy", role=c("aut","cre","ctb"), 
+	            email="danielmc@wharton.upenn.edu"))
+Maintainer: Daniel McCarthy <danielmc@wharton.upenn.edu>
+URL: http://www.danielminhmccarthy.com
+Description: Contains functions which allow the user to compute confidence intervals quickly using the double bootstrap-based percentile calibrated ('perc-cal') method for linear regression coefficients. 'perccal_interval()' is the primary user-facing function within this package.
+License: GPL-3
+Imports: Rcpp (>= 0.11.5)
+LinkingTo: Rcpp, RcppArmadillo, RcppEigen
+NeedsCompilation: yes
+Packaged: 2016-06-14 17:32:40 UTC; danielmc
+Author: Daniel McCarthy [aut, cre, ctb]
+Repository: CRAN
+Date/Publication: 2016-06-14 20:34:32
diff --git a/MD5 b/MD5
@@ -0,0 +1,12 @@
+7137aa586945f221d985b0edff0f7123 *DESCRIPTION
+6ebd1c9a850db60126cfd6eea1bd0590 *NAMESPACE
+00986ec0fd6d81ef72f0ce7c715687b4 *R/RcppExports.R
+1c05abeb19c8d7d18e49e094e7fa3f34 *R/functions-code.R
+344a5a9282cf7f9e633a3514bb0ff603 *man/Cdboot_multi.Rd
+5923466962ecbb1553db9fbc28b0ee53 *man/Cquantile.Rd
+eab124c5da47cba00b908229d360fad6 *man/perccal-package.Rd
+d1c1e0a9d3ea030ca2bd0f1354c1c445 *man/perccal_interval.Rd
+1fda748862ea4c31684cf2e6a89bc93f *man/sample_rcpp.Rd
+899e256fbab14d837ea44d49b1c974b9 *src/Makevars
+fa318843a746835da902d47bc8eb2970 *src/RcppExports.cpp
+309ea63b5fe6fc65d4eea4484214fb06 *src/dboot_multi.cpp
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,6 @@
+useDynLib(perccal)
+importFrom(Rcpp, evalCpp)
+importFrom("stats", "coefficients")
+importFrom("stats", "quantile")
+importFrom("stats", "lm")
+exportPattern("^[[:alpha:]]+")
diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -0,0 +1,15 @@
+# This file was generated by Rcpp::compileAttributes
+# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+sample_rcpp <- function(N, nsamp) {
+    .Call('perccal_sample_rcpp', PACKAGE = 'perccal', N, nsamp)
+}
+
+Cquantile <- function(xx, p) {
+    .Call('perccal_Cquantile', PACKAGE = 'perccal', xx, p)
+}
+
+Cdboot_multi <- function(xxyy, lgridlo, lgridhi, B, B2, G) {
+    .Call('perccal_Cdboot_multi', PACKAGE = 'perccal', xxyy, lgridlo, lgridhi, B, B2, G)
+}
+
diff --git a/R/functions-code.R b/R/functions-code.R
@@ -0,0 +1,158 @@
+##################################################
+### Percentile-Calibrated ('perc-cal') Confidence Interval Functions
+
+library(RcppEigen)
+library(Rcpp)
+library(RcppArmadillo)
+
+##' Calculates Percentile-Calibrated Linear Regression Confidence Intervals
+##'
+##' This is the main function of the package.  It takes as inputs the 
+##' predictor/response matrix appended together, which can be either
+##' a data frame or a matrix, along with the desired coverage and other
+##' settings, and outputs marginal confidence intervals for each of the 
+##' predictors, including the intercept.
+##' @name perccal_interval
+##' @param Xy [n by (p+1)] matrix: X in columns 1 to p, y in column p+1. 
+##' X is the design matrix, and is assumed to not include a vector of one's.
+##' @param alpha Target coverage desired.
+##' @param G Number of grid points to evaluate calibrated percentile method 
+##' on each side over.
+##' @param B Number of 1st stage bootstrap samples.
+##' @param B2 Number of 2nd stage double bootstrap samples.
+##' @return Return a (p+1)x2 matrix containing confidence intervals for all 
+##' regression coefficients, estimated via the perc-cal method.
+##' @examples 
+##' set.seed(1234)
+##' n = 32
+##' B = 500
+##' B2 = 500
+##' G=20
+##' x1=rnorm(n)
+##' x2=rnorm(n)
+##' eps=rnorm(n)
+##' y = x1 + 2*x2 + eps
+##' Xy = cbind(x1,x2,y)
+##' alpha = .025	
+##' perccal_interval(Xy, alpha, G, B, B2)
+perccal_interval  = function(Xy, alpha, G = 20, B = 999, B2 = 999){
+	# Inputs: 
+	# Xy: [n by (p+1)] matrix: X in columns 1 to p, y in column p+1 
+				#(please do not include the intercept)
+	# alpha: Target coverage desired.
+	# G    : # of grid points to evaluate calibrated percentile method on each side.
+	# B    : # of 1st stage bootstrap samples
+	# B2   : # of 2nd stage bootstrap samples
+	# Output: 
+	# Confidence interval for all regression coefficients, estimated via perc-cal method.
+
+	# Initialize all the matrices and vectors to be used below.
+	p = ncol(Xy)-1
+	theta.hat.boot = matrix(NA,nrow=p+1,ncol=B)
+	theta.hat.dboot = matrix(NA,nrow=p+1,ncol=B2)
+	theta.qtl.lgrid.lo = array(NA, dim=c(p+1,B,G))
+	theta.qtl.lgrid.hi = array(NA, dim=c(p+1,B,G))
+	perc.cal.ints = matrix(NA,nrow=p+1,ncol=2)
+
+	l.grid.lo = seq(.0001, alpha*1.4, length.out=G) 
+	l.grid.hi = 1-l.grid.lo
+
+	# Start main code
+	X=as.matrix(Xy[,1:p])
+	Xm = cbind(1,X)
+	y=Xy[,p+1]
+	regr_output = coefficients(summary(lm(y~X)))
+	prednames = rownames(regr_output)
+	theta.hat = as.numeric(regr_output[,"Estimate"])
+
+	boot_output = Cdboot_multi(as.matrix(Xy), l.grid.lo, l.grid.hi, B, B2, G)
+
+	for(k in 1:(p+1)){
+		theta.hat.boot	= boot_output$theta_hat_boot[,k]
+		theta.qtl.lgrid.lo = boot_output$theta_qtl_lgrid_lo[k,1][[1]][,,1]
+		theta.qtl.lgrid.hi = boot_output$theta_qtl_lgrid_hi[k,1][[1]][,,1]
+
+		# perc-cal method
+		log.lgrid.lo = theta.qtl.lgrid.lo < theta.hat[k]  
+		log.lgrid.hi = theta.qtl.lgrid.hi > theta.hat[k]        
+		percs.lo = 1-colSums(log.lgrid.lo)/B
+		percs.hi = 1-colSums(log.lgrid.hi)/B
+		lo.log = (percs.lo[1:(G-1)]<alpha)*(percs.lo[2:G]>=alpha)
+		hi.log = (percs.hi[1:(G-1)]<alpha)*(percs.hi[2:G]>=alpha)
+		if(sum(lo.log)>0){l.alpha.lo = l.grid.lo[lo.log*(1:(G-1))]}
+		if(sum(lo.log)<=0){l.alpha.lo = .0001}
+		if(sum(hi.log)>0){l.alpha.hi = l.grid.hi[hi.log*(1:(G-1))]}
+		if(sum(hi.log)<=0){l.alpha.hi = 1-.0001}
+		perc.cal.ints[k,] = quantile(theta.hat.boot,c(l.alpha.lo, l.alpha.hi))
+		}
+
+	colnames(perc.cal.ints) = c(paste("lower ",round(alpha,3),sep=""),paste("upper ",1-round(alpha,3),sep=""))
+	rownames(perc.cal.ints) = prednames
+	perc.cal.ints
+	}
+
+
+
+
+
+##' Fast computation of quantiles
+##'
+##' Helper function which takes as input a vector and obtains quantiles for it.
+##' Number of quantiles may be greater than one.
+##' @name Cquantile
+##' @param xx Numeric vector we are obtaining quantiles for.
+##' @param p Numeric vector of quantiles.
+##' @return Numeric vector containing quantiles, possibly greater than one.
+Cquantile <- function(xx, p) {
+    .Call('perccal_Cquantile', PACKAGE = 'perccal', xx, p)
+}
+
+
+##' Sample from [1,2,...,N] with replacement nsamp times in Rcpp.
+##'
+##' Helper function which samples from [1,2,...,N] with replacement 
+##' nsamp times in Rcpp.
+##' @name sample_rcpp
+##' @param N Largest integer to sample from.
+##' @param nsamp number of samples from [1,2,...,N] with replacement to obtain.
+##' @return samps nsamp-length vector of samples from [1,2,...,N] with replacement to obtain.
+sample_rcpp <- function(N, nsamp) {
+    .Call('perccal_sample_rcpp', PACKAGE = 'perccal', N, nsamp)
+}
+
+
+
+##' Fast computation of internal double bootstrap calculations
+##'
+##' This is the workhorse function of the package, speeding up computations
+##' within double bootstrap routine.
+##' @name Cdboot_multi
+##' @param xxyy (n by p+1) matrix for X (design matrix) and response vector y.
+##' @param lgridlo Lower quantile values of double bootstrap distribution to 
+##' obtain.
+##' @param lgridhi Upper quantile values of double bootstrap distribution to 
+##' obtain.
+##' @param B Number of 1st stage bootstrap samples.
+##' @param B2 Number of 2nd stage double bootstrap samples.
+##' @param G Calculate quantile-based empirical coverage at this many grid 
+##' points
+##' @return theta_hat_boot first-level bootstrap estimates of all slope coefficients
+##' @return theta_qtl_lgrid_lo (p+1 by B by G by 1) matrix for lower quantiles at 
+##' all grid points for all predictors over all bootstrap samples.
+##' @return theta_qtl_lgrid_hi (p+1 by B by G by 1) matrix for upper quantiles at 
+##' all grid points for all predictors over all bootstrap samples.
+Cdboot_multi <- function(xxyy, lgridlo, lgridhi, B, B2, G) {
+    .Call('perccal_Cdboot_multi', PACKAGE = 'perccal', xxyy, lgridlo, lgridhi, B, B2, G)
+}
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/man/Cdboot_multi.Rd b/man/Cdboot_multi.Rd
@@ -0,0 +1,38 @@
+% Generated by roxygen2 (4.1.1): do not edit by hand
+% Please edit documentation in R/functions-code.R
+\name{Cdboot_multi}
+\alias{Cdboot_multi}
+\title{Fast computation of internal double bootstrap calculations}
+\usage{
+Cdboot_multi(xxyy, lgridlo, lgridhi, B, B2, G)
+}
+\arguments{
+\item{xxyy}{(n by p+1) matrix for X (design matrix) and response vector y.}
+
+\item{lgridlo}{Lower quantile values of double bootstrap distribution to
+obtain.}
+
+\item{lgridhi}{Upper quantile values of double bootstrap distribution to
+obtain.}
+
+\item{B}{Number of 1st stage bootstrap samples.}
+
+\item{B2}{Number of 2nd stage double bootstrap samples.}
+
+\item{G}{Calculate quantile-based empirical coverage at this many grid
+points}
+}
+\value{
+theta_hat_boot first-level bootstrap estimates of all slope coefficients
+
+theta_qtl_lgrid_lo (p+1 by B by G by 1) matrix for lower quantiles at
+all grid points for all predictors over all bootstrap samples.
+
+theta_qtl_lgrid_hi (p+1 by B by G by 1) matrix for upper quantiles at
+all grid points for all predictors over all bootstrap samples.
+}
+\description{
+This is the workhorse function of the package, speeding up computations
+within double bootstrap routine.
+}
+
diff --git a/man/Cquantile.Rd b/man/Cquantile.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2 (4.1.1): do not edit by hand
+% Please edit documentation in R/functions-code.R
+\name{Cquantile}
+\alias{Cquantile}
+\title{Fast computation of quantiles}
+\usage{
+Cquantile(xx, p)
+}
+\arguments{
+\item{xx}{Numeric vector we are obtaining quantiles for.}
+
+\item{p}{Numeric vector of quantiles.}
+}
+\value{
+Numeric vector containing quantiles, possibly greater than one.
+}
+\description{
+Helper function which takes as input a vector and obtains quantiles for it.
+Number of quantiles may be greater than one.
+}
+
diff --git a/man/perccal-package.Rd b/man/perccal-package.Rd
@@ -0,0 +1,49 @@
+\name{perccal-package}
+\alias{perccal-package}
+\alias{perccal}
+\docType{package}
+\title{
+Computing Confidence Intervals Via Double Bootstrap 'perc-cal' Method 
+}
+\description{
+
+Contains functions which allow the user to compute confidence intervals 
+quickly using the double bootstrap-based percentile calibrated 
+('perc-cal') method for linear regression coefficients. 
+
+}
+\details{
+  \tabular{ll}{
+    Package: \tab perccal\cr
+    Type: \tab Package\cr
+    Version: \tab 1.0\cr
+    Date: \tab 2016-06-14\cr
+    License: \tab GPL-3\cr
+    }
+
+	Contains functions which allow users to compute confidence intervals 
+	quickly using the double bootstrap-based percentile calibrated 
+	('perc-cal') method for linear regression coefficients. 
+
+	The help of Justin Bleich is strongly acknowledged.
+
+}
+\author{
+  Daniel McCarthy
+
+  Maintainer: Daniel McCarthy <danielmc@wharton.upenn.edu>
+}
+\references{
+
+Efron, Bradley; Tibshirani, Robert J. \dQuote{An Introduction to the Bootstrap}
+1994. Book. Publisher: CRC Press.
+
+McCarthy, Daniel; Zhang, Kai; Berk, Richard; Brown, Lawrence; Buja, Andreas;
+George, Edward; Zhao, Linda. \dQuote{Calibrated Percentile Double Bootstrap 
+For Robust Linear Regression Inference} 2016. Available on arXiv: 
+https://arxiv.org/abs/1511.00273
+
+}
+
+\keyword{ package }
+
diff --git a/man/perccal_interval.Rd b/man/perccal_interval.Rd
@@ -0,0 +1,47 @@
+% Generated by roxygen2 (4.1.1): do not edit by hand
+% Please edit documentation in R/functions-code.R
+\name{perccal_interval}
+\alias{perccal_interval}
+\title{Calculates Percentile-Calibrated Linear Regression Confidence Intervals}
+\usage{
+perccal_interval(Xy, alpha, G = 20, B = 999, B2 = 999)
+}
+\arguments{
+\item{Xy}{[n by (p+1)] matrix: X in columns 1 to p, y in column p+1.
+X is the design matrix, and is assumed to not include a vector of one's.}
+
+\item{alpha}{Target coverage desired.}
+
+\item{G}{Number of grid points to evaluate calibrated percentile method
+on each side over.}
+
+\item{B}{Number of 1st stage bootstrap samples.}
+
+\item{B2}{Number of 2nd stage double bootstrap samples.}
+}
+\value{
+Return a (p+1)x2 matrix containing confidence intervals for all
+regression coefficients, estimated via the perc-cal method.
+}
+\description{
+This is the main function of the package.  It takes as inputs the
+predictor/response matrix appended together, which can be either
+a data frame or a matrix, along with the desired coverage and other
+settings, and outputs marginal confidence intervals for each of the
+predictors, including the intercept.
+}
+\examples{
+set.seed(1234)
+n = 32
+B = 500
+B2 = 500
+G=20
+x1=rnorm(n)
+x2=rnorm(n)
+eps=rnorm(n)
+y = x1 + 2*x2 + eps
+Xy = cbind(x1,x2,y)
+alpha = .025
+perccal_interval(Xy, alpha, G, B, B2)
+}
+
diff --git a/man/sample_rcpp.Rd b/man/sample_rcpp.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2 (4.1.1): do not edit by hand
+% Please edit documentation in R/functions-code.R
+\name{sample_rcpp}
+\alias{sample_rcpp}
+\title{Sample from [1,2,...,N] with replacement nsamp times in Rcpp.}
+\usage{
+sample_rcpp(N, nsamp)
+}
+\arguments{
+\item{N}{Largest integer to sample from.}
+
+\item{nsamp}{number of samples from [1,2,...,N] with replacement to obtain.}
+}
+\value{
+samps nsamp-length vector of samples from [1,2,...,N] with replacement to obtain.
+}
+\description{
+Helper function which samples from [1,2,...,N] with replacement
+nsamp times in Rcpp.
+}
+