version 1.0-0

cran · Oct 3, 2022 · 043cb2f · 043cb2f
commit 043cb2f
Show file tree

Hide file tree

Showing 11 changed files with 1,971 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,20 @@
+Package: sbde
+Version: 1.0-0
+Date: 2022-09-29
+Title: Semiparametric Bayesian Density Estimation
+Author: Surya Tokdar <surya.tokdar@duke.edu>
+Maintainer: Surya Tokdar <surya.tokdar@duke.edu>
+Depends: R (>= 2.6)
+Imports: coda, extremefit
+Description: Offers Bayesian semiparametric density estimation 
+             and tail-index estimation for heavy tailed data, by
+             using a parametric, tail-respecting transformation
+             of the data to the unit interval and then modeling
+             the transformed data with a purely nonparametric
+             logistic Gaussian process density prior. Based on 
+             Tokdar et al. (2022) <doi:10.1080/01621459.2022.2104727>.
+License: GPL-2
+NeedsCompilation: yes
+Packaged: 2022-10-01 19:46:32 UTC; stokdar
+Repository: CRAN
+Date/Publication: 2022-10-03 07:30:02 UTC
diff --git a/MD5 b/MD5
@@ -0,0 +1,10 @@
+1b6efd1168b683fe995dcf07639be31f *DESCRIPTION
+ce6fe1d55157837a97a823c49139c3a8 *NAMESPACE
+27d6f750b721ebcd1cd8171d4f2fb930 *R/sbde.R
+9f61b84b5a8275ed29da599e9aae2ec9 *man/coef.sbde.Rd
+531d77308a920d7a2de3ec38eb286829 *man/predict.sbde.Rd
+b0fb63b258c89e588cbdebc52bbb7961 *man/sbde-internal.Rd
+1d451c9b579aaad052f7e4dc6447450e *man/sbde.Rd
+c4f97f7a87fd365cce93e0f0159b2ca8 *man/summary.sbde.Rd
+8436449ee6cd073c1bb09b9673fd2ffd *src/registerDynamicSymbol.c
+512033c2eda7a3b920cd60f82711051c *src/sbde.c
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,16 @@
+useDynLib("sbde", .registration = TRUE)
+# Export all functions listed below
+export(
+  sbde
+)
+
+S3method(update, sbde)
+S3method(summary, sbde)
+S3method(coef, sbde)
+S3method(predict, sbde)
+
+# Import all packages listed as Imports or Depends
+import(utils, stats, graphics, grDevices)
+importFrom(coda, geweke.diag)
+importFrom(coda, as.mcmc)
+importFrom(extremefit, hill.adapt)
diff --git a/R/sbde.R b/R/sbde.R
diff --git a/man/coef.sbde.Rd b/man/coef.sbde.Rd
@@ -0,0 +1,34 @@
+\name{coef.sbde}
+\Rdversion{0.0-6}
+\alias{coef.sbde}
+\title{Coefficient Extraction from sbde Model Fit}
+\description{Post process MCMC output from \code{\link{sbde}} to create summaries of parameter and quantile estimates.}
+\usage{ 
+\method{coef}{sbde}(object, burn.perc = 0.5, nmc = 200,
+    prob = c(.001,.01,.1,1:99,99.9,99.99,99.999)/100, ...)
+} 
+\arguments{
+\item{object}{a fitted model of the class \code{sbde}.}
+\item{burn.perc}{a positive fraction indicating what fraction of the saved draws are to be discarded as burn-in}
+\item{nmc}{integer giving the number of samples, post burn-in, to be used in Monte Carlo averaging}
+\item{prob}{a numeric vector of probabiities at which quantiles are to be estimated.}
+\item{...}{not currently implemented}
+}
+\value{
+Extracts posterior summary of model parameters, as well as estimated quantiles. A list is returned invisibly with the following fields.
+
+\item{psamp}{a matrix with 3 columns and \code{nmc} rows storing the posterior draws of the parameters of base distribution used in transformation}
+\item{parametric}{a matrix with posterior median, 2.5th and 97.5th percentiles of the parameters of the base distribution.}
+\item{prob}{numeric vector of probabilities at which quantiles have been estimated. Could differ slightly from the input vector \code{prob}, by removing repetitions, as well as values that are not strictly between zero and one.}
+\item{qsamp}{a matrix with \code{nmc} columns giving the posterior draws of the quantile values at levels given by \code{prob}.}
+\item{qest}{a summary of \code{qsamp} given by the posterior median and 95 precent credible interval end points.}
+\item{ss}{a vector of integers giving the indices of the mcmc samples that were used in posterior summary calculations.}
+}
+
+\seealso{\code{\link{sbde}}, \code{\link{summary.sbde}} and \code{\link{predict.sbde}} for model fitting under sbde.}
+\examples{
+y <- abs(rt(n=1000, df=4))
+fit <- sbde(y, blocking="all", fbase="gpd", verbose=FALSE)
+coef(fit)
+}
+\keyword{programming}
diff --git a/man/predict.sbde.Rd b/man/predict.sbde.Rd
@@ -0,0 +1,32 @@
+\name{predict.sbde}
+\Rdversion{0.0-6}
+\alias{predict.sbde}
+\title{Posterior predictive Summary for Semiparametric Density Estimation}
+\description{Extract posterior predictive density estimate for \code{\link{sbde}}}
+\usage{ 
+\method{predict}{sbde}(object, burn.perc = 0.5, nmc = 200, yRange = range(object$y), yLength = 401, ...)
+} 
+\arguments{
+    \item{object}{a fitted model of the class 'sbde'.}
+    \item{burn.perc}{a positive fraction indicating what fraction of the saved draws are to be discarded as burn-in}
+    \item{nmc}{integer giving the number of samples, post burn-in, to be used in Monte Carlo averaging}
+    \item{yRange}{Range of values over which posterior predictive density is to be evaluated.}
+    \item{yLength}{Number of grid points spanning yRange for posterior predictive density evaluation.}
+    \item{...}{no additional parameters are used.}
+}
+\value{
+Returns a list with three items:
+\item{y}{vector giving the grid over which the posterior predictive density is evaluated.}
+\item{fsamp}{a matrix with \code{yLength} many rows and \code{nmc} many columns. Each column corresponds to a draw of the response density from the posterior predictive.}
+\item{fest}{summary of the posterior predictive density given by point-wise median, 2.5th and 97.5th percentiles.}
+}
+
+\seealso{\code{\link{sbde}}, \code{\link{coef.sbde}} and \code{\link{summary.sbde}}.}
+\examples{
+y <- abs(rt(n=1000, df=4))
+fit <- sbde(y, blocking="all", fbase="gpd", verbose=FALSE)
+pp <- predict(fit)
+hist(y, 50, freq=FALSE)
+with(pp, for(j in 1:3) lines(y, fest[,j], lty=1+(j!=2)))
+}
+\keyword{programming}
diff --git a/man/sbde-internal.Rd b/man/sbde-internal.Rd
@@ -0,0 +1,22 @@
+\name{sbde-internal}
+\Rdversion{0.1-1}
+\alias{lamFn}
+\alias{nuFn}
+\alias{nuFn.inv}
+\alias{sigFn}
+\alias{sigFn.inv}
+\alias{unitFn}
+\alias{sum.sq}
+\alias{extract}
+\alias{logmean}
+\alias{logsum}
+\alias{shrinkFn}
+\alias{trape}
+\alias{klGP}
+\alias{proxFn}
+\alias{transform.grid}
+
+\title{Internal sbde Functions}
+\description{These functions are repeatedly used by the main functions of the package "sbde". Not intended for direct use the by user.}
+
+\keyword{internal}
diff --git a/man/sbde.Rd b/man/sbde.Rd
@@ -0,0 +1,87 @@
+\name{sbde}
+\Rdversion{0.0-6}
+\alias{sbde}
+\alias{update.sbde}
+\title{Bayesian Semiparametric Density Estimation}
+\description{Provides a semiparametric estimation of the density function of independent univariate data.
+}
+\usage{
+sbde(y, nsamp = 1e3, thin = 10, cens = rep(0,length(y)), 
+     wt = rep(1,length(y)), incr = list(knot=0.2, grid=0.01),
+     par = c("Hill-kde", "pmean", "rand")[1], tail.warp = c(0,0), 
+     hyper = list(sig = c(.1,.1), lam = c(6,4), kap = c(1.5,1.5,1)), 
+     prox.range = c(.2,.95), acpt.target = 0.15, ref.size = 3, 
+     blocking = c("all", "gp", "loc+scale+tail"), temp = 1, expo = 2, 
+     blocks.mu, blocks.S, fix.nu = FALSE, 
+     fbase = c("t", "t+", "gpd", "gpd-rescaled", "unif"), 
+     spacing=list(knot="regular", grid="regular"),
+     verbose = TRUE)
+
+\method{update}{sbde}(object, nadd, append = TRUE, ...)
+}
+\arguments{
+\item{y}{numeric vector of response data.}
+\item{nsamp}{number of posterior samples to be saved; defaults to 1000.}
+\item{thin}{thinning rate for the Markov chain sampler -- one posterior sample is saved per \code{thin} iterations. Defaults to 10. The Markov chain sampler runs for a total of \code{nsamp * thin} many iterations.}
+\item{cens}{censoring status of response. Must be a vector of 0s and 1s of length same as length(y), with 1 indicating right censoring, and, 0 indicating no censoring. Defaults to all zeros.}
+\item{wt}{weights attached to the observation units, expected to be non-negative numbers, and defaults to a vector of ones.}
+\item{incr}{a list with two named elements, 'knot' and 'grid', giving the increment sizes for the knots in the predictive process approximation and the grid to be used for logistic Gaussian process likelihood evaluation. Defaults to 0.2 and 0.01 respectively}
+\item{par}{either a numeric vector giving the parameter initialization or a character string indicating how the parameter should be initialized. If input numeric vector length is smaller than required parameter count, then supplied values are appended with zeros to create a full initialization. If input equals "pmean" then the mcmc is initialized at the prior center given by a vector of zeros, or if it equals "rand" then intialization is done by drawing randomly from the prior, or if it equals "Hill-kde" then the Hill estimate is used to estimate the shape parameter, the location and scale parameters are set based on data median and 95th percentile, and the initialization of the Gaussian process is done based on a kernel density estimate of the transformed data.}
+\item{tail.warp}{a non-negative 2-vector giving the degrees of tail warping to be done at each tail. Larger values will allow more variation of the non-parametric density at the corresponding tail. }
+\item{hyper}{hyperparameters of the prior distribution. Must be a list with one or both of the following two fields: \code{lam}: a two vector giving the parameters of the beta distribution on proximity = \eqn{\exp(-0.01* \lambda^2)}{exp(-0.01 * lambda^2)}, and \code{kap}:  a vector to be coerced into a \code{3 * nkap} matrix, with \code{nkap} being the number of components in the mixture of gamma prior on \code{kappa}, and each column of the matrix gives the shape, rate and mixing weight of a component.}
+\item{prox.range}{for specifying the range of length-scale parameter of the Gaussian process prior.}
+\item{acpt.target}{target acceptance rate of the adaptive Metropolis sampler; defaults to 0.15}
+\item{ref.size}{adaptation rate of the adaptive Metropolis sampler. The proposal density is updated once every \code{ref.size} iterations. Could be a single number or a vector of length same as the number of blocks.}
+\item{blocking}{type of blocking to be applied represented by a character vector with elements comprising of the strings: "gp", "loc", "scale", "tail" and their combinations separated by "+". Each of the basic string types will include the corresponding model parameters into the block. For example a valid input could be c("gp", "gp+loc+scale", "loc+scale+tail"), where the first block updates only the Gaussian process parameters, the second block jointly updates the GP parameters and the location and scale, and, the third block updates the location, scale and tail parameters. A combination of all four types can be represented as "all". }
+\item{temp}{temperature of the log-likelihood function. The log-likelihood function is raised to the power of \code{temp}. Defaults to 1.}
+\item{expo}{the exponent to be used in the covariance kernel of the Gaussian process priors. Defaults to 2, giving the standard squared-exponential covariance kernel.}
+\item{blocks.mu}{initial block specific means in the form of  a list. If left unspecified then will be automatically generated as a list of vectors of zeros of appropriate lengths matching the corresponding block sizes.}
+\item{blocks.S}{initial block specific covariance matrices in the form of a list. If left unspecified then will be automatically generated as a list of identity matrices of appropriate dimensions matching the corresponding block sizes.}
+\item{fix.nu}{either the logical FALSE indicating that nu should be learned, or a positive real number giving the fixed value of nu, which is then excluded from MCMC updates}
+\item{fbase}{either "t" (default) or "t+" (for half-t distributions on the positive real lines) or "gpd" (for generalized pareto distributions with location zero and parametrized by nu = 1 / shape) or "gpd-rescaled" (same as gpd, but scale parameter adjusted according to shape so that 90-th percentile matches that of gpd with shape=1/6 and scale=1) or  "unif" to indicate what base distribution is to be used.}
+\item{spacing}{the type of spacing to be used for the predictive process knots and the likelihood evaluation grid. For either object, the default choice is "regular". Any other specification is taken to equal "irregular". A regular grid places points equally between 0 and 1 as given by the prespecified increment value. When the likelihood "grid" is chosen to be "irregular", the regular grid is appended with more points at both extremes by recursive bisection until 1/\code{n} or 1 - 1/\code{n} is reached. For predictive process knots, "irregular" applies only when \code{tail.warp} is different that \code{c(0,0)}, and more knots are appended at each extreme based on how much warping is done to it.}
+\item{verbose}{logical indicating whether MCMC progress should be printed, defaults to TRUE}
+\item{object}{a fitted model of the class 'qde'.}
+\item{nadd}{number of additional MCMC samples.}
+\item{append}{logical indicating whether new samples should be appended to old ones. If FALSE then old samples are discarded.}
+\item{...}{no additional arguments are allowed}
+}
+\value{
+\code{sbde(y, ...)} returns a `sbde' class object to be used by \code{\link{coef}}, \code{\link{summary}} and \code{\link{predict}}.
+
+Returned object is a list containing the following variables.
+\item{par}{latest draw of the parameter vector}
+\item{y}{response vector}
+\item{cens}{censoring status vector}
+\item{wt}{vector of observation weights}
+\item{hyper}{completed list of hyper-parameters}
+\item{dim}{model dimension vector of the form c(n, length of tau grid, position of \eqn{\tau_0}{tau0} on the grid, nknots, length of lambda grid, nkap, total number of MCMC iterations, thin, nsamp)}
+\item{gridmats}{details of covariance matrix factors etc, intended for internal use.}
+\item{tau.g}{the tau grid}
+\item{muV}{list of means for parameter blocks}
+\item{SV}{list of covariance matrices for parameter blocks}
+\item{blocks}{list of blocks}
+\item{blocks.size}{vector of block lengths}
+\item{dmcmcpar}{numeric vector containing details of adaptive MCMC runs, equals c(temp, decay rate of adaptation, vector of target acceptance rates for the blocks, vector of increment scales used in adaptation). Intended strictly for internal use.}
+\item{imcmcpar}{numeric vector containing details of adaptive MCMC runs, equals c(number of parameter blocks, ref.size, indicator on whether details are to be printed during MCMC progress, rate of details printing, a vector of counters needed for printing). Intended strictly for internal use.}
+\item{parsamp}{a long vector containing the parameter draws. Could be coerced into a matrix of dim \code{npar * nsamp}. Intended primarily for use by \code{\link{summary}} and \code{\link{coef}}.}
+\item{acptsamp}{a long vector containing rates of acceptance statistics for parameter blocks. Could be coerced into a matrix of dim \code{nblocks * nsamp}. Not very informative, because thinning times and adaptation times may not be exactly synced.}
+\item{lpsamp}{vector of log posterior values for the saved MCMC draws.}
+\item{other.controls}{a vector of two integers, with the first storing the choice of the fbase, and the second storing the choice of the gridtype.}
+\item{prox}{vector of proximity (exp(-0.01*lambda^2)) grid values}
+\item{runtime}{run time of the MCMC}
+\item{base.bundle}{a list of densty, distribution, quantile etc functions associated with the base distribution.}
+}
+
+\details{
+For positive valued data, it is recommended to use fbase as "gpd", which yields much faster computation than the choice of "t+". The difference is entirely due to difference in machine time needed to compute the CDF of the generalized Pareto versus that of the Student-t. 
+}
+\references{Tokdar, S.T., Jiang, S. and Cunningham, E.L. (2022). Heavy-tailed density estimation. \emph{Journal of the American Statistical Association}, (just-accepted) <https://doi.org/10.1080/01621459.2022.2104727>.}
+\seealso{\code{\link{summary.sbde}}, \code{\link{coef.sbde}} and \code{\link{predict.sbde}}.}
+\examples{
+y <- abs(rt(n=1000, df=4))
+fit <- sbde(y, blocking="all", fbase="gpd", verbose=FALSE)
+coef(fit)
+}
+\keyword{programming}
+
diff --git a/man/summary.sbde.Rd b/man/summary.sbde.Rd
@@ -0,0 +1,37 @@
+\name{summary.sbde}
+\Rdversion{0.0-6}
+\alias{summary.sbde}
+\title{Summary Method for Semiparametric Density Estimation}
+\description{Summarize model fit for \code{\link{sbde}}}
+\usage{ 
+\method{summary}{sbde}(object, ntrace = 1000, burn.perc = 0.5, plot.dev = TRUE,
+        more.details = FALSE, ...)
+} 
+\arguments{
+    \item{object}{a fitted model of the class 'sbde'.}
+    \item{ntrace}{number of draws to be included in trace plots}
+    \item{burn.perc}{fraction of MCMC draws to be discarded as burn-in.}
+    \item{plot.dev}{logical indicator of whether to show trace plot of deviance}
+    \item{more.details}{logical indicating whether other details from MCMC are to be plotted}
+    \item{...}{a limited number of plotting controls that are passed onto the deviance plot}
+}
+\value{
+Displays the trace of the deviance statistic. More details include trace plots of of the proximity parameter of each GP, a plot of Geweke p-values for (from \code{\link{geweke.diag}}) convergence of each model parameter and an image plot of parameter correlation. 
+
+The following quantities are returned invisibly.
+\item{deviance}{vector deviance statistic of the samples parameter draws} 
+\item{pg}{a matrix with \code{nsamp} number of columns. Each column gives the conditional posterior weights on the lambda grid values for the corresponding GP function.}       
+\item{prox}{posterior draws of proximity parameter.}
+\item{ll}{a matrix of \code{n*nsamp} containing observation level log-likelihood contributions. Used to calculate \var{waic}, and could be used for other AIC calculations.}       
+\item{waic}{Two versions of Watanabe AIC from Gelman, Hwang and Vehtari (2014).}
+}
+
+\references{Gelman, A., Hwang, J., and Vehtari, A. (2014). Understanding predictive information criterion for Bayesian models. \emph{Stat Comput}, 24, 997-1016.}
+\seealso{\code{\link{sbde}} and \code{\link{coef.sbde}}.}
+\examples{
+y <- abs(rt(n=1000, df=4))
+fit <- sbde(y, blocking="all", fbase="gpd", verbose=FALSE)
+sm <- summary(fit, more=TRUE)
+print(sm$waic)
+}
+\keyword{programming}
diff --git a/src/registerDynamicSymbol.c b/src/registerDynamicSymbol.c
@@ -0,0 +1,35 @@
+#include <stdlib.h> // for NULL
+#include <R_ext/Rdynload.h>
+
+/* FIXME: 
+   Check these declarations against the C/Fortran source code.
+*/
+
+/* .C calls */
+extern void TOY(double *, double *, int *, int *);
+extern void SBDE(double *, double *, int *, double *, double *,
+                 int *, double *, double *, double *, double *,
+                 int *, int *, double *, int *, double *,
+                 double *, double *, int *);
+extern void DEV(double *, double *, int *, double *, double *,
+                int *, double *, double *, double *, double *,
+                double *, double *, int *);
+extern void PRED(double *, double *, double *, int *, double *,
+                 double *, double *, int *);
+extern void QUANT(double *, double *, double *, int *, double *,
+                  double *, double *, int *);
+
+static const R_CMethodDef CEntries[] = {
+    {"TOY", (DL_FUNC) &TOY, 4},
+    {"SBDE", (DL_FUNC) &SBDE, 18},
+    {"DEV",  (DL_FUNC) &DEV,  13},
+    {"PRED", (DL_FUNC) &PRED,  8},
+    {"QUANT", (DL_FUNC) &QUANT, 8},
+    {NULL, NULL, 0}
+};
+
+void R_init_sbde(DllInfo *dll)
+{
+    R_registerRoutines(dll, CEntries, NULL, NULL, NULL);
+    R_useDynamicSymbols(dll, FALSE);
+}