R/source_sba.R

#---------------------------------------------------------------
# Source functions for semiparametric Bayesian regression
#---------------------------------------------------------------
#' Semiparametric Bayesian linear model
#'
#' Monte Carlo sampling for Bayesian linear regression with an
#' unknown (nonparametric) transformation. A g-prior is assumed
#' for the regression coefficients.
#'
#' @param y \code{n x 1} response vector
#' @param X \code{n x p} matrix of predictors
#' @param X_test \code{n_test x p} matrix of predictors for test data;
#' default is the observed covariates \code{X}
#' @param psi prior variance (g-prior)
#' @param laplace_approx logical; if TRUE, use a normal approximation
#' to the posterior in the definition of the transformation;
#' otherwise the prior is used
#' @param approx_g logical; if TRUE, apply large-sample
#' approximation for the transformation
#' @param nsave number of Monte Carlo simulations
#' @param ngrid number of grid points for inverse approximations
#' @param verbose logical; if TRUE, print time remaining
#' @return a list with the following elements:
#' \itemize{
#' \item \code{coefficients} the posterior mean of the regression coefficients
#' \item \code{fitted.values} the posterior predictive mean at the test points \code{X_test}
#' \item \code{post_theta}: \code{nsave x p} samples from the posterior distribution
#' of the regression coefficients
#' \item \code{post_ypred}: \code{nsave x n_test} samples
#' from the posterior predictive distribution at test points \code{X_test}
#' \item \code{post_g}: \code{nsave} posterior samples of the transformation
#' evaluated at the unique \code{y} values
#' \item \code{model}: the model fit (here, \code{sblm})
#' }
#' as well as the arguments passed in.
#'
#' @details This function provides fully Bayesian inference for a
#' transformed linear model using Monte Carlo (not MCMC) sampling.
#' The transformation is modeled as unknown and learned jointly
#' with the regression coefficients (unless \code{approx_g} = TRUE, which then uses
#' a point approximation). This model applies for real-valued data, positive data, and
#' compactly-supported data (the support is automatically deduced from the observed \code{y} values).
#' The results are typically unchanged whether \code{laplace_approx} is TRUE/FALSE;
#' setting it to TRUE may reduce sensitivity to the prior, while setting it to FALSE
#' may speed up computations for very large datasets.
#'
#' @examples
#' \donttest{
#' # Simulate some data:
#' dat = simulate_tlm(n = 100, p = 5, g_type = 'step')
#' y = dat$y; X = dat$X # training data
#' y_test = dat$y_test; X_test = dat$X_test # testing data
#'
#' hist(y, breaks = 25) # marginal distribution
#'
#' # Fit the semiparametric Bayesian linear model:
#' fit = sblm(y = y, X = X, X_test = X_test)
#' names(fit) # what is returned
#'
#' # Note: this is Monte Carlo sampling, so no need for MCMC diagnostics!
#'
#' # Evaluate posterior predictive means and intervals on the testing data:
#' plot_pptest(fit$post_ypred, y_test,
#'             alpha_level = 0.10) # coverage should be about 90%
#'
#' # Check: correlation with true coefficients
#' cor(dat$beta_true[-1],
#'     coef(fit)[-1]) # excluding the intercept
#'
#' # Summarize the transformation:
#' y0 = sort(unique(y)) # posterior draws of g are evaluated at the unique y observations
#' plot(y0, fit$post_g[1,], type='n', ylim = range(fit$post_g),
#'      xlab = 'y', ylab = 'g(y)', main = "Posterior draws of the transformation")
#' temp = sapply(1:nrow(fit$post_g), function(s)
#'   lines(y0, fit$post_g[s,], col='gray')) # posterior draws
#' lines(y0, colMeans(fit$post_g), lwd = 3) # posterior mean
#'
#' # Add the true transformation, rescaled for easier comparisons:
#' lines(y,
#'       scale(dat$g_true)*sd(colMeans(fit$post_g)) + mean(colMeans(fit$post_g)), type='p', pch=2)
#' legend('bottomright', c('Truth'), pch = 2) # annotate the true transformation
#'
#' # Posterior predictive checks on testing data: empirical CDF
#' y0 = sort(unique(y_test))
#' plot(y0, y0, type='n', ylim = c(0,1),
#'      xlab='y', ylab='F_y', main = 'Posterior predictive ECDF')
#' temp = sapply(1:nrow(fit$post_ypred), function(s)
#'   lines(y0, ecdf(fit$post_ypred[s,])(y0), # ECDF of posterior predictive draws
#'         col='gray', type ='s'))
#' lines(y0, ecdf(y_test)(y0),  # ECDF of testing data
#'      col='black', type = 's', lwd = 3)
#' }
#' @export
sblm = function(y, X, X_test = X,
                psi = length(y),
                laplace_approx = TRUE,
                approx_g = FALSE,
                nsave = 1000,
                ngrid = 100,
                verbose = TRUE){

  # For testing:
  # X_test = X; psi = length(y); laplace_approx = TRUE; approx_g = FALSE; nsave = 1000; verbose = TRUE; ngrid = 100

  # Data dimensions:
  n = length(y); p = ncol(X)

  # Testing data points:
  if(!is.matrix(X_test)) X_test = matrix(X_test, nrow  = 1)

  # And some checks on columns:
  if(p >= n) stop('The g-prior requires p < n')
  if(p != ncol(X_test)) stop('X_test and X must have the same number of columns')
  #----------------------------------------------------------------------------
  # Key matrix quantities:
  XtX = crossprod(X)
  XtXinv = chol2inv(chol(XtX))
  xt_Sigma_x = sapply(1:n, function(i)
    crossprod(X[i,], XtXinv)%*%X[i,])
  #----------------------------------------------------------------------------
  # Initialize the transformation:

  # Define the CDF of y:
  Fy = function(t) n/(n+1)*ecdf(y)(t)

  # Evaluate at the unique y-values:
  y0 = sort(unique(y))
  Fy_eval = Fy(y0)

  # Grid of values for the CDF of z (based on the prior)
  z_grid = sort(unique(
    sapply(range(psi*xt_Sigma_x), function(xtemp){
      qnorm(seq(0.01, 0.99, length.out = ngrid),
            mean = 0, # assuming prior mean zero
            sd = sqrt(1 + xtemp))
    })
  ))

  # Define the moments of the CDF of z:
  if(laplace_approx){
    # Use a normal approximation for the posterior of theta

    # Recurring terms:
    Sigma_hat_unscaled = psi/(1+psi)*XtXinv # unscaled covariance (w/o sigma)
    xt_Sigma_hat_unscaled_x = sapply(1:n, function(i)
      crossprod(X[i,], Sigma_hat_unscaled)%*%X[i,])

    # First pass: fix Fz() = qnorm(), initialize coefficients
    z = qnorm(Fy(y))
    theta_hat = Sigma_hat_unscaled%*%crossprod(X, z) # point estimate

    # Alternative rank-based approaches (SLOW! but similar...)
    # theta_hat = rank_approx(y, X) # alternative rank-based approach
    # theta_hat= coef(Rfit::rfit(y  ~ X))[-1] # faster, but less accurate
    # that function also comes with a sd estimate

    # Second pass: update g(), then update coefficients
    # Moments of Z|X:
    mu_z = X%*%theta_hat
    sigma_z = sqrt(1 + xt_Sigma_hat_unscaled_x)

    # CDF of z:
    Fz_eval = Fz_fun(z = z_grid,
                     weights = rep(1/n, n),
                     mean_vec = mu_z,
                     sd_vec = sigma_z)

    # Check: update the grid if needed
    zcon = contract_grid(z = z_grid,
                         Fz = Fz_eval,
                         lower = 0.001, upper =  0.999)
    z_grid = zcon$z; Fz_eval = zcon$Fz

    # Transformation:
    g = g_fun(y = y0,
              Fy_eval = Fy_eval,
              z = z_grid,
              Fz_eval = Fz_eval)

    # Updated coefficients:
    z = g(y) # update latent data
    theta_hat = Sigma_hat_unscaled%*%crossprod(X, z) # updated coefficients

    # Moments of Z|X:
    mu_z = X%*%theta_hat
    #sigma_z = sqrt(1 + xt_Sigma_hat_unscaled_x) # no need to update

  } else {

    # Prior mean is zero:
    mu_z = rep(0, n)

    # Marginal SD based on prior:
    sigma_z = sqrt(1 + psi*xt_Sigma_x)
  }

  # Define the CDF of z:
  Fz_eval = Fz_fun(z = z_grid,
                   weights = rep(1/n, n),
                   mean_vec = mu_z,
                   sd_vec = sigma_z)

  # Check: update the grid if needed
  zcon = contract_grid(z = z_grid,
                       Fz = Fz_eval,
                       lower = 0.001, upper =  0.999)
  z_grid = zcon$z; Fz_eval = zcon$Fz

  # Compute the transformation:
  g = g_fun(y = y0, Fy_eval = Fy_eval,
            z = z_grid, Fz_eval = Fz_eval)

  # Latent data:
  z = g(y)

  # Define the grid for approximations using equally-spaced + quantile points:
  y_grid = sort(unique(c(
    seq(min(y), max(y), length.out = ngrid/2),
    quantile(y0, seq(0, 1, length.out = ngrid/2)))))

  # Inverse transformation function:
  g_inv = g_inv_approx(g = g, t_grid = y_grid)
  #----------------------------------------------------------------------------
  # Store MC output:
  post_theta = array(NA, c(nsave, p))
  post_ypred = array(NA, c(nsave, nrow(X_test)))
  post_g = array(NA, c(nsave, length(y0)))

  # Run the MC:
  if(verbose) timer0 = proc.time()[3] # For timing the sampler
  for(nsi in 1:nsave){

    # NOTE: we could do this in blocks, perhaps more efficiently...

    #----------------------------------------------------------------------------
    # Block 1: sample the transformation
    if(!approx_g){

      # Bayesian bootstrap for the CDFs

      # Dirichlet(1) weights for y:
      weights_y = rgamma(n = n, shape = 1)
      weights_y  = weights_y/sum(weights_y)

      # Dirichlet(1) weights for x:
      weights_x = rgamma(n = n, shape = 1)
      weights_x  = weights_x/sum(weights_x)

      # BB CDF of y:
      Fy_eval = sapply(y0, function(t)
        n/(n+1)*sum(weights_y[y <= t]))

      # BB CDF of z:
      Fz_eval = Fz_fun(z = z_grid,
                       weights = weights_x,
                       mean_vec = mu_z,
                       sd_vec = sigma_z)

      # Compute the transformation:
      g = g_fun(y = y0, Fy_eval = Fy_eval,
                z = z_grid, Fz_eval = Fz_eval)

      # Update z:
      z = g(y)

      # Update the inverse transformation function:
      g_inv = g_inv_approx(g = g, t_grid = y_grid)
    }
    #----------------------------------------------------------------------------
    # Block 2: sample the scale adjustment (SD)
    SSR_psi = sum(z^2) - psi/(psi+1)*crossprod(z, X%*%XtXinv%*%crossprod(X, z))
    sigma_epsilon = 1/sqrt(rgamma(n = 1,
                                  shape = .001 + n/2,
                                  rate = .001 + SSR_psi/2))
    #----------------------------------------------------------------------------
    # Block 3: sample the regression coefficients
    ch_Q = chol(1/sigma_epsilon^2*(1+psi)/(psi)*XtX)
    ell_theta = 1/sigma_epsilon^2*crossprod(X, z)
    theta = backsolve(ch_Q,
                     forwardsolve(t(ch_Q), ell_theta) +
                       rnorm(p))
    #----------------------------------------------------------------------------
    # Store the MC:

    # Posterior samples of the model parameters:
    post_theta[nsi,] = theta

    # Predictive samples of ytilde:
    ztilde = X_test%*%theta + sigma_epsilon*rnorm(n = nrow(X_test))
    post_ypred[nsi,] = g_inv(ztilde)

    # Posterior samples of the transformation:
    post_g[nsi,] = g(y0)
    #----------------------------------------------------------------------------
    if(verbose) computeTimeRemaining(nsi, timer0, nsave, nrep = ceiling(nsave/3))
  }
  if(verbose) print(paste('Total time: ', round((proc.time()[3] - timer0)), 'seconds'))

  return(list(
    coefficients = colMeans(post_theta),
    fitted.values = colMeans(post_ypred),
    post_theta = post_theta,
    post_ypred = post_ypred,
    post_g = post_g,
    model = 'sblm', y = y, X = X, X_test = X_test, psi = psi, approx_g = approx_g, sigma_epsilon = sigma_epsilon))
}
#---------------------------------------------------------------
#' Semiparametric Bayesian spline model
#'
#' Monte Carlo sampling for Bayesian spline regression with an
#' unknown (nonparametric) transformation.
#'
#' @param y \code{n x 1} response vector
#' @param x \code{n x 1} vector of observation points; if NULL, assume equally-spaced on [0,1]
#' @param x_test \code{n_test x 1} vector of testing points; if NULL, assume equal to \code{x}
#' @param psi prior variance (inverse smoothing parameter); if NULL,
#' sample this parameter
#' @param laplace_approx logical; if TRUE, use a normal approximation
#' to the posterior in the definition of the transformation;
#' otherwise the prior is used
#' @param approx_g logical; if TRUE, apply large-sample
#' approximation for the transformation
#' @param nsave number of Monte Carlo simulations
#' @param ngrid number of grid points for inverse approximations
#' @param verbose logical; if TRUE, print time remaining
#' @return a list with the following elements:
#' \itemize{
#' \item \code{coefficients} the posterior mean of the regression coefficients
#' \item \code{fitted.values} the posterior predictive mean at the test points \code{x_test}
#' \item \code{post_theta}: \code{nsave x p} samples from the posterior distribution
#' of the regression coefficients
#' \item \code{post_ypred}: \code{nsave x n_test} samples
#' from the posterior predictive distribution at \code{x_test}
#' \item \code{post_g}: \code{nsave} posterior samples of the transformation
#' evaluated at the unique \code{y} values
#' \item \code{model}: the model fit (here, \code{sbsm})
#' }
#' as well as the arguments passed in.
#'
#' @details This function provides fully Bayesian inference for a
#' transformed spline regression model using Monte Carlo (not MCMC) sampling.
#' The transformation is modeled as unknown and learned jointly
#' with the regression function (unless \code{approx_g} = TRUE, which then uses
#' a point approximation). This model applies for real-valued data, positive data, and
#' compactly-supported data (the support is automatically deduced from the observed \code{y} values).
#' The results are typically unchanged whether \code{laplace_approx} is TRUE/FALSE;
#' setting it to TRUE may reduce sensitivity to the prior, while setting it to FALSE
#' may speed up computations for very large datasets.
#'
#' @examples
#' \donttest{
#' # Simulate some data:
#' n = 100 # sample size
#' x = sort(runif(n)) # observation points
#'
#' # Transform a noisy, periodic function:
#' y = g_inv_bc(
#'   sin(2*pi*x) + sin(4*pi*x) + rnorm(n, sd = .5),
#'              lambda = .5) # Signed square-root transformation
#'
#' # Fit the semiparametric Bayesian spline model:
#' fit = sbsm(y = y, x = x)
#' names(fit) # what is returned
#'
#' # Note: this is Monte Carlo sampling, so no need for MCMC diagnostics!
#'
#' # Plot the model predictions (point and interval estimates):
#' pi_y = t(apply(fit$post_ypred, 2, quantile, c(0.05, .95))) # 90% PI
#' plot(x, y, type='n', ylim = range(pi_y,y),
#'      xlab = 'x', ylab = 'y', main = paste('Fitted values and prediction intervals'))
#' polygon(c(x, rev(x)),c(pi_y[,2], rev(pi_y[,1])),col='gray', border=NA)
#' lines(x, y, type='p')
#' lines(x, fitted(fit), lwd = 3)
#' }
#' @importFrom spikeSlabGAM sm
#' @export
sbsm = function(y, x = NULL,
                x_test = NULL,
                psi = NULL,
                laplace_approx = TRUE,
                approx_g = FALSE,
                nsave = 1000,
                ngrid = 100,
                verbose = TRUE){

  # For testing:
  # psi = length(y); laplace_approx = TRUE; approx_g = FALSE; nsave = 1000; verbose = TRUE; x_test = sort(runif(100)); ngrid = 100

  # Data dimensions:
  n = length(y)

  # Observation points:
  if(is.null(x)) x = seq(0, 1, length=n)
  if(is.null(x_test)) x_test = x

  # Recale to [0,1]:
  x = (x - min(x))/(max(x) - min(x))
  x_test = (x_test - min(x_test))/(max(x_test) - min(x_test))
  #----------------------------------------------------------------------------
  # Orthogonalized P-spline and related quantities:
  X = cbind(1/sqrt(n), poly(x, 1), sm(x))
  X = X/sqrt(sum(diag(crossprod(X))))
  diagXtX = colSums(X^2)
  p = length(diagXtX)

  # Recurring term:
  xt_Sigma_x = rowSums(X^2) # sapply(1:n, function(i) sum(X[i,]^2/diagXtX))

  # Smoothing parameter:
  if(is.null(psi)){

    # Flag to sample:
    sample_psi = TRUE

    # Initialize:
    psi = n

  } else sample_psi = FALSE
  #----------------------------------------------------------------------------
  # Initialize the transformation:

  # Define the CDF of y:
  Fy = function(t) n/(n+1)*ecdf(y)(t)

  # Evaluate at the unique y-values:
  y0 = sort(unique(y))
  Fy_eval = Fy(y0)

  # Grid of values for the CDF of z:
  z_grid = sort(unique(
    sapply(range(psi*xt_Sigma_x), function(xtemp){
      qnorm(seq(0.01, 0.99, length.out = ngrid),
            mean = 0, # assuming prior mean zero
            sd = sqrt(1 + xtemp))
    })
  ))

  # Define the moments of the CDF of z:
  if(laplace_approx){
    # Use a normal approximation for the posterior of theta

    # Recurring terms:
    diag_Sigma_hat_unscaled = 1/(diagXtX + 1/psi)
    xt_Sigma_hat_unscaled_x = colSums(t(X^2)*diag_Sigma_hat_unscaled)
    #xt_Sigma_hat_unscaled_x = sapply(1:n, function(i)
    #  crossprod(X[i,], diag(diag_Sigma_hat_unscaled))%*%X[i,])

    # First pass: fix Fz() = qnorm(), initialize coefficients
    z = qnorm(Fy(y))
    theta_hat = diag_Sigma_hat_unscaled*crossprod(X, z) # point estimate

    # Second pass: update g(), then update coefficients
    # Moments of Z|X:
    mu_z = X%*%theta_hat
    sigma_z = sqrt(1 + xt_Sigma_hat_unscaled_x)

    # CDF of z:
    Fz_eval = Fz_fun(z = z_grid,
                     weights = rep(1/n, n),
                     mean_vec = mu_z,
                     sd_vec = sigma_z)

    # Check: update the grid if needed
    zcon = contract_grid(z = z_grid,
                         Fz = Fz_eval,
                         lower = 0.001, upper =  0.999)
    z_grid = zcon$z; Fz_eval = zcon$Fz

    # Transformation:
    g = g_fun(y = y0,
              Fy_eval = Fy_eval,
              z = z_grid,
              Fz_eval = Fz_eval)

    # Updated coefficients:
    z = g(y) # update latent data
    theta_hat = diag_Sigma_hat_unscaled*crossprod(X, z) # updated coefficients

    # Moments of Z|X:
    mu_z = X%*%theta_hat
    #sigma_z = sqrt(1 + xt_Sigma_hat_unscaled_x)

  } else {

    # Prior mean is zero:
    mu_z = rep(0, n)

    # Marginal SD based on prior:
    sigma_z = sqrt(1 + psi*xt_Sigma_x)
  }

  # Define the CDF of z:
  Fz_eval = Fz_fun(z = z_grid,
                   weights = rep(1/n, n),
                   mean_vec = mu_z,
                   sd_vec = sigma_z)

  # Check: update the grid if needed
  zcon = contract_grid(z = z_grid,
                       Fz = Fz_eval,
                       lower = 0.001, upper =  0.999)
  z_grid = zcon$z; Fz_eval = zcon$Fz

  # Compute the transformation:
  g = g_fun(y = y0, Fy_eval = Fy_eval,
            z = z_grid, Fz_eval = Fz_eval)

  # Latent data:
  z = g(y)

  # Define the grid for approximations using equally-spaced + quantile points:
  y_grid = sort(unique(c(
    seq(min(y), max(y), length.out = ngrid/2),
    quantile(y0, seq(0, 1, length.out = ngrid/2)))))

  # Inverse transformation function:
  g_inv = g_inv_approx(g = g, t_grid = y_grid)

  # For fully BNP, do not sample psi:
  #----------------------------------------------------------------------------
  # Store MC output:
  post_theta = array(NA, c(nsave, p))
  post_ypred = array(NA, c(nsave, length(x_test)))
  post_g = array(NA, c(nsave, length(y0)))
  post_psi = rep(NA, nsave)

  # Run the MC:
  if(verbose) timer0 = proc.time()[3] # For timing the sampler
  for(nsi in 1:nsave){
    #----------------------------------------------------------------------------
    # Block 1: sample the transformation
    if(!approx_g){

      # Bayesian bootstrap for the CDFs

      # Dirichlet(1) weights for y:
      weights_y = rgamma(n = n, shape = 1)
      weights_y  = weights_y/sum(weights_y)

      # Dirichlet(1) weights for x:
      weights_x = rgamma(n = n, shape = 1)
      weights_x  = weights_x/sum(weights_x)

      # BB CDF of y: (NOTE could be faster!)
      Fy_eval = sapply(y0, function(t)
        n/(n+1)*sum(weights_y[y <= t]))

      # BB CDF of z:
      Fz_eval = Fz_fun(z = z_grid,
                       weights = weights_x,
                       mean_vec = mu_z,
                       sd_vec = sigma_z)

      # Compute the transformation:
      g = g_fun(y = y0, Fy_eval = Fy_eval,
                z = z_grid, Fz_eval = Fz_eval)

      # Update z:
      z = g(y)

      # Update the inverse transformation function:
      g_inv = g_inv_approx(g = g, t_grid = y_grid)
    }
    #----------------------------------------------------------------------------
    # Block 2: sample the scale adjustment (SD)
    # SSR_psi = sum(z^2) - crossprod(z, X%*%solve(crossprod(X) + diag(1/psi, p))%*%crossprod(X,z))
    SSR_psi = sum(z^2) - crossprod(1/sqrt(diagXtX + 1/psi)*crossprod(X, z))
    sigma_epsilon = 1/sqrt(rgamma(n = 1,
                                  shape = .001 + n/2,
                                  rate = .001 + SSR_psi/2))
    #----------------------------------------------------------------------------
    # Block 3: sample the regression coefficients
    Q_theta = 1/sigma_epsilon^2*(diagXtX + 1/psi)
    ell_theta = 1/sigma_epsilon^2*crossprod(X, z)
    theta = rnorm(n = p,
                 mean = Q_theta^-1*ell_theta,
                 sd = sqrt(Q_theta^-1))
    #----------------------------------------------------------------------------
    # Block 4: sample the smoothing parameter
    if(sample_psi){
      psi = 1/rgamma(n = 1,
                     shape = 0.01 + p/2,
                     rate = 0.01 + sum(theta^2)/(2*sigma_epsilon^2))
    }
    #----------------------------------------------------------------------------
    # Store the MC:

    # Posterior samples of the model parameters:
    post_theta[nsi,] = theta

    # Predictive samples of ytilde:
    # Note: it's easier/faster to just smooth for the testing points
    # (the orthogonalized basis is a pain to recompute)
    ztilde = stats::spline(x = x, y = X%*%theta, xout = x_test)$y +
      sigma_epsilon*rnorm(n = length(x_test))
    post_ypred[nsi,] = g_inv(ztilde)

    # Posterior samples of the transformation:
    post_g[nsi,] = g(y0)

    post_psi[nsi] = psi
    #----------------------------------------------------------------------------
    if(verbose) computeTimeRemaining(nsi, timer0, nsave, nrep = ceiling(nsave/2))
  }
  if(verbose) print(paste('Total time: ', round((proc.time()[3] - timer0)), 'seconds'))

  return(list(
    coefficients = colMeans(post_theta),
    fitted.values = colMeans(post_ypred),
    post_theta = post_theta,
    post_ypred = post_ypred,
    post_g = post_g,  post_psi = post_psi,
    model = 'sbsm', y = y, X = X, psi = psi, approx_g = approx_g, sigma_epsilon = sigma_epsilon))
}
#---------------------------------------------------------------
#' Semiparametric Bayesian Gaussian processes
#'
#' Monte Carlo sampling for Bayesian Gaussian process regression with an
#' unknown (nonparametric) transformation.
#'
#' @param y \code{n x 1} response vector
#' @param locs \code{n x d} matrix of locations
#' @param X \code{n x p} design matrix; if unspecified, use intercept only
#' @param covfun_name string name of a covariance function; see ?GpGp
#' @param locs_test \code{n_test x d} matrix of locations
#' at which predictions are needed; default is \code{locs}
#' @param X_test \code{n_test x p} design matrix for test data;
#' default is \code{X}
#' @param nn number of nearest neighbors to use; default is 30
#' (larger values improve the approximation but increase computing cost)
#' @param emp_bayes logical; if TRUE, use a (faster!) empirical Bayes
#' approach for estimating the mean function
#' @param approx_g logical; if TRUE, apply large-sample
#' approximation for the transformation
#' @param nsave number of Monte Carlo simulations
#' @param ngrid number of grid points for inverse approximations
#' @return a list with the following elements:
#' \itemize{
#' \item \code{coefficients} the estimated regression coefficients
#' \item \code{fitted.values} the posterior predictive mean at the test points \code{locs_test}
#' \item \code{fit_gp} the fitted \code{GpGp_fit} object, which includes
#' covariance parameter estimates and other model information
#' \item \code{post_ypred}: \code{nsave x ntest} samples
#' from the posterior predictive distribution at \code{locs_test}
#' \item \code{post_g}: \code{nsave} posterior samples of the transformation
#' evaluated at the unique \code{y} values
#' \item \code{model}: the model fit (here, \code{sbgp})
#' }
#' as well as the arguments passed in.
#'
#' @details This function provides Bayesian inference for a
#' transformed Gaussian process model using Monte Carlo (not MCMC) sampling.
#' The transformation is modeled as unknown and learned jointly
#' with the regression function (unless \code{approx_g} = TRUE, which then uses
#' a point approximation). This model applies for real-valued data, positive data, and
#' compactly-supported data (the support is automatically deduced from the observed \code{y} values).
#' The results are typically unchanged whether \code{laplace_approx} is TRUE/FALSE;
#' setting it to TRUE may reduce sensitivity to the prior, while setting it to FALSE
#' may speed up computations for very large datasets. For computational efficiency,
#' the Gaussian process parameters are fixed at point estimates, and the latent Gaussian
#' process is only sampled when \code{emp_bayes} = FALSE. However, the uncertainty
#' from this term is often negligible compared to the observation errors, and the
#' transformation serves as an additional layer of robustness.
#'
#' @examples
#' \donttest{
#' # Simulate some data:
#' n = 200 # sample size
#' x = seq(0, 1, length = n) # observation points
#'
#' # Transform a noisy, periodic function:
#' y = g_inv_bc(
#'   sin(2*pi*x) + sin(4*pi*x) + rnorm(n, sd = .5),
#'              lambda = .5) # Signed square-root transformation
#'
#' # Fit the semiparametric Bayesian Gaussian process:
#' fit = sbgp(y = y, locs = x)
#' names(fit) # what is returned
#' coef(fit) # estimated regression coefficients (here, just an intercept)
#' class(fit$fit_gp) # the GpGp object is also returned
#'
#' # Plot the model predictions (point and interval estimates):
#' pi_y = t(apply(fit$post_ypred, 2, quantile, c(0.05, .95))) # 90% PI
#' plot(x, y, type='n', ylim = range(pi_y,y),
#'      xlab = 'x', ylab = 'y', main = paste('Fitted values and prediction intervals'))
#' polygon(c(x, rev(x)),c(pi_y[,2], rev(pi_y[,1])),col='gray', border=NA)
#' lines(x, y, type='p')
#' lines(x, fitted(fit), lwd = 3)
#' }
#' @import GpGp fields
#' @export
sbgp = function(y, locs,
                X = NULL,
                covfun_name = "matern_isotropic",
                locs_test = locs,
                X_test = NULL,
                nn = 30,
                emp_bayes = TRUE,
                approx_g = FALSE,
                nsave = 1000,
                ngrid = 100){

  # library(GpGp) # see if this fixes it...
  # For testing:
  # X = matrix(1, nrow = length(y)); covfun_name = "matern_isotropic"; locs_test = locs; X_test = X; nn = 30; emp_bayes = TRUE; approx_g = FALSE; nsave = 1000; ngrid = 100

  # Data dimensions:
  y = as.matrix(y); n = length(y);
  locs = as.matrix(locs); d = ncol(locs)

  # Testing data:
  locs_test = as.matrix(locs_test); n_test = nrow(locs_test)

  # Covariates:
  if(is.null(X)) X = matrix(1, nrow = n)
  if(is.null(X_test)){ # supply our own testing matrix
    if(isTRUE(all.equal(locs, locs_test))){
      # If the training and testing points are the same,
      # then we input the same design matrix for the testing:
      X_test = X
    } else {
      # Otherwise, use an intercept-only with the correct dimensions:
      X_test = matrix(1, n_test)
    }
  }

  # And check:
  X = as.matrix(X); p = ncol(X)
  X_test = as.matrix(X_test)

  # Some checks needed for locs, locs_test, X, X_test
  if(nrow(locs) != n || nrow(X) != n || nrow(X_test) != n_test ||
     ncol(X_test) != p || ncol(locs_test) != d){
    stop('Check input dimensions!')
  }

  # To avoid errors for small n:
  nn = min(nn, n-1)

  # This is a temporary hack needed for sampling w/ one-dimensional inputs:
  if(!emp_bayes && d==1){
    aug = 1e-6*rnorm(n)
    locs = cbind(locs, aug)
    locs_test = cbind(locs_test, aug)
  }

  # Define the CDF of y:
  Fy = function(t) n/(n+1)*ecdf(y)(t)
  #----------------------------------------------------------------------------
  print('Initial GP fit...')
  # Initial GP fit:
  z = qnorm(Fy(y))
  fit_gp = GpGp::fit_model(y = z,
                           locs = locs,
                           X = X,
                           covfun_name = covfun_name,
                           m_seq = nn,
                           silent = TRUE)

  # Fitted values for observed data:
  mu_z = GpGp::predictions(fit = fit_gp,
                            locs_pred = locs,
                            X_pred  = X)
  # SD of latent term:
  # Nugget variance (NOTE: this works for most (but not all!) covariance functions in GpGp!)
  sigma_epsilon = sqrt(fit_gp$covparms[1]*fit_gp$covparms[length(fit_gp$covparms)])
  if(n < 1000){

    # Compute the covariance matrix, but remove the nugget:
    K_theta = do.call(covfun_name, list(fit_gp$covparms, locs ))
    #K_theta = match.fun(covfun_name)(fit_gp$covparms, locs)
    diag(K_theta) = diag(K_theta) - sigma_epsilon^2

    # Posterior covariance for mu requires inverses:
    K_theta_inv = chol2inv(chol(K_theta))
    Sigma_mu = chol2inv(chol(K_theta_inv + diag(1/sigma_epsilon^2, n)))

    # Extract the diagonal elements for z:
    sigma_z = sqrt(sigma_epsilon^2 + diag(Sigma_mu))

  } else {

    # Ignore the uncertainty in the regression function,
    # which is likely small when n is very large (also much faster...)
    sigma_z = rep(sigma_epsilon, n)

  }
  #----------------------------------------------------------------------------
  # Initialize the transformation:

  # Evaluate CDF of Y at the unique y-values:
  y0 = sort(unique(y)); Fy_eval = Fy(y0)

  # Grid of values for the CDF of z:
  z_grid = sort(unique(
    sapply(range(mu_z), function(xtemp){
      qnorm(seq(0.01, 0.99, length.out = ngrid),
            mean = xtemp,
            sd = sigma_epsilon)
    })
  ))

  # Evaluate the CDF of z:
  Fz_eval = Fz_fun(z = z_grid,
                   weights = rep(1/n, n),
                   mean_vec = mu_z,
                   sd_vec = sigma_z)

  # Compute the transformation:
  g = g_fun(y = y0, Fy_eval = Fy_eval,
            z = z_grid, Fz_eval = Fz_eval)

  # Latent data:
  z = g(y)

  # Define the grid for approximations using equally-spaced + quantile points:
  y_grid = sort(unique(c(
    seq(min(y), max(y), length.out = ngrid/2),
    quantile(y0, seq(0, 1, length.out = ngrid/2)))))

  # Inverse transformation function:
  g_inv = g_inv_approx(g = g, t_grid = y_grid)
  #----------------------------------------------------------------------------
  print('Updated GP fit...')
  # Now update the GP coefficients:
  fit_gp = GpGp::fit_model(y = z,
                           locs = locs,
                           X = X,
                           covfun_name = covfun_name,
                           start_parms = fit_gp$covparms,
                           m_seq = nn,
                           silent = TRUE)

  # Fitted values for observed data:
  mu_z = GpGp::predictions(fit = fit_gp,
                           locs_pred = locs,
                           X_pred  = X,
                           m = nn)

  # Fitted values for testing data:
  if(isTRUE(all.equal(X, X_test)) &&
       isTRUE(all.equal(locs, locs_test))){
    # If the testing and training data are identical,
    # then there is no need to apply a separate predict function
    z_test = mu_z
  } else {
    z_test = GpGp::predictions(fit = fit_gp,
                               locs_pred = locs_test,
                               X_pred  = X_test,
                               m = nn)
  }

  # SD of latent term:
  # Nugget variance (NOTE: this works for most (but not all!) covariance functions in GpGp!)
  sigma_epsilon = sqrt(fit_gp$covparms[1]*fit_gp$covparms[length(fit_gp$covparms)])
  if(n < 1000){

    # Compute the covariance matrix, but remove the nugget:
    K_theta = do.call(covfun_name, list(fit_gp$covparms, locs ))
    #K_theta = match.fun(covfun_name)(fit_gp$covparms, locs)
    diag(K_theta) = diag(K_theta) - sigma_epsilon^2

    # Posterior covariance for mu requires inverses:
    K_theta_inv = chol2inv(chol(K_theta))
    Sigma_mu = chol2inv(chol(K_theta_inv + diag(1/sigma_epsilon^2, n)))

    # Extract the diagonal elements for z:
    sigma_z = sqrt(sigma_epsilon^2 + diag(Sigma_mu))

  } else {
    # Ignore the uncertainty in the regression function,
    # which is likely small when n is very large (this is also faster...)
    sigma_z = rep(sigma_epsilon, n)
  }

  # Estimated coefficients:
  theta = fit_gp$betahat
  #----------------------------------------------------------------------------
  # Store MC output:
  post_ypred = array(NA, c(nsave, n_test))
  post_g = array(NA, c(nsave, length(y0)))

  print('Sampling...')

  # Run the MC:
  for(nsi in 1:nsave){
    #----------------------------------------------------------------------------
    # Sample the transformation
    if(!approx_g){

      # Bayesian bootstrap for the CDFs

      # Dirichlet(1) weights for y:
      weights_y = rgamma(n = n, shape = 1)
      weights_y  = weights_y/sum(weights_y)

      # Dirichlet(1) weights for x:
      weights_x = rgamma(n = n, shape = 1)
      weights_x  = weights_x/sum(weights_x)

      # BB CDF of y:
      Fy_eval = sapply(y0, function(t)
        n/(n+1)*sum(weights_y[y <= t]))

      # BB CDF of z:
      Fz_eval = Fz_fun(z = z_grid,
                       weights = weights_x,
                       mean_vec = mu_z,
                       sd_vec = sigma_z)

      # Compute the transformation:
      g = g_fun(y = y0, Fy_eval = Fy_eval,
                z = z_grid, Fz_eval = Fz_eval)

      # Update z:
      z = g(y)

      # Update the inverse transformation function:
      g_inv = g_inv_approx(g = g, t_grid = y_grid)
    }
    #----------------------------------------------------------------------------
    # Store the MC:

    # Predictive samples of ytilde:
    if(emp_bayes){
      ztilde = z_test + sigma_epsilon*rnorm(n = n_test)
    } else {
      ztilde = cond_sim(fit = fit_gp,
                        locs_pred = locs_test,
                        X_pred = X_test,
                        m = nn)
    }
    post_ypred[nsi,] = g_inv(ztilde)

    # Posterior samples of the transformation:
    post_g[nsi,] = g(y0)
    #----------------------------------------------------------------------------
  }
  print('Done!')

  return(list(
    coefficients = theta,
    fitted.values = colMeans(post_ypred),
    fit_gp = fit_gp,
    post_ypred = post_ypred,
    post_g = post_g,
    model = 'sbgp', y = y, X = X, approx_g = approx_g, sigma_epsilon = sigma_epsilon))
}
#' Semiparametric Bayesian quantile regression
#'
#' MCMC sampling for Bayesian quantile regression with an
#' unknown (nonparametric) transformation. Like in traditional Bayesian
#' quantile regression, an asymmetric Laplace distribution is assumed
#' for the errors, so the regression models targets the specified quantile.
#' However, these models are often woefully inadequate for describing
#' observed data. We introduce a nonparametric transformation to
#' improve model adequacy while still providing inference for the
#' regression coefficients and the specified quantile. A g-prior is assumed
#' for the regression coefficients.
#'
#' @param y \code{n x 1} response vector
#' @param X \code{n x p} matrix of predictors
#' @param tau the target quantile (between zero and one)
#' @param X_test \code{n_test x p} matrix of predictors for test data;
#' default is the observed covariates \code{X}
#' @param psi prior variance (g-prior)
#' @param laplace_approx logical; if TRUE, use a normal approximation
#' to the posterior in the definition of the transformation;
#' otherwise the prior is used
#' @param approx_g logical; if TRUE, apply large-sample
#' approximation for the transformation
#' @param nsave number of MCMC iterations to save
#' @param nburn number of MCMC iterations to discard
#' @param ngrid number of grid points for inverse approximations
#' @param verbose logical; if TRUE, print time remaining
#' @return a list with the following elements:
#' \itemize{
#' \item \code{coefficients} the posterior mean of the regression coefficients
#' \item \code{fitted.values} the estimated \code{tau}th quantile at test points \code{X_test}
#' \item \code{post_theta}: \code{nsave x p} samples from the posterior distribution
#' of the regression coefficients
#' \item \code{post_ypred}: \code{nsave x n_test} samples
#' from the posterior predictive distribution at test points \code{X_test}
#' \item \code{post_qtau}: \code{nsave x n_test} samples of the \code{tau}th conditional quantile at test points \code{X_test}
#' \item \code{post_g}: \code{nsave} posterior samples of the transformation
#' evaluated at the unique \code{y} values
#' \item \code{model}: the model fit (here, \code{sbqr})
#' }
#' as well as the arguments passed in.
#'
#' @details This function provides fully Bayesian inference for a
#' transformed quantile linear model.
#' The transformation is modeled as unknown and learned jointly
#' with the regression coefficients (unless \code{approx_g} = TRUE, which then uses
#' a point approximation). This model applies for real-valued data, positive data, and
#' compactly-supported data (the support is automatically deduced from the observed \code{y} values).
#' The results are typically unchanged whether \code{laplace_approx} is TRUE/FALSE;
#' setting it to TRUE may reduce sensitivity to the prior, while setting it to FALSE
#' may speed up computations for very large datasets.
#'
#' @examples
#' \donttest{
#' # Simulate some heteroskedastic data (no transformation):
#' dat = simulate_tlm(n = 200, p = 10, g_type = 'box-cox', heterosked = TRUE, lambda = 1)
#' y = dat$y; X = dat$X # training data
#' y_test = dat$y_test; X_test = dat$X_test # testing data
#'
#' # Target this quantile:
#' tau = 0.05
#'
#' # Fit the semiparametric Bayesian quantile regression model:
#' fit = sbqr(y = y, X = X, tau = tau, X_test = X_test)
#' names(fit) # what is returned
#'
#' # Posterior predictive checks on testing data: empirical CDF
#' y0 = sort(unique(y_test))
#' plot(y0, y0, type='n', ylim = c(0,1),
#'      xlab='y', ylab='F_y', main = 'Posterior predictive ECDF')
#' temp = sapply(1:nrow(fit$post_ypred), function(s)
#'   lines(y0, ecdf(fit$post_ypred[s,])(y0), # ECDF of posterior predictive draws
#'         col='gray', type ='s'))
#' lines(y0, ecdf(y_test)(y0),  # ECDF of testing data
#'      col='black', type = 's', lwd = 3)
#' }
#'
#' @importFrom quantreg rq
#' @importFrom statmod rinvgauss
#' @export
sbqr = function(y, X, tau = 0.5,
                X_test = X,
                psi = length(y),
                laplace_approx = TRUE,
                approx_g = FALSE,
                nsave = 1000,
                nburn = 100,
                ngrid = 100,
                verbose = TRUE){

  # For testing:
  # psi = length(y); laplace_approx = TRUE; approx_g = FALSE; nsave = 1000; nburn = 100; verbose = TRUE; ngrid = 100

  # Data dimensions:
  n = length(y); p = ncol(X)

  # Testing data points:
  if(!is.matrix(X_test)) X_test = matrix(X_test, nrow  = 1)
  n_test = nrow(X_test)

  # And some checks on columns:
  if(p >= n) stop('The g-prior requires p < n')
  if(p != ncol(X_test)) stop('X_test and X must have the same number of columns')

  # Recurring terms:
  a_tau = (1-2*tau)/(tau*(1-tau))
  b_tau = sqrt(2/(tau*(1-tau)))

  # Key matrix quantities:
  XtX = crossprod(X)
  XtXinv = chol2inv(chol(XtX))
  xt_Sigma_x = sapply(1:n, function(i)
    crossprod(X[i,], XtXinv)%*%X[i,])
  #----------------------------------------------------------------------------
  # Initialize the transformation:

  # Define the CDF of y:
  Fy = function(t) n/(n+1)*ecdf(y)(t)

  # Evaluate at the unique y-values:
  y0 = sort(unique(y))
  Fy_eval = Fy(y0)

  # Grid of values for the CDF of z:
  z_grid = sort(unique(
    sapply(range(psi*xt_Sigma_x), function(xtemp){
      qnorm(seq(0.001, 0.999, length.out = ngrid),
            mean = 0 + a_tau, # assuming prior mean zero
            sd = sqrt(b_tau^2 + xtemp))
    })
  ))

  # Define the moments of the CDF of z (BEFORE parameter expansions)
  if(laplace_approx){
    # Use a normal approximation for the posterior of theta

    # First pass: fix Fz() = qnorm(), initialize coefficients
    z = qnorm(Fy(y))
    fit  = rq(z ~ X - 1, tau = tau)
    Sigma_hat = summary(fit, se = 'boot', covariance = TRUE)$cov
    xt_Sigma_hat_x = sapply(1:n, function(i)
      crossprod(X[i,], Sigma_hat)%*%X[i,])

    # Moments of X%*%theta:
    mu_z = fitted(fit)
    sigma_z = sqrt(xt_Sigma_hat_x)

    # CDF of z, using Monte Carlo:
    Fz_eval = rowMeans(sapply(rexp(n = 100), function(xi_s){
      Fz_fun(z = z_grid,
             weights = rep(1/n, n),
             mean_vec = mu_z + a_tau*xi_s,
             sd_vec = sqrt(sigma_z^2 + xi_s*b_tau^2))
    }))

    # Check: update the grid if needed
    zcon = contract_grid(z = z_grid,
                         Fz = Fz_eval,
                         lower = 0.001, upper =  0.999)
    z_grid = zcon$z; Fz_eval = zcon$Fz

    # Transformation:
    g = g_fun(y = y0,
              Fy_eval = Fy_eval,
              z = z_grid,
              Fz_eval = Fz_eval)

    # Updated coefficients:
    z = g(y) # update latent data
    fit  = rq(z ~ X - 1, tau = tau)
    Sigma_hat = summary(fit, se = 'boot', covariance = TRUE)$cov
    xt_Sigma_hat_x = sapply(1:n, function(i)
      crossprod(X[i,], Sigma_hat)%*%X[i,])

    # Moments of X%*%theta:
    mu_z = fitted(fit)
    sigma_z = sqrt(xt_Sigma_hat_x)

  } else {

    # Using the prior
    mu_z = rep(0, n)
    sigma_z = sqrt(psi*xt_Sigma_x)
  }

  # CDF of z, using Monte Carlo:
  Fz_eval = rowMeans(sapply(rexp(n = 100), function(xi_s){
    Fz_fun(z = z_grid,
           weights = rep(1/n, n),
           mean_vec = mu_z + a_tau*xi_s,
           sd_vec = sqrt(sigma_z^2 + xi_s*b_tau^2))
  }))

  # Check: update the grid if needed
  zcon = contract_grid(z = z_grid,
                       Fz = Fz_eval,
                       lower = 0.001, upper =  0.999)
  z_grid = zcon$z; Fz_eval = zcon$Fz

  # Compute the transformation:
  g = g_fun(y = y0, Fy_eval = Fy_eval,
            z = z_grid, Fz_eval = Fz_eval)

  # Latent data:
  z = g(y)

  # Define the grid for approximations using equally-spaced + quantile points:
  y_grid = sort(unique(c(
    seq(min(y), max(y), length.out = ngrid/2),
    quantile(y0, seq(0, 1, length.out = ngrid/2)))))

  # Inverse transformation function:
  g_inv = g_inv_approx(g = g, t_grid = y_grid)
  #----------------------------------------------------------------------------
  # Initialize the parameters:

  # Coefficients:
  theta = chol2inv(chol(XtX))%*%crossprod(X,z)
  #----------------------------------------------------------------------------
  # Store MC output:
  post_theta = array(NA, c(nsave, p))
  post_ypred = post_qtau = array(NA, c(nsave, n_test))
  post_g = array(NA, c(nsave, length(y0)))

  # Run the MCMC:
  if(verbose) timer0 = proc.time()[3] # For timing the sampler
  for(nsi in 1:(nburn + nsave)){

    #----------------------------------------------------------------------------
    # Block 1: sample the transformation
    if(!approx_g){

      # Bayesian bootstrap for the CDFs

      # Dirichlet(1) weights for y:
      weights_y = rgamma(n = n, shape = 1)
      weights_y  = weights_y/sum(weights_y)

      # Dirichlet(1) weights for x:
      weights_x = rgamma(n = n, shape = 1)
      weights_x  = weights_x/sum(weights_x)

      # BB CDF of y:
      Fy_eval = sapply(y0, function(t)
        n/(n+1)*sum(weights_y[y <= t]))

      # BB CDF of z:
      # (MC averages are good even for few replicates!)
      Fz_eval = rowMeans(sapply(rexp(n = 10), function(xi_s){
        Fz_fun(z = z_grid,
               weights = weights_x,
               mean_vec = mu_z + a_tau*xi_s,
               sd_vec = sqrt(sigma_z^2 + xi_s*b_tau^2))
      }))

      # Compute the transformation:
      g = g_fun(y = y0, Fy_eval = Fy_eval,
                z = z_grid, Fz_eval = Fz_eval)

      # Update z:
      z = g(y)

      # Update the inverse transformation function:
      g_inv = g_inv_approx(g = g, t_grid = y_grid)
    }
    #----------------------------------------------------------------------------
    # Block 2: parameter expansion
    xi = 1/rinvgauss(n = n,
                     mean = sqrt((2 + a_tau^2/b_tau^2)/((z - X%*%theta)^2/b_tau^2)),
                     shape = 2 + a_tau^2/b_tau^2)
    #----------------------------------------------------------------------------
    # Block 3: sample the regression coefficients
    Q_theta = crossprod(X/sqrt(b_tau^2*xi)) + 1/psi*XtX # t(X)%*%diag(1/(b_tau^2*xi))%*%X + 1/psi*XtX
    ell_theta = crossprod(X/(b_tau^2*xi), z - a_tau*xi) # t(X)%*%diag(1/(b_tau^2*xi))%*%(z - a_tau*xi)
    ch_Q = chol(Q_theta)
    theta = backsolve(ch_Q,
                      forwardsolve(t(ch_Q), ell_theta) +
                        rnorm(p))
    #----------------------------------------------------------------------------
    # Store the MCMC:
    if(nsi > nburn){
      # Posterior samples of the model parameters:
      post_theta[nsi - nburn,] = theta

      # Quantile at the testing point:
      post_qtau[nsi - nburn,] = g_inv(X_test%*%theta)

      # Predictive samples of ytilde:
      xi_test = rexp(n = n_test, rate = 1)
      ztilde = X_test%*%theta + a_tau*xi_test + b_tau*sqrt(xi_test)*rnorm(n = n_test)
      post_ypred[nsi - nburn,] = g_inv(ztilde)

      # Posterior samples of the transformation:
      post_g[nsi - nburn,] = g(y0)
    }
    #----------------------------------------------------------------------------
    if(verbose) computeTimeRemaining(nsi, timer0, nsave, nrep = ceiling(nsave/3))
  }
  if(verbose) print(paste('Total time: ', round((proc.time()[3] - timer0)), 'seconds'))

  return(list(
    coefficients = colMeans(post_theta),
    fitted.values = colMeans(post_qtau),
    post_theta = post_theta,
    post_ypred = post_ypred,
    post_qtau = post_qtau,
    post_g = post_g,
    model = 'sbqr', y = y, X = X, X_test = X_test, psi = psi, approx_g = approx_g, tau = tau))
}
#---------------------------------------------------------------
#' Post-processing with importance sampling
#'
#' Given Monte Carlo draws from the surrogate posterior,
#' apply sampling importance reweighting (SIR) to
#' correct for the true model likelihood.
#'
#' @param fit a fitted model object that includes
#' \itemize{
#' \item \code{coefficients} the posterior mean of the regression coefficients
#' \item \code{post_theta}: \code{nsave x p} samples from the posterior distribution
#' of the regression coefficients
#' \item \code{post_ypred}: \code{nsave x n_test} samples
#' from the posterior predictive distribution at test points \code{X_test}
#' \item \code{post_g}: \code{nsave} posterior samples of the transformation
#' evaluated at the unique \code{y} values
#' \item \code{model}: the model fit (\code{sblm} or \code{sbsm})
#' }
#' @param sir_frac fraction of draws to sample for SIR
#' @param nsims_prior number of draws from the prior
#' @param verbose logical; if TRUE, print time remaining
#' @return the fitted model object with the posterior draws subsampled
#' based on the SIR adjustment
#'
#' @details The Monte Carlo sampling for \code{\link{sblm}} and
#' \code{\link{sbsm}} uses a surrogate likelihood for posterior inference,
#' which enables much faster and easier computing. SIR provides a correction for
#' the actual (specified) likelihood. However, this correction
#' step is quite slow and typically does not produce any noticeable
#' discrepancies, even for small sample sizes.
#'
#' @note SIR sampling is done WITHOUT replacement, so \code{sir_frac}
#' is typically between 0.1 and 0.5. The \code{nsims_priors} draws
#' are used to approximate a prior expectation, but larger values
#' can significantly slow down this function.
#'
#' @examples
#' \donttest{
#' # Simulate some data:
#' dat = simulate_tlm(n = 50, p = 5, g_type = 'step')
#' y = dat$y; X = dat$X # training data
#' y_test = dat$y_test; X_test = dat$X_test # testing data
#'
#' hist(y, breaks = 10) # marginal distribution
#'
#' # Fit the semiparametric Bayesian linear model:
#' fit = sblm(y = y, X = X, X_test = X_test)
#' names(fit) # what is returned
#'
#' # Update with SIR:
#' fit_sir = sir_adjust(fit)
#'
#' # Prediction: unadjusted vs. adjusted?
#'
#' # Point estimates:
#' y_hat = fitted(fit)
#' y_hat_sir = fitted(fit_sir)
#' cor(y_hat, y_hat_sir) # similar
#'
#' # Interval estimates:
#' pi_y = t(apply(fit$post_ypred, 2, quantile, c(0.05, .95))) # 90% PI
#' pi_y_sir = t(apply(fit_sir$post_ypred, 2, quantile, c(0.05, .95))) # 90% PI
#'
#' # PI overlap (%):
#' overlaps = 100*sapply(1:length(y_test), function(i){
#'   # innermost part
#'   (min(pi_y[i,2], pi_y_sir[i,2]) - max(pi_y[i,1], pi_y_sir[i,1]))/
#'     # outermost part
#'     (max(pi_y[i,2], pi_y_sir[i,2]) - min(pi_y[i,1], pi_y_sir[i,1]))
#' })
#' summary(overlaps) # mostly close to 100%
#'
#' # Coverage of PIs on testing data (should be ~ 90%)
#' mean((pi_y[,1] <= y_test)*(pi_y[,2] >= y_test)) # unadjusted
#' mean((pi_y_sir[,1] <= y_test)*(pi_y_sir[,2] >= y_test)) # adjusted
#'
#' # Plot together with testing data:
#' plot(y_test, y_test, type='n', ylim = range(pi_y, pi_y_sir, y_test),
#'      xlab = 'y_test', ylab = 'y_hat', main = paste('Prediction intervals: testing data'))
#' abline(0,1) # reference line
#' suppressWarnings(
#'   arrows(y_test, pi_y[,1], y_test, pi_y[,2],
#'          length=0.15, angle=90, code=3, col='gray', lwd=2)
#' ) # plot the PIs (unadjusted)
#' suppressWarnings(
#'   arrows(y_test, pi_y_sir[,1], y_test, pi_y_sir[,2],
#'          length=0.15, angle=90, code=3, col='darkgray', lwd=2)
#' ) # plot the PIs (adjusted)
#' lines(y_test, y_hat, type='p', pch=2) # plot the means (unadjusted)
#' lines(y_test, y_hat_sir, type='p', pch=3) # plot the means (adjusted)
#' }
#' @importFrom MASS mvrnorm
#' @export
sir_adjust = function(fit,
                      sir_frac = 0.3,
                      nsims_prior = 100,
                      verbose = TRUE){
  # Checks:
  if(fit$model == 'sbgp'){
    warning('sbgp uses an empirical bayes approximation for the parameters,
    so SIR sampling is not needed; returning the original fit...')
    return(fit)
  }
  if(fit$model != 'sblm' &&
     fit$model != 'sbsm')
    stop('Currently implemented for sblm and sbsm only')

  if(sir_frac <= 0 || sir_frac >= 1){
    stop('sir_frac must be between zero and one')
  }

  # Extract some recurring terms:
  nsave = nrow(fit$post_ypred) # number of draws
  y = fit$y; n = length(y);
  X = fit$X; p = ncol(X)

  # Useful for matching:
  ind_y = match(y, sort(unique(y)))

  # Not identified, but fix at whatever value was used:
  sigma_epsilon = fit$sigma_epsilon

  # Sample from the prior:

  # Linear model case:
  if(fit$model == 'sblm'){

    # Matrix quantities (note: these *could* be passed in from fit)
    XtXinv = chol2inv(chol(crossprod(X)))
    xt_Sigma_x = sapply(1:n, function(i)
      crossprod(X[i,], XtXinv)%*%X[i,])

    # Sample:
    prior_theta = mvrnorm(n = nsims_prior,
                          mu = rep(0,p),
                          Sigma = sigma_epsilon^2*fit$psi*XtXinv)
  }

  # Spline case:
  if(fit$model == 'sbsm'){

    # Matrix quantities (note: these *could* be passed in from fit)
    xt_Sigma_x = rowSums(X^2)

    # Sample:
    prior_theta = matrix(rnorm(n = p*nsims_prior,
                               mean = 0,
                               sd = sigma_epsilon*sqrt(fit$psi)),
                         nrow = nsims_prior)
  }

  # Compute the log-importance weights:
  post_logw = rep(NA, nsave)
  if(verbose) timer0 = proc.time()[3] # For timing the sampler
  for(nsi in 1:nsave){

    # Dirichlet(1) weights for x:
    weights_x = rgamma(n = n, shape = 1)
    weights_x  = weights_x/sum(weights_x)

    # Extract g(y), properly matched:
    z = fit$post_g[nsi, ind_y]

    # linear model or spline:
    if(fit$model == 'sblm' ||
       fit$model == 'sbsm' ){

      # Log-numerator:
      log_numer = log(mean(sapply(1:nsims_prior, function(s){
        prod(sapply(1:n, function(i){
          sum(weights_x*dnorm(z[i],
                              mean = X%*%prior_theta[s,],
                              sd = sigma_epsilon))
        }))
      })))

      # Log-denominator
      log_denom = sum(log(
        sapply(1:n, function(i){
          sum(weights_x*dnorm(z[i],
                              mean = 0,
                              sd = sigma_epsilon*sqrt(1 + fit$psi*xt_Sigma_x)))
        })
      ))
    }

    # Store the log-weights:
    post_logw[nsi] = log_numer - log_denom

    # Check the timing:
    if(verbose) computeTimeRemaining(nsi, timer0, nsave, nrep = nsave/10)
  }
  if(verbose) print(paste('Total time: ', round((proc.time()[3] - timer0)), 'seconds'))

  # Indices to keep:
  ind_sir = sample(1:nsave,
                   size = ceiling(sir_frac*nsave),
                   prob = exp(post_logw),
                   replace = FALSE)
  # Update:
  fit$post_theta = fit$post_theta[ind_sir,];
  fit$coefficients = colMeans(fit$post_theta)
  fit$post_ypred = fit$post_ypred[ind_sir,]
  fit$post_g = fit$post_g[ind_sir,]

  return(fit)
}
#---------------------------------------------------------------
#' Compute the transformation
#'
#' Given the CDFs of z and y, compute a smoothed function
#' to evaluate the transformation
#'
#' @param y vector of points at which the CDF of y is evaluated
#' @param Fy_eval CDF of y evaluated at \code{y}
#' @param z vector of points at which the CDF of z is evaluated
#' @param Fz_eval CDF of z evaluated at \code{z}
#' @return A smooth monotone function which can be used for evaluations of the transformation.
#' @importFrom stats splinefun
g_fun = function(y, Fy_eval, z, Fz_eval){

  # Quick checks:
  if(length(y) != length(Fy_eval)){
    stop('length of y must equal length of Fy_eval')
  }
  if(length(z) != length(Fz_eval)){
    stop('length of z must equal length of Fz_eval')
  }

  # Remove duplicates:
  z_unique = which(!duplicated(Fz_eval))
  Fz_eval = Fz_eval[z_unique]; z = z[z_unique]
  y_unique = which(!duplicated(Fy_eval))
  Fy_eval = Fy_eval[y_unique]; y = y[y_unique]

  # Inverse the CDF of z:
  Fz_inv = function(s) stats::spline(Fz_eval, z, method = "hyman",
                                     xout = s)$y
  # Ref: https://stats.stackexchange.com/questions/390931/compute-quantile-function-from-a-mixture-of-normal-distribution/390936#390936

  # Check:
  # plot(z, Fz_inv(Fz_eval)); abline(0,1)

  # Initial transformation:
  g0 = Fz_inv(Fy_eval)

  # Make sure we have only finite values of g0 (infinite values occur for F_y = 0 or F_y = 1)
  y = y[which(is.finite(g0))]; g0 = g0[which(is.finite(g0))]

  # Return the smoothed (monotone) transformation:
  return(stats::splinefun(y, g0, method = 'monoH.FC'))
}
#----------------------------------------------------------------------------
#' Approximate inverse transformation
#'
#' Compute the inverse function of a transformation \code{g} based on a grid search.
#'
#' @param g the transformation function
#' @param t_grid grid of arguments at which to evaluate the transformation function
#' @return A function which can be used for evaluations of the
#' (approximate) inverse transformation function.
g_inv_approx = function(g, t_grid) {

  # Evaluate g() on the grid:
  g_grid = g(t_grid)

  # Approximate inverse function:
  function(s) {
    sapply(s, function(si)
      t_grid[which.min(abs(si - g_grid))])
  }
}
#---------------------------------------------------------------
#' Compute the latent data CDF
#'
#' Assuming a Gaussian latent data distribution (given x),
#' compute the CDF on a grid of points
#'
#' @param z vector of points at which the CDF of z is evaluated
#' @param weights \code{n}-dimensional vector of weights; if NULL,
#' assume 1/n
#' @param mean_vec \code{n}-dimensional vector of means; if NULL,
#' assume mean zero
#' @param sd_vec \code{n}-dimensional vector of standard deviations
#' @return CDF of z evaluated at \code{z}
Fz_fun = function(z,
                  weights = NULL,
                  mean_vec = NULL,
                  sd_vec){
  # Dimension:
  n = length(sd_vec)

  # Checks:
  if(is.null(weights)) weights = rep(1/n, n)
  if(is.null(mean_vec)) mean_vec = rep(0, n)

  # Return:
  rowSums(sapply(1:n, function(i){
    weights[i]*pnorm(z,
                mean = mean_vec[i],
                sd = sd_vec[i])
  }))
}
#' Grid contraction
#'
#' Contract the grid if the evaluation points exceed some threshold.
#' This removes the corresponding z values.
#' We can add points back to achieve the same (approximate) length.
#'
#' @param z grid points (ordered)
#' @param Fz function evaluated at those grid points
#' @param lower lower threshold at which to check Fz
#' @param upper upper threshold at which to check Fz
#' @param add_back logical; if true, expand the grid to
#' (about) the original size
#' @param monotone logical; if true, enforce monotonicity
#' on the expanded grid
#' @return a list containing the grid points \code{z} and the (interpolated) function
#' \code{Fz} at those points
contract_grid = function(z, Fz, lower, upper, add_back = TRUE, monotone = TRUE){

  # Identify the offending indexes:
  bad_ind = which(Fz < lower | Fz > upper)

  if(length(bad_ind) > 0){

    # Original:
    z0 = z

    # Original length
    ngrid = length(z)

    # Subset:
    z = z[-bad_ind]

    # Add back, if desired:
    if(add_back){
      z = sort(unique(c(z,
                        seq(min(z),
                            max(z),
                            length.out = ngrid - length(z) + 2))))
    }
    if(monotone){
      Fz = stats::spline(x = z0[-bad_ind], y = Fz[-bad_ind], method = 'hyman', xout = z)$y
    } else {
      Fz = stats::spline(x = z0[-bad_ind], y = Fz[-bad_ind], xout = z)$y
    }
  }
  list(z = z,
       Fz = Fz)
}