version 0.1.0

cran · Sep 7, 2023 · aa964d1 · aa964d1
commit aa964d1
Show file tree

Hide file tree

Showing 17 changed files with 1,785 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,26 @@
+Package: jointCalib
+Type: Package
+Title: A Joint Calibration of Totals and Quantiles
+Version: 0.1.0
+Authors@R: 
+  c(person(given = "Maciej",
+             family = "Beręsewicz",
+             role = c("aut", "cre"),
+             email = "maciej.beresewicz@ue.poznan.pl",
+             comment = c(ORCID = "0000-0002-8281-4301")))
+Description: A small package containing functions to perform a joint calibration of totals and quantiles. The calibration for totals is based on Deville and Särndal (1992) <doi:10.1080/01621459.1992.10475217>, the calibration for quantiles is based on Harms and Duchesne (2006) <https://www150.statcan.gc.ca/n1/en/catalogue/12-001-X20060019255>. The package uses standard calibration via the 'survey', 'sampling' or 'laeken' packages. In addition, entropy balancing via the 'ebal' package and empirical likelihood based on codes from Wu (2005) <https://www150.statcan.gc.ca/n1/pub/12-001-x/2005002/article/9051-eng.pdf> can be used. See the paper by Beręsewicz and Szymkowiak (2023) for details <arXiv:2308.13281>.
+License: GPL-3
+Encoding: UTF-8
+RdMacros: mathjaxr
+Depends: R (>= 3.5.0)
+URL: https://github.com/ncn-foreigners/jointCalib,
+        https://ncn-foreigners.github.io/jointCalib/
+BugReports: https://github.com/ncn-foreigners/jointCalib/issues
+RoxygenNote: 7.2.3
+Imports: laeken, sampling, mathjaxr, survey, MASS, ebal
+NeedsCompilation: no
+Packaged: 2023-09-06 11:40:07 UTC; berenz
+Author: Maciej Beręsewicz [aut, cre] (<https://orcid.org/0000-0002-8281-4301>)
+Maintainer: Maciej Beręsewicz <maciej.beresewicz@ue.poznan.pl>
+Repository: CRAN
+Date/Publication: 2023-09-07 08:30:06 UTC
diff --git a/MD5 b/MD5
@@ -0,0 +1,16 @@
+fbbe99afadea6d9109830a8920fb1968 *DESCRIPTION
+b197971c633ff14b3071fbf4d88145db *NAMESPACE
+549d963935e251710f36b5a72d3283f0 *R/calib_el.R
+81c4bd6b4fc0f032de105a8a5763c859 *R/controls.R
+627598dc89c32650bdd2488825976542 *R/joint_calib.R
+9242b8b778c5b48495aa337d77a2ff95 *R/joint_calib_create_matrix.R
+e7a2d4e989792d5cc42a4fcb8cc85914 *R/methods.R
+a7252e11d3b0670b014236e06a3f5ef7 *README.md
+99ef76ec5b9583fb2bb39fe77755dbf9 *build/jointCalib.pdf
+62ca2f2beebe8ef91134883c48a4316e *build/stage23.rdb
+2f673b77995fc83d38014a93d9b41e0c *inst/tinytest/test_create_matrix.R
+5e6abf714c16b892d9402fe761e42de8 *inst/tinytest/test_joint_calib.R
+6b7b01cf24dfc7b3b35d5dcdd8e278de *man/calib_el.Rd
+e71a8b79de9c07fba9c3cdbbafccd646 *man/control_calib.Rd
+713ccc40a6ef169421c3390a5f2d297c *man/joint_calib.Rd
+1d8fa83949efb8df2981ae1ee6bebdd9 *man/joint_calib_create_matrix.Rd
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,8 @@
+# Generated by roxygen2: do not edit by hand
+
+S3method(print,jointCalib)
+export(calib_el)
+export(control_calib)
+export(joint_calib)
+export(joint_calib_create_matrix)
+import(mathjaxr)
diff --git a/R/calib_el.R b/R/calib_el.R
@@ -0,0 +1,122 @@
+#' @import mathjaxr
+NULL
+#' @title An internal function for calibration of weights using empirical likelihood method
+#'
+#' @author Maciej Beręsewicz based on Wu (2005) and Zhang, Han and Wu (2022)
+#'
+#' \loadmathjax
+#' @description
+#' \code{calib_el} performs calibration using empirical likelihood (EL) method. The function is taken from Wu (2005), if algorithm has problem with convergence codes from Zhang, Han and Wu (2022) using \code{constrOptim} is used.
+#'
+#' In (pseudo) EL the following (pseudo) EL function is maximized
+#'
+#' \mjsdeqn{\sum_{i \in r} d_i\log(p_i),}
+#'
+#' under the following constraint
+#'
+#' \mjsdeqn{\sum_{i \in r} p_i = 1,}
+#'
+#' with constraints on quantiles (with notation as in Harms and Duchesne (2006))
+#'
+#' \mjsdeqn{\sum_{i \in r} p_i(a_{i} - \alpha/N) = 0,}
+#'
+#' where \mjseqn{a_{i}} is created using \code{joint_calib_create_matrix} function, and possibly means
+#'
+#' \mjsdeqn{\sum_{i \in r} p_i(x_{i} - \mu_{x}) = 0,}
+#'
+#' where \mjseqn{\mu_{x}} is known population mean of X. For simplicity of notation we assume only one quantile and one mean is known. This can be generalized to multiple quantiles and means.
+#'
+#' @param X matrix of variables for calibration of quantiles and totals (first column should be intercept),
+#' @param d initial d-weights for calibration (e.g. design-weights),
+#' @param totals vector of totals (where 1 element is the population size),
+#' @param maxit a numeric value giving the maximum number of iterations,
+#' @param tol the desired accuracy for the iterative procedure,
+#' @param eps the desired accuracy for computing the Moore-Penrose generalized inverse (see [MASS::ginv()]),
+#' @param ... arguments passed to [stats::optim] via [stats::constrOptim].
+#'
+#' @references
+#' Wu, C. (2005). Algorithms and R codes for the pseudo empirical likelihood method in survey sampling. Survey Methodology, 31(2), 239 (code is taken from \url{https://sas.uwaterloo.ca/~cbwu/Rcodes/LagrangeM2.txt}).
+#'
+#' Zhang, S., Han, P., and Wu, C. (2023) Calibration Techniques Encompassing Survey Sampling, Missing Data Analysis and Causal Inference. International Statistical Review, 91: 165–192. https://doi.org/10.1111/insr.12518 (code is taken from Supplementary Materials).
+#'
+#' @return Returns a vector of empirical likelihood g-weights
+#'
+#' @examples
+#' ## generate data based on Haziza and Lesage (2016)
+#' set.seed(123)
+#' N <- 1000
+#' x <- runif(N, 0, 80)
+#' y <- exp(-0.1 + 0.1*x) + rnorm(N, 0, 300)
+#' p <- rbinom(N, 1, prob = exp(-0.2 - 0.014*x))
+#' totals_known <- c(N=N, x=sum(x))
+#' df <- data.frame(x, y, p)
+#' df_resp <- df[df$p == 1, ]
+#' df_resp$d <- N/nrow(df_resp)
+#' res <- calib_el(X = model.matrix(~x, df_resp),
+#'                 d = df_resp$d,
+#'                 totals = totals_known)
+#' data.frame(known = totals_known, estimated=colSums(res*df_resp$d*model.matrix(~x, df_resp)))
+#'
+#' @export
+calib_el <- function(X, d, totals, maxit=50, tol=1e-8, eps = .Machine$double.eps, ...) {
+  n_col <- NCOL(X[, -1])
+  n_row <- NROW(X)
+  N <- totals[1]
+  mu <- totals[-1]/N
+  U <- X[, -1] - rep(1, n_row) %*% t(mu)
+  lambda <- numeric(n_col)
+  dif <- 1
+  k <- 0
+  while (dif > tol & k <= maxit) {
+    p <- (1 + U %*% lambda)
+    D1 <- t(U) %*% as.vector(d / p)
+    DD <- -t(U) %*% (as.vector(d / p^2) * U)
+    D2 <- try(MASS::ginv(DD, tol = eps) %*% D1)
+    if (class(D2)[1] == "try-error") break
+    dif <- max(abs(D2))
+    ################################################
+    ## here we use the same stoping rule as in sampling::calib
+    ## tr <- crossprod(X, d/p)
+    ## dif <- max(abs(tr - totals)/totals)
+    ################################################
+    rule <- 1
+    while (rule > 0) {
+      rule <- 0
+      if (min(1 + t(lambda - D2) %*% t(U)) <= 0)
+        rule <- rule + 1
+      if (rule > 0)
+        D2 <- D2 / 2
+    }
+    lambda <- lambda - D2
+
+    k <- k + 1
+  }
+  if (k >= maxit | class(D2)[1] == "try-error") {
+
+    message("Maximum iteration exceeded or MASS::ginv resulted in error. Changing to stats::constrOptim.")
+
+    ll_fun <- function(d, rho, g_hat) {
+      -sum(d*log(1 + g_hat %*% rho))
+    }
+    ll_grad <- function(d, rho, g_hat) {
+      -colSums(d*g_hat / c(1 + g_hat %*% rho))
+    }
+
+    rho_hat <-
+      stats::constrOptim(
+        theta = rep(0, n_col),
+        f = ll_fun,
+        grad = ll_grad,
+        method = "BFGS",
+        ui = U,
+        ci = rep(1 / n_row - 1, n_row),
+        g_hat = U,
+        d = d,
+        control = list(maxit = maxit, ...)
+      )
+    lambda <- rho_hat$par
+  }
+
+  gweight <- as.numeric(1/(1 + U %*% lambda))
+  return(gweight)
+}
diff --git a/R/controls.R b/R/controls.R
@@ -0,0 +1,29 @@
+#' @title control parameters
+#' @author Maciej Beręsewicz
+#'
+#' @description
+#' \code{control_calib} is function that contains control parameters for \code{joint_calib_create_matrix}
+#'
+#' @param interpolation type of interpolation: \code{logit} or \code{linear},
+#' @param logit_const constant for \code{logit} interpolation,
+#' @param survey_sparse whether to use sparse matrices via \code{Matrix} package in [survey::grake()] (currently not supported),
+#' @param ebal_constraint_tolerance This is the tolerance level used by ebalance to decide if the moments in the reweighted data are equal to the target moments (see [ebal::ebalance()]),
+#' @param ebal_print_level Controls the level of printing: 0 (normal printing), 2 (detailed), and 3 (very detailed) (see [ebal::ebalance()]).
+#'
+#' @return a list with parameters
+#'
+#' @export
+control_calib <- function(interpolation = c("logit", "linear"),
+                          logit_const = -1000,
+                          survey_sparse = FALSE,
+                          ebal_constraint_tolerance=1,
+                          ebal_print_level=0) {
+
+  if (missing(interpolation)) interpolation <- "logit"
+
+  return(list(interpolation = interpolation,
+              logit_const = logit_const,
+              survey_sparse = survey_sparse,
+              ebal_constraint_tolerance = ebal_constraint_tolerance,
+              ebal_print_level = ebal_print_level))
+}