diff --git a/DESCRIPTION b/DESCRIPTION index e69cede..674b8f4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,29 +2,24 @@ Package: gustave Type: Package Title: A User-Oriented Statistical Toolkit for Analytical Variance Estimation -Depends: R(>= 3.3.0) -Imports: Matrix, methods, stats, MASS -Suggests: testthat, sampling, vardpoor -Version: 0.3.0 +Depends: R(>= 3.2.5) +Imports: methods, utils, stats, Matrix +Suggests: testthat, sampling, magrittr, tibble, dplyr, data.table +Version: 0.4.0 Authors@R: person("Martin", "Chevalier", role = c("aut", "cre", "cph"), email = "martin.chevalier@insee.fr") Author: Martin Chevalier [aut, cre, cph] Maintainer: Martin Chevalier URL: https://github.com/martinchevalier/gustave BugReports: https://github.com/martinchevalier/gustave/issues -Description: - Provides a toolkit for analytical variance estimation in survey sampling. Apart from - the implementation of standard variance estimators, its main feature is to help the - sampling expert produce easy-to-use variance estimation "wrappers", where systematic - operations (linearization, domain estimation) are handled in a consistent and transparent - way for the end user. +Description: Provides a toolkit for analytical variance estimation in survey sampling. Apart from the implementation of standard variance estimators, its main feature is to help the sampling expert produce easy-to-use variance estimation "wrappers", where systematic operations (linearization, domain estimation) are handled in a consistent and transparent way. 
License: GPL-3 Collate: 'data.R' 'utils.R' 'define_variance_wrapper.R' - 'variance_function.R' 'define_linearization_wrapper.R' - 'linearization_wrapper_standard.R' + 'variance_function.R' 'define_statistic_wrapper.R' + 'standard_statistic_wrapper.R' 'qvar.R' Encoding: UTF-8 LazyData: true -RoxygenNote: 6.0.1 +RoxygenNote: 6.1.0 NeedsCompilation: no -Packaged: 2018-06-05 10:13:41 UTC; gc004y +Packaged: 2018-08-30 14:41:40 UTC; root Repository: CRAN -Date/Publication: 2018-06-06 10:57:54 UTC +Date/Publication: 2018-08-31 08:50:03 UTC diff --git a/MD5 b/MD5 index 92cc49e..e061a3e 100644 --- a/MD5 +++ b/MD5 @@ -1,31 +1,41 @@ -8f5c2260b6ccb083123baf75b8257fec *DESCRIPTION -91880814741925e785cae8f2c41e08b2 *NAMESPACE -51060c716262ae470bbbf42430a795fb *NEWS.md -ebb51fb5b0e3f9b27e10eef41b7aae47 *R/data.R -53633df58319bff6c34cd221e3950212 *R/define_linearization_wrapper.R -102424bb8ef6c38253a3157b99e3baf6 *R/define_variance_wrapper.R -a57e30f0051232e5bb29773062153416 *R/linearization_wrapper_standard.R -750d4e4f6a16bb1cd3a84a432c894f8a *R/utils.R -c96c290e003489983e7e4c02876fc183 *R/variance_function.R -4d8d99c908b5ac4039581f7fec3d2292 *README.md -0870fcf3d5172672081245da1c05b9c9 *data/ict_pop.RData -8a70ef048c7396290cb84a16302e13a4 *data/ict_sample.RData -49e1f4f2f2fdb31fa882fb621ea6bcdd *data/ict_survey.RData -2a9261d3e2c6b9a2366d014ea8b0c09f *man/add0.Rd -69ef4e74da05b417c7d53d9525f65287 *man/define_linearization_wrapper.Rd -39762731288e7ddde26e2e2016837f70 *man/define_variance_wrapper.Rd -288191ccf77d2678afdb4c8fbe22b03a *man/ict_pop.Rd -d5ba39cc41aa667aa24a4d1f04d3c351 *man/ict_sample.Rd -548f610bd1fc82fb9c557f709b8f5f40 *man/ict_survey.Rd -cf1700c6e2420367d45321b063c41296 *man/linearization_wrapper_standard.Rd -035009d9d873192075d924da70a52ac4 *man/rescal.Rd -006505dd3d3bf9ae2e17061e62819cee *man/sumby.Rd -74859be7486e481b83e3fa777228119c *man/varDT.Rd -bde6a9e3b7e419a98a86f379d0fa2f90 *man/varSYG.Rd -f5b5174836ed01a616bdcb59d427fe65 *man/var_pois.Rd 
-b64a36634ff32b6814849e4d3f104669 *tests/testthat.R -8fcc043daaecff41d7956b33733fcd28 *tests/testthat/test_data.R -c429c11490868e5918bbc12f30020ea5 *tests/testthat/test_define_variance_wrapper.R -96716fb61cb68d183d90356a35deda9c *tests/testthat/test_linearization_wrapper_standard.R -5f05ba731848289a23bd61436db98acb *tests/testthat/test_utils.R -23b39bbb1d0d25eefd76cad2c4249212 *tests/testthat/test_variance_function.R +e50d6c394adfd82285b190ea9fb3ad6e *DESCRIPTION +c9e9157b0cb66ec5ebc367b650c81190 *NAMESPACE +a4ee5937ba2cf7ac65026d5c2b92acc9 *NEWS.md +6a8f960eb5a01db80959beb76a096f38 *R/data.R +a74e31826d5d9065b413f35b63bd1ef9 *R/define_statistic_wrapper.R +9c4e35b8eb90377d2bc552a286791314 *R/define_variance_wrapper.R +802e1bcb5338e937df7ad2e466bef499 *R/qvar.R +28b80238301c28129de7783228cc8b4f *R/standard_statistic_wrapper.R +a024fa61acef37fb9cd8f6a64b14f2f6 *R/utils.R +b88c948cb8f803decaae2a2f105529e0 *R/variance_function.R +5b2a5139e9376b9e23c52590e77b126e *README.md +8c938e5f0102972669f4ef1ec452bcc5 *data/ict_pop.RData +f40387363e8e3a4a79446ce9ec1e381f *data/ict_sample.RData +b2ff22127820bcf56be7e808af58be8f *data/ict_survey.RData +48b98cc892d1cdf79971ebabeb3cb931 *data/lfs_samp_area.RData +bf1e868ad87514b21cb573a252b6a572 *data/lfs_samp_dwel.RData +c8e1be9f23795cc864502d4b89c5436b *data/lfs_samp_ind.RData +b0ac8fecb046a12de4139026cd1e0b16 *man/add_zero.Rd +70cc63036acb1a298bbc70cb3c22b7f4 *man/define_statistic_wrapper.Rd +26ab3ef96df904b6d2d449a563545734 *man/define_variance_wrapper.Rd +cee1c4c83cf4453018d5cfcdeb636235 *man/ict_pop.Rd +f5797bae17bfa8aa60b7581faf8fbced *man/ict_sample.Rd +0c0fc4cbd9f5dba08c4ddc0b700d0066 *man/ict_survey.Rd +a18753f69aeb67136ae9d96442ff1d76 *man/lfs_samp_area.Rd +84aa8214447918f1191a9c9ad514a7b4 *man/lfs_samp_dwel.Rd +2701efe68030805889ac08ffd36df495 *man/lfs_samp_ind.Rd +c2dbd6245a7ad53d7a414b3849325762 *man/qvar.Rd +847cec06c2e5ba8a06da7b3d758b622e *man/res_cal.Rd +539e4dd6b2d06bca77d5050e7ef91550 
*man/standard_statistic_wrapper.Rd +c6a83fc3ef716a586a19dd91fa9c10d1 *man/sum_by.Rd +0e2158131c7ef7576132363de79d5092 *man/varDT.Rd +3a7fad3be9e01082f3f2a0cd530acc11 *man/varSYG.Rd +77cf51f37b02b29982179544a006de94 *man/var_pois.Rd +267fa81060b9e8ccee84f8ef2a68a150 *tests/testthat.R +4ad05200db3f34399016f49a02659bd9 *tests/testthat/test_data.R +c456b64214b63c07af9fb326d75b19de *tests/testthat/test_define_statistic_wrapper.R +6b7b20cf33e4118270dc38dde0162d59 *tests/testthat/test_define_variance_wrapper.R +c4f2c0b27affc300dac2abd048e353bd *tests/testthat/test_qvar.R +a013785a23d1f9c8914db3d18e889f24 *tests/testthat/test_standard_statistic_wrapper.R +9d17c33baa22945661e77a1168bc4ccb *tests/testthat/test_utils.R +bd6e46af36af4fcffc4980a2fcd94d33 *tests/testthat/test_variance_function.R diff --git a/NAMESPACE b/NAMESPACE index d41d784..506d1ba 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,12 +1,13 @@ -# Generated by roxygen2: do not edit by hand - -export(add0) -export(define_linearization_wrapper) -export(define_variance_wrapper) -export(rescal) -export(sumby) -export(varDT) -export(varSYG) -export(var_pois) -export(var_srs) -import(Matrix) +# Generated by roxygen2: do not edit by hand + +export(add_zero) +export(define_statistic_wrapper) +export(define_variance_wrapper) +export(qvar) +export(res_cal) +export(sum_by) +export(varDT) +export(varSYG) +export(var_pois) +export(var_srs) +import(Matrix) diff --git a/NEWS.md b/NEWS.md index b7bd7a0..8091d73 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,42 +1,68 @@ - -# 0.3.0 - -- Simulated data added -- Significant increase of unit tests -- Documentation completed -- Simplification of the structure of the main object processed by the variance wrapper -- Removal of unnecessary arguments in linearization wrappers -- Removal of the linerization wrappers for the Laeken indicators based on the vardpoor package (better integration in a future release) -- Preparation for a first CRAN release - -# 0.2.7 - -- Now linearization with all data 
parameters set to NULL are discarded from the estimation. - -# 0.2.6 - -- Bug fix: evaluation of variables can occur either in the data argument or in the evaluation environment (envir argument) - -# 0.2.3-0.2.5 - -- Several attempts to output more metadata from linearization functions. -- At the end : ratio() gains two metadata slots, est_num and est_denom - -# 0.2.2 - -- Minor bug fixes - -# 0.2.1 - -- Beginning of the documentation -- Renaming of numerous functions and arguments -- Change the precalc structure in varDT -- Normalize the treatment of weights -- New linearization wrappers: gini() and arpr() - -# 0.1.3-0.1.7 - -- No more dependency to package pryr -- Add the generalized inverse in `varDT` -- Other bug fixes - +# 0.4.0 + +- Breaking: Heavy remanufacturing of `define_variance_wrapper` + + * New: `technical_data` argument offers a more consistent way to include technical data within the enclosing environment of the wrapper. `objects_to_include` is kept for non-data objects (such as additional statistic wrappers) or advanced customization. + * New: `technical_param` argument offers a more convenient way to specify default values for parameters used by the variance function. + * New: `reference_weight` replaces `default$weight`. This means that the reference weight used for point estimation and linearization is set while defining the variance wrapper and not at run-time. + * Deprecated: `stat`, which was a remain of an early implementation of linearization functions, is not a parameter of the variance wrappers anymore. Its purpose (to apply a given variance wrapper to several variables without having to type the name of the linearization wrapper) is now covered by the standard evaluation capabilities of statistic wrappers (see below). + * Deprecated: `default` is replaced by `default_id`, as `default$weight` and `default$stat` are no longer needed. 
As for `default$alpha`, its value is set to 0.05 and cannot be changed anymore while defining the variance wrapper (as this can easily be done afterwards using `formals<-`). + * Deprecated: `objects_to_include_from` + +- Breaking: Rebranding and heavy remanufacturing of `define_statistic_wrapper` (previously known as `define_linearization_wrapper`), added support for standard evaluation (see `define_variance_wrapper` examples). + +- New: the `qvar` function allows for a straigthforward variance estimation in common cases (stratified simple random sampling with non-response through reweighting and calibration) and performs both technical and methodological checks. + +- Some normalization in function names: `add0` becomes `add_zero`, `sumby` becomes `sum_by`, `rescal` becomes `res_cal` + +- Example data: calibration variables in ict_sample instead of ict_survey, new LFS example data + +- Significant increase of unit tests + + + +# 0.3.1 + +- Hotfix: Add calibrated weights to `define_variance_wrapper` example. + +# 0.3.0 + +- Simulated data added +- Significant increase of unit tests +- Documentation completed +- Simplification of the structure of the main object processed by the variance wrapper +- Removal of unnecessary arguments in linearization wrappers +- Removal of the linerization wrappers for the Laeken indicators based on the vardpoor package (better integration in a future release) +- Preparation for a first CRAN release + +# 0.2.7 + +- Now linearization with all data parameters set to NULL are discarded from the estimation. + +# 0.2.6 + +- Bug fix: evaluation of variables can occur either in the data argument or in the evaluation environment (envir argument) + +# 0.2.3-0.2.5 + +- Several attempts to output more metadata from linearization functions. 
+- At the end : ratio() gains two metadata slots, est_num and est_denom + +# 0.2.2 + +- Minor bug fixes + +# 0.2.1 + +- Beginning of the documentation +- Renaming of numerous functions and arguments +- Change the precalc structure in varDT +- Normalize the treatment of weights +- New linearization wrappers: gini() and arpr() + +# 0.1.3-0.1.7 + +- No more dependency to package pryr +- Add the generalized inverse in `varDT` +- Other bug fixes + diff --git a/R/data.R b/R/data.R index 22386ae..074603b 100644 --- a/R/data.R +++ b/R/data.R @@ -1,71 +1,134 @@ - - -#' Sampling frame of the Information and communication technologies (ICT) -#' survey in the information and communication sector (NACE rev 2 J section) -#' -#' A (simulated) dataset containing basic identification information and -#' auxiliary variables for the sampling of the Information and communication -#' technologies (ICT) survey in the information and communication sector -#' (NACE rev 2 J section). -#' -#' @format A data frame with 7670 observations and 4 variables: -#' \describe{ -#' \item{firm_id}{identifier of the firm} -#' \item{division}{identifier of the economic sub-sector} -#' \item{employees}{number of employees} -#' \item{turnover}{firm turnover, in thousand euros} -#' } -#' -#' @seealso \code{\link{ict_sample}} \code{\link{ict_survey}} - -"ict_pop" - - -#' Sample of the Information and communication technologies (ICT) -#' survey in the information and communication sector (NACE rev 2 J section) -#' -#' A (simulated) dataset containing sampling information about the sample -#' of the Information and communication technologies (ICT) -#' survey in the information and communication sector (NACE rev 2 J section) -#' -#' @format A data frame with 650 observations and 8 variables: -#' \describe{ -#' \item{firm_id}{identifier of the firm} -#' \item{division}{identifier of the economic sub-sector} -#' \item{employees}{number of employees} -#' \item{turnover}{firm turnover, in euros} -#' 
\item{w_sample}{sampling weight} -#' \item{resp}{boolean indicating whether the firm did respond to the survey or not} -#' \item{hrg}{homogeneous response group used for the unit non-response correction} -#' \item{response_prob_est}{response probability of the unit estimated using homogeneous response groups} -#' \item{w_nr}{weight after unit non-response correction} -#' } -#' -#' @seealso \code{\link{ict_pop}} \code{\link{ict_survey}} -"ict_sample" - -#' Survey data of the Information and communication technologies (ICT) -#' survey in the information and communication sector (NACE rev 2 J section) -#' -#' A (simulated) dataset containing calibration and survey variables of the respondents -#' to the Information and communication technologies (ICT) -#' survey in the information and communication sector (NACE rev 2 J section) -#' -#' @format A data frame with 612 observations and 25 variables: -#' \describe{ -#' \item{firm_id}{identifier of the firm} -#' \item{division}{identifier of the economic sub-sector} -#' \item{employees}{number of employees} -#' \item{turnover}{firm turnover, in euros} -#' \item{w_sample}{sampling weight} -#' \item{w_nr}{weight after unit non-response correction} -#' \item{N_58, N_59, N_60, N_61, N_62, N_63, turnover_58, turnover_59, turnover_60, turnover_61, turnover_62, turnover_63}{calibration variables (number of firms and turnover broken down by economic sub-sector)} -#' \item{w_calib}{calibrated weight} -#' \item{speed_quanti, speed_quanti_NA}{internet connection speed of the firm in Mbits, without or with missing values} -#' \item{speed_quali, speed_quali_NA}{internet connection speed of the firm recoded in classes, without or with missing values} -#' \item{big_data, big_data_NA}{use of big data analytics within the firm, without or with missing values} -#' } -#' -#' @seealso \code{\link{ict_pop}} \code{\link{ict_sample}} - -"ict_survey" + + +#' Sampling frame of the Information and communication technologies (ICT) +#' survey +#' +#' 
A (simulated) dataset containing basic identification information and +#' auxiliary variables for the sampling of the Information and communication +#' technologies (ICT) survey in the information and communication sector +#' (NACE rev 2 J section). +#' +#' @format A data frame with 7670 observations and 5 variables: +#' \describe{ +#' \item{firm_id}{identifier of the firm} +#' \item{division}{identifier of the economic sub-sector} +#' \item{employees}{number of employees} +#' \item{turnover}{firm turnover, in thousand euros} +#' \item{strata}{stratification variable} +#' } +#' +#' @seealso \code{\link{qvar}}, \code{\link{ict_sample}}, \code{\link{ict_survey}} + +"ict_pop" + + +#' Sample of the Information and communication technologies (ICT) +#' survey +#' +#' A (simulated) dataset containing sampling information about the sample +#' of the Information and communication technologies (ICT) +#' survey in the information and communication sector (NACE rev 2 J section) +#' +#' @format A data frame with 650 observations and 8 variables: +#' \describe{ +#' \item{firm_id}{identifier of the firm} +#' \item{division}{identifier of the economic sub-sector} +#' \item{employees}{number of employees} +#' \item{turnover}{firm turnover, in euros} +#' \item{strata}{stratification variable} +#' \item{w_sample}{sampling weight} +#' \item{scope}{boolean indicating whether the firm did belong to the scope of the survey or not} +#' \item{resp}{boolean indicating whether the firm did respond to the survey or not} +#' \item{nrc}{boolean indicating whether the firm did take part in the non-response correction process or not} +#' \item{hrg}{homogeneous response group used for the non-response correction} +#' \item{response_prob_est}{response probability of the unit estimated using homogeneous response groups} +#' \item{w_nrc}{weight after unit non-response correction} +#' \item{calib}{boolean indicating whether the firm was integrated in the calibration process or not (\code{TRUE} for all 
responding units)} +#' \item{N_58, N_59, N_60, N_61, N_62, N_63, turnover_58, turnover_59, turnover_60, turnover_61, turnover_62, turnover_63}{calibration variables (number of firms and turnover broken down by economic sub-sector)} +#' \item{w_calib}{calibrated weight} +#' \item{dissemination}{boolean indicating whether the unit appears in the dissemination file} +#' } +#' +#' @seealso \code{\link{qvar}}, \code{\link{ict_pop}}, \code{\link{ict_survey}} +"ict_sample" + +#' Survey data of the Information and communication technologies (ICT) +#' survey +#' +#' A (simulated) dataset containing calibration and survey variables of the respondents +#' to the Information and communication technologies (ICT) +#' survey in the information and communication sector (NACE rev 2 J section) +#' +#' @format A data frame with 612 observations and 11 variables: +#' \describe{ +#' \item{firm_id}{identifier of the firm} +#' \item{division}{identifier of the economic sub-sector} +#' \item{employees}{number of employees} +#' \item{turnover}{firm turnover, in euros} +#' \item{w_calib}{calibrated weight} +#' \item{speed_quanti, speed_quanti_NA}{internet connection speed of the firm in Mbps, without or with missing values} +#' \item{speed_quali, speed_quali_NA}{internet connection speed of the firm recoded in classes, without or with missing values} +#' \item{big_data, big_data_NA}{use of big data analytics within the firm, without or with missing values} +#' } +#' +#' @seealso \code{\link{qvar}}, \code{\link{ict_pop}}, \code{\link{ict_sample}} + +"ict_survey" + + + +#' Sample of areas in the Labour force survey +#' +#' A (simulated) dataset containing information about 4 geographical +#' areas (about 120 dwellings each) sampled for the labour force survey. 
+#' +#' @format A data frame with 4 observations and 3 variables: +#' \describe{ +#' \item{id_area}{identifier of the area} +#' \item{income}{total annual income of the area in thousand euros (from income registry)} +#' \item{pik_area}{first-order inclusion probability of the area (proportional to the number of dwellings in the area)} +#' } +#' +#' @seealso \code{\link{define_variance_wrapper}}, \code{\link{lfs_samp_dwel}}, \code{\link{lfs_samp_ind}} + +"lfs_samp_area" + +#' Sample of dwellings in the Labour force survey +#' +#' A (simulated) dataset containing information about 80 dwellings +#' sampled for the Labour force survey (in the 4 areas described +#' in \code{\link{lfs_samp_area}}). +#' +#' @format A data frame with 80 observations and 6 variables: +#' \describe{ +#' \item{id_dwel}{identifier of the dwelling} +#' \item{id_area}{identifier of the area} +#' \item{income}{total annual income of the dwelling in thousand euros (from income registry)} +#' \item{pik_area}{first-order inclusion probability of the area} +#' \item{pik_dwel}{first-order inclusion probability of the dwelling within the area (20 dwelling sampled per area)} +#' \item{pik}{first-order inclusion probability of the dwelling} +#' } +#' +#' @seealso \code{\link{define_variance_wrapper}}, \code{\link{lfs_samp_area}}, \code{\link{lfs_samp_ind}} + +"lfs_samp_dwel" + +#' Sample of individuals in the Labour force survey +#' +#' A (simulated) dataset containing information about 157 individuals +#' sampled for the Labour force survey (all members of the 80 dwellings +#' described in \code{\link{lfs_samp_dwel}}). It also contains the +#' unemployment status extracted from the survey file (no non-response). 
+#' +#' @format A data frame with 157 observations and 5 variables: +#' \describe{ +#' \item{id_ind}{identifier of the individual} +#' \item{id_dwel}{identifier of the dwelling} +#' \item{income}{total annual income of the individual in thousand euros (from income registry)} +#' \item{unemp}{unemployment status} +#' \item{sampling_weight}{sampling weight of the individual (inverse of the first-order inclusion probability of the dwelling)} +#' } +#' +#' @seealso \code{\link{define_variance_wrapper}}, \code{\link{lfs_samp_area}}, \code{\link{lfs_samp_dwel}} + +"lfs_samp_ind" \ No newline at end of file diff --git a/R/define_linearization_wrapper.R b/R/define_linearization_wrapper.R deleted file mode 100644 index 8a4401d..0000000 --- a/R/define_linearization_wrapper.R +++ /dev/null @@ -1,293 +0,0 @@ - -#' Define a linearization wrapper - -#' @description Given a linearization \emph{function} (specific to an -#' estimator), \code{define_linearization_wrapper} defines a -#' linearization \emph{wrapper} to be used together with -#' \code{\link[=define_variance_wrapper]{variance estimation wrappers}} -#' in order to make variance estimation easier. 
-#' This function is intended for \strong{advanced use only} (see Details), -#' standard linearization wrappers are included in the gustave package (see -#' \code{\link[=linearization_wrapper_standard]{standard linearization wrappers}}) -#' -#' @param linearization_function An R function with input the quantities -#' used in the linearization formula and with output a list with two -#' named element: \itemize{\item \code{lin}: a list of numerical vectors (most -#' of the time, only 1) which correspond to the value of the linearized -#' variable \item \code{metadata}: a list of metadata to be used by the display -#' function (see \code{display_function} argument), including (for the -#' standard display function) \code{est} for the point-estimate and -#' \code{n} for the number of observations used in the estimation.} -#' @param arg_type A named list with three character vectors describing -#' the type of each argument of \code{linearization_function}: \itemize{ -#' \item \code{data}: data argument(s), numerical vector(s) to be used in the -#' linearization formula \item \code{weight}: weight argument, numerical vector -#' to be used as row weights in the linearization formula \item \code{param}: -#' parameters, non-data arguments (most of the time boolean) to be used to -#' control some aspect of the linearization formula} -#' @param allow_factor A logical vector of length 1 (\code{FALSE} by default) -#' indicating whether factor variable are accepted as-is by the linearization -#' wrappers. This should be the case when the linearization function only has -#' one data argument (e.g. \code{total} or \code{mean} linearization formulae). -#' @param arg_not_affected_by_domain A character vector indicating the (data) -#' arguments which should not be affected by domain-splitting. 
Such parameters -#' may appear in some complex linearization formula, for instance when the -#' At-Risk of Poverty Rate (ARPR) is estimated by region but with a poverty -#' line calculated at the national level. -#' @param display_function An R function which produces, for each variance -#' estimation, the data.frame row to be displayed by the variance estimation -#' wrapper. It uses three arguments: -#' \itemize{\item \code{var} the estimated variance \item \code{metadata} the -#' metadata associated with the estimation, especially the one outputted by -#' \code{linearization_function} (e.g. \code{est}, \code{n}) \item \code{alpha} -#' the level for the construction of confidence intervals (at execution time, -#' its value is taken from the \code{alpha} argument of the variance wrapper.)} -#' The default display function (\code{standard_display}) uses standard metadata -#' to display usual variance indicator (variance, standard deviation, coefficient -#' of variation, confidence interval) broken down by linearization wrapper, domain -#' (if any) and level (if the variable is a factor, see argument \code{allow_factor}). -#' -#' @details When the estimator is not the estimator of a total, the application of -#' analytical variance estimation formulae developed for the estimator of a total -#' is not straightforward (Deville, 1999). An asymptotically unbiased variance -#' estimator can nonetheless be obtained if the estimation of variance is performed -#' on a variable obtained from the original data through a linearization step. -#' -#' \code{define_linearization_wrapper} is the function used to create, given -#' a linearization \emph{function} implementing a given linearization -#' \emph{formula}, a linearization \emph{wrapper} which can be used together -#' with a variance wrapper. -#' -#' Linearization wrappers are quite flexible tools -#' to apply a variance function to an estimator requiring a linearization step -#' (e.g. 
all estimators except the estimator of a total) with virtually no -#' additional complexity for the end-user. To some extent, linearization wrappers -#' can be seen as ggplot2 \code{geom_} and \code{stat_} functions: they help -#' the end-user in writing down what he or she wants without having to go too -#' deep into the details of the corresponding layers. -#' -#' \code{\link[=linearization_wrapper_standard]{standard linearization wrappers}} -#' are included within the gustave package and automatically added -#' to the variance estimation wrappers. New linearization wrappers can be defined -#' using the \code{define_linearization_wrapper} and then explicitly added to the -#' variance estimation wrappers using the \code{objects_to_include} argument. -#' -#' @return A function to be used within a variance estimation wrapper to perform -#' a specific linearization (see examples). Its formals are the ones of -#' \code{linearization_function} with the addition of \code{by} and \code{where} -#' (for domain estimation, set to \code{NULL} by default). -#' -#' @author Martin Chevalier -#' -#' @seealso \code{\link[=linearization_wrapper_standard]{standard linearization wrappers}} -#' \code{\link{define_variance_wrapper}} -#' -#' @references -#' Deville J.-C. (1999), "Variance estimation for complex statistics and -#' estimators: linearization and residual techniques", \emph{Survey Methodology}, -#' 25:193–203 -#' -#' @examples ### Example from the Information and communication technologies (ICT) survey -#' -#' # The subset of the (simulated) ICT survey has the following features: -#' # - stratified one-stage sampling design of 650 firms; -#' # - 612 responding firms, non-response correction through reweighting -#' # in homogeneous response groups based on economic sub-sector and turnover; -#' # - calibration on margins (number of firms and turnover broken down -#' # by economic sub-sector). 
-#' -#' # Step 1 : Dummy variance wrapper -#' # Note : see define_variance_wrapper() for a more -#' # realistic variance function and examples. -#' variance_wrapper <- define_variance_wrapper( -#' variance_function = function(y) abs(colSums(y)), -#' reference_id = ict_survey$firm_id, -#' default = list(id = "firm_id", weight = "w_calib") -#' ) -#' variance_wrapper(ict_survey, total(speed_quanti)) -#' -#' # Step 2 : Redefine the mean linearization wrapper -#' # The mean() linearization wrapper defined in the gustave -#' # package is bulit on top of the ratio() linearization wrapper. -#' variance_wrapper(ict_survey, mean(speed_quanti)) -#' -#' # Let's redefine it directly from the formula found for instance -#' # in (Caron, Deville, Sautory, 1998) and without handling NA -#' # values -#' mean2 <- define_linearization_wrapper( -#' linearization_function = function(y, weight){ -#' est <- sum(y * weight) / sum(weight) -#' lin <- (y - est) / sum(weight) -#' list( -#' lin = list(lin), -#' metadata = list(est = est, n = length(y)) -#' ) -#' }, -#' arg_type = list(data = "y", weight = "weight"), -#' allow_factor = TRUE -#' ) -#' variance_wrapper(ict_survey, mean(speed_quanti), mean2(speed_quanti)) -#' -#' @export - -define_linearization_wrapper <- function(linearization_function, - arg_type, - allow_factor = FALSE, - arg_not_affected_by_domain = NULL, - display_function = standard_display -){ - - # Step 0 : Control arguments consistency - inconsistent_arg <- list( - in_arg_type_not_in_linearization_function = setdiff(unlist(arg_type), names(formals(linearization_function))), - in_linearization_function_not_in_arg_type = setdiff(names(formals(linearization_function)), unlist(arg_type)), - in_arg_not_affected_by_domain_not_in_linearization_function = setdiff(arg_not_affected_by_domain, names(formals(linearization_function))) - ) - if(length(unlist(inconsistent_arg)) > 0) stop( - "Some arguments are inconsistent:", - if(length(inconsistent_arg[[1]]) > 0) paste("\n -", 
paste(inconsistent_arg[[1]], collapse = ", "), "in arg_type but not in linearization_function arguments") else "", - if(length(inconsistent_arg[[2]]) > 0) paste("\n -", paste(inconsistent_arg[[2]], collapse = ", "), "in linearization_function arguments but not in arg_type") else "", - if(length(inconsistent_arg[[3]]) > 0) paste("\n -", paste(inconsistent_arg[[3]], collapse = ", "), "in arg_not_affected_by_domain but not in linearization_function arguments") else "" - ) - if(is.null(arg_type$weight)) - stop("A weight argument must be provided in order to create a linearization wrapper.") - - # Step 1 : Create the linearization wrapper - linearization_wrapper <- function(by = NULL, where = NULL, ...){ - - # Step 1.1 : Capture and expand the call - call <- match.call(expand.dots = TRUE) - call_display_arg <- c(1, which(names(call) %in% setdiff(names(formals(sys.function())), "...") & !sapply(call, is.null))) - call_display <- paste(deparse(call[call_display_arg], width.cutoff = 500L), collapse = "") - call_list <- c(as.list(call)[-1], list( - allow_factor = allow_factor, arg_type = arg_type, - arg_not_affected_by_domain = arg_not_affected_by_domain, - call = call_display - )) - - # Step 1.2 : Proceeed to standard preparation - d <- do.call(standard_preparation, call_list) - if(is.null(d)) return(NULL) - - # Step 1.3 : Evaluate the linearization functions - d <- lapply(d, function(i){ - tmp <- do.call(linearization_function, with(i, c(data, weight, param))) - i$metadata <- c(i$metadata, tmp$metadata) - c(i, list( - linearization_function = linearization_function, - lin = tmp$lin, - display_function = display_function - )) - }) - - return(d) - } - - # Step 2 : Modify linearization_wrapper formals - formals(linearization_wrapper) <- c( - formals(linearization_function)[setdiff(names(formals(linearization_function)), arg_type$weight)], - formals(linearization_wrapper) - ) - - # Step 3 : Include objects in linearization_wrapper enclosing environment - e <- new.env(parent = 
globalenv()) - assign_all(objects = "standard_preparation", to = e, from = asNamespace("gustave")) - assign_all(objects = c("linearization_function", "arg_type", "allow_factor", "arg_not_affected_by_domain", "display_function"), to = e, from = environment()) - linearization_wrapper <- change_enclosing(linearization_wrapper, envir = e) - - structure(linearization_wrapper, class = c("function", "gustave_linearization_wrapper")) - -} - -standard_preparation <- function(..., - by = NULL, where = NULL, - data, label, evaluation_envir, execution_envir, - allow_factor, arg_type, arg_not_affected_by_domain, - call - ){ - - # Step 1 : Evaluation - eval_data <- eval(data, evaluation_envir) - expr <- eval(substitute(alist(...))) - d <- list(list( - data = lapply(expr[names(expr) %in% arg_type$data], eval, envir = eval_data, enclos = evaluation_envir) - , weight = lapply(expr[names(expr) %in% arg_type$weight], eval, envir = eval_data, enclos = evaluation_envir) - , param = if(any(names(expr) %in% arg_type$param)) expr[names(expr) %in% arg_type$param] else NULL - )) - if(all(sapply(d[[1]]$data, is.null))) return(NULL) - - # Step 2 : Handling factors and character variables in data - fac <- sapply(d[[1]]$data, function(i) is.character(i) || is.factor(i)) - if(sum(fac) > 0){ - if(allow_factor && length(d[[1]]$data) == 1){ - tmp <- d[[1]]$data[[1]] - if(is.character(tmp)) tmp <- as.factor(tmp) - tmp <- droplevels(tmp) - levels <- levels(tmp) - tmp2 <- stats::model.matrix(~ . -1, data = stats::model.frame(~ ., data = tmp)) - tmp3 <- matrix(NA, nrow = NROW(tmp), ncol = length(levels)) - tmp3[as.integer(rownames(tmp2)), ] <- tmp2 - d <- lapply(1:length(levels), function(i){ - list( - data = stats::setNames(list(c(tmp3[, i])), names(d[[1]]$data)) - , weight = d[[1]]$weight, param = d[[1]]$param - , metadata = list(mod = levels[i]) - ) - }) - }else stop("Character or factor variables aren't allowed.", call. 
= FALSE) - }else d[[1]]$metadata$mod <- NA - - # Step 3 : by and where arguments preparation - n <- length(d[[1]]$data[[1]]) - byNULL <- is.null(substitute(by)) - by <- if(!byNULL){ - as.factor(eval(substitute(by), eval_data)) - }else{ - tmp <- rep(1L, n) - levels(tmp) <- "1" - class(tmp) <- "factor" - tmp - } - if(!is.null(substitute(where))){ - by[!as.logical(eval(substitute(where), eval_data))] <- NA - by <- droplevels(by) - } - if(sum(!is.na(by)) == 0) - stop("by and/or where arguments exclude all observations.", call. = FALSE) - - # Step 4 : Splitting across domains - bypos <- split(1:n, by, drop = TRUE) - d <- unlist(lapply(seq_along(bypos), function(i){ - lapply(d, function(j) list( - data = lapply(stats::setNames(seq_along(j$data), names(j$data)), function(k){ - if(names(j$data)[k] %in% arg_not_affected_by_domain) j$data[[k]] else j$data[[k]][bypos[[i]]] - }) - , weight = lapply(stats::setNames(seq_along(j$weight), names(j$weight)), function(k){ - if(names(j$weight)[k] %in% arg_not_affected_by_domain) j$weight[[k]] else j$weight[[k]][bypos[[i]]] - }), param = j$param - , metadata = c(j$metadata, list( - by = if(byNULL) NA else names(bypos)[i], - bypos = bypos[[i]], - label = label, - call = call - )))) - }), recursive = FALSE) - - d - -} - -standard_display <- function(var, metadata, alpha){ - d <- as.data.frame(metadata[c("label", "call", "mod", "by")]) - if(!is.null(metadata$n)) d$n <- metadata$n - d$est <- metadata$est - d$variance <- var[[1]] - d$std <- sqrt(d$variance) - d$cv <- d$std * 100 / d$est - d$lower <- d$est - stats::qnorm(1-alpha/2)*d$std - d$upper <- d$est + stats::qnorm(1-alpha/2)*d$std - d -} -standard_display <- change_enclosing(standard_display, globalenv()) - - diff --git a/R/define_statistic_wrapper.R b/R/define_statistic_wrapper.R new file mode 100644 index 0000000..381761d --- /dev/null +++ b/R/define_statistic_wrapper.R @@ -0,0 +1,302 @@ + +#' Define a statistic wrapper + +#' @description \code{define_statistic_wrapper} defines +#' 
statistic \emph{wrappers} to be used together with +#' \code{\link[=define_variance_wrapper]{variance estimation wrappers}}. +#' A statistic wrapper produces both the point estimator and the +#' linearized variable associated with a given statistic to estimate +#' variance on (Deville, 1999). \code{define_statistic_wrapper} is intended +#' for \strong{advanced use only}, standard statistic wrappers are included +#' in the gustave package (see \code{\link[=standard_statistic_wrapper]{standard statistic wrappers}}) +#' +#' @param statistic_function An R function specific to the statistic to +#' calculate. It should produce at least the point estimator and the +#' linearized variable associated with the statistic (see Details). +#' @param arg_type A named list with three character vectors describing +#' the type of each argument of \code{statistic_function} (see Details). +#' @param arg_not_affected_by_domain A character vector indicating the +#' arguments which should not be affected by domain-splitting. Such parameters +#' may appear in some complex linearization formula, for instance when the +#' At-Risk of Poverty Rate (ARPR) is estimated by region but with a poverty +#' line calculated at the national level. +#' @param display_function An R function which produces, for each variance +#' estimation, the data.frame to be displayed by the variance estimation +#' wrapper. The default display function (\code{standard_display}) uses +#' standard metadata to display usual variance indicator (point estimate, +#' variance, standard deviation, coefficient of variation, confidence interval) +#' broken down by statistic wrapper, domain (if any) and level (if the variable +#' is a factor). +#' +#' @details When the statistic to estimate is not a total, the application of +#' analytical variance estimation formulae developed for the estimator of a total +#' is not straightforward (Deville, 1999). 
An asymptotically unbiased variance +#' estimator can nonetheless be obtained if the estimation of variance is performed +#' on a variable obtained from the original data through a linearization step. +#' +#' \code{define_statistic_wrapper} is the function used to create, for a +#' given statistic, an easy-to-use function which calculates both the point +#' estimator and the linearized variable associated with the statistic. These +#' operations are implemented by the \code{statistic_function}, which can have +#' any needed input (for example \code{num} and \code{denom} for a ratio +#' estimator) and should output a list with at least two named elements: \itemize{ +#' \item \code{point}: the point estimator of the statistic +#' \item \code{lin}: the linearized variable to be passed on to the variance +#' estimation formula. If several variables are to be associated with +#' the statistics, \code{lin} can be a list itself. +#' } +#' All other named elements in the output of \code{define_statistic_wrapper} are +#' treated as metadata (that may be used later on by \code{display_function}). +#' +#' \code{arg_type} is a named list of three elements that describes the nature +#' of the argument of \code{statistic_function}: \itemize{ +#' \item \code{data}: data argument(s), numerical vector(s) to be used +#' to calculate the point estimate and the linearized variable associated +#' with the statistic +#' \item \code{weight}: weight argument, numerical vector to be used +#' as row weights +#' \item \code{param}: parameters, non-data arguments to be used to +#' control some aspect of the computation} +#' +#' Statistic wrappers are quite flexible tools to apply a variance function +#' to an estimator requiring a linearization step (e.g. all estimators except +#' the estimator of a total) with virtually no additional complexity for the +#' end-user. 
+#' +#' \code{\link[=standard_statistic_wrapper]{standard statistic wrappers}} +#' are included within the gustave package and automatically added +#' to the variance estimation wrappers. New statistic wrappers can be defined +#' using the \code{define_statistic_wrapper} and then explicitly added to the +#' variance estimation wrappers using the \code{objects_to_include} argument. +#' +#' Note: To some extent, statistic wrappers can be seen as ggplot2 +#' \code{geom_} and \code{stat_} functions: they help the end-user in writing +#' down what he or she wants without having to go too deep into the details +#' of the corresponding layers. +#' +#' @return A function to be used within a variance estimation wrapper to estimate +#' a specific statistic (see examples). Its formals are the ones of +#' \code{statistic_function} with the addition of \code{by} and \code{where} +#' (for domain estimation, set to \code{NULL} by default). +#' +#' @author Martin Chevalier +#' +#' @seealso \code{\link[=standard_statistic_wrapper]{standard statistic wrappers}}, \code{\link{define_variance_wrapper}} +#' +#' @references +#' Deville J.-C. 
(1999), "Variance estimation for complex statistics and +#' estimators: linearization and residual techniques", \emph{Survey Methodology}, +#' 25:193–203 +#' +#' @examples ### Example from the Information and communication technologies (ICT) survey +#' +#' # Let's define a variance wrapper for the ICT survey +#' # as in the examples of the qvar function: +#' precision_ict <- qvar( +#' data = ict_sample, +#' dissemination_dummy = "dissemination", +#' dissemination_weight = "w_calib", +#' id = "firm_id", +#' scope_dummy = "scope", +#' sampling_weight = "w_sample", +#' strata = "strata", +#' nrc_weight = "w_nrc", +#' response_dummy = "resp", +#' hrg = "hrg", +#' calibration_weight = "w_calib", +#' calibration_var = c(paste0("N_", 58:63), paste0("turnover_", 58:63)), +#' define = TRUE +#' ) +#' precision_ict(ict_survey, mean(speed_quanti)) +#' +#' # Let's now redefine the mean statistic wrapper +#' mean2 <- define_statistic_wrapper( +#' statistic_function = function(y, weight){ +#' point <- sum(y * weight) / sum(weight) +#' lin <- (y - point) / sum(weight) +#' list(point = point, lin = lin, metadata = list(n = length(y))) +#' }, +#' arg_type = list(data = "y", weight = "weight") +#' ) +#' +#' # mean2 can now be used inside precision_ict (and yields +#' # the same results as the mean statistic wrapper) +#' precision_ict(ict_survey, mean(speed_quanti), mean2(speed_quanti)) +#' +#' @export + +define_statistic_wrapper <- function(statistic_function, + arg_type, + arg_not_affected_by_domain = NULL, + display_function = standard_display +){ + + # Step I: Control arguments consistency + inconsistent_arg <- list( + in_arg_type_not_in_statistic_function = setdiff(unlist(arg_type), names(formals(statistic_function))), + in_statistic_function_not_in_arg_type = setdiff(names(formals(statistic_function)), unlist(arg_type)), + in_arg_not_affected_by_domain_not_in_statistic_function = setdiff(arg_not_affected_by_domain, names(formals(statistic_function))) + ) + 
if(length(unlist(inconsistent_arg)) > 0) stop( + "Some arguments are inconsistent:", + if(length(inconsistent_arg[[1]]) > 0) paste("\n -", paste(inconsistent_arg[[1]], collapse = ", "), "in arg_type but not in statistic_function arguments") else "", + if(length(inconsistent_arg[[2]]) > 0) paste("\n -", paste(inconsistent_arg[[2]], collapse = ", "), "in statistic_function arguments but not in arg_type") else "", + if(length(inconsistent_arg[[3]]) > 0) paste("\n -", paste(inconsistent_arg[[3]], collapse = ", "), "in arg_not_affected_by_domain but not in statistic_function arguments") else "" + ) + if(is.null(arg_type$weight)) + stop("A weight argument must be provided in order to create a statistic wrapper.") + arg_domain <- list( + data = setdiff(arg_type$data, arg_not_affected_by_domain), + weight = setdiff(arg_type$weight, arg_not_affected_by_domain) + ) + + # Step II: Create the statistic wrapper + statistic_wrapper <- function(by = NULL, where = NULL){ + + # Step 1: Rewrite the call to add by and where from the variance wrapper call + execution_envir <- get_through_parent_frame("execution_envir") + evaluation_envir <- get("evaluation_envir", execution_envir) + call_list <- as.list(match.call(expand.dots = TRUE)) + if(!("by" %in% names(call_list))) call_list$by <- substitute(by, execution_envir) + if(!("where" %in% names(call_list))) call_list$where <- substitute(where, execution_envir) + + # Step 2: Rewrite the call to take standard evaluation into account + data <- eval(substitute(data), execution_envir) + if(!is.null(call_list$by)) call_list["by"] <- + replace_variable_name_with_symbol(call_list["by"], data = data) + if(!is.null(call_list$where)) call_list["where"] <- + replace_variable_name_with_symbol(call_list["where"], data = data) + data_arg <- replace_variable_name_with_symbol( + call_list[arg_type$data], data = data, single = FALSE + ) + call_list <- lapply(seq_along(data_arg[[1]]), function(c){ + call_list[names(data_arg)] <- lapply(data_arg, `[[`, c) 
+ call_list + }) + + # Step 3: Initialize the call and metadata slot + data_as_list <- lapply(call_list, function(d) list( + call = as.call(d), + metadata = list( + call = paste(deparse(as.call(d), width.cutoff = 500L), collapse = ""), + by = NA, mod = NA + ) + )) + + # Step 4: Evaluate the arguments and create the data, weight and param slots + data_as_list <- lapply(data_as_list, function(d){ + d$data <- lapply( + as.list(d$call)[arg_type$data], + eval, envir = data, enclos = evaluation_envir + ) + d$weight <- lapply( + stats::setNames(arg_type$weight, arg_type$weight), + function(w) eval(substitute(reference_weight), envir = execution_envir) + ) + d$param <- lapply( + as.list(d$call)[arg_type$param], + eval, envir = evaluation_envir + ) + d$metadata$row_number <- seq_along(d$data[[1]]) + d + }) + if(all(sapply(data_as_list[[1]]$data, is.null))) return(NULL) + + # Step 5: Where + data_as_list <- lapply(data_as_list, function(d){ + if(is.null(d$call$where)) return(d) + where <- as.logical(eval(d$call$where, envir = data, enclos = evaluation_envir)) + if(!any(where)) stop("where argument excludes all observations.") + d$metadata$row_number <- d$metadata$row_number[where] + d$data[arg_domain$data] <- lapply(d$data[arg_domain$data], `[`, where) + d$weight[arg_domain$weight] <- lapply(d$weight[arg_domain$weight], `[`, where) + d + }) + + # Step 6: By + data_as_list <- unlist(lapply(data_as_list, function(d){ + if(is.null(d$call$by)) return(list(d)) + by <- droplevels(as.factor(eval(d$call$by, envir = data, enclos = evaluation_envir))) + by_split <- split(seq_along(by), by) + tmp <- lapply(levels(by), function(by_group){ + d_by <- d + in_by_group <- d_by$metadata$row_number %in% by_split[[by_group]] + if(!any(in_by_group)) return(NULL) + d_by$metadata$by <- by_group + d_by$metadata$row_number <- d_by$metadata$row_number[in_by_group] + d_by$data[arg_domain$data] <- + lapply(d_by$data[arg_domain$data], `[`, in_by_group) + d_by$weight[arg_domain$weight] <- + 
lapply(d_by$weight[arg_domain$weight], `[`, in_by_group) + d_by + }) + tmp[!sapply(tmp, is.null)] + }), recursive = FALSE) + + # Step 7: Handle factors and character variables in data + data_as_list <- unlist(lapply(data_as_list, function(d){ + if(!any(sapply(d$data, function(var) is.character(var) || is.factor(var)))) + return(list(d)) + if(length(arg_type$data) > 1) stop("Character or factor variables aren't allowed.") + tmp <- discretize_qualitative_var(d$data[[1]]) + lapply(colnames(tmp), function(mod){ + d_mod <- d + d_mod$metadata$mod <- mod + d_mod$data[[1]] <- tmp[, mod] + d_mod + }) + }), recursive = FALSE) + + # Step 8: Call the statistic function + data_as_list <- lapply(data_as_list, function(d){ + statistic_function_arg <- + unlist(unname(d[c("data", "weight", "param")]), recursive = FALSE) + tmp <- do.call(statistic_function, statistic_function_arg) + d$statistic_function <- statistic_function + d$point <- tmp$point + d$lin <- if(!is.list(tmp$lin)) list(tmp$lin) else tmp$lin + d$display_function <- display_function + d$metadata <- c(d$metadata, tmp[setdiff(names(tmp), c("point", "lin", ""))]) + d + }) + + data_as_list + + } + + # Step III: Finalize the statistic wrapper + + # Step III.1: Modify statistic_wrapper formals + formals(statistic_wrapper) <- c( + formals(statistic_function)[setdiff(names(formals(statistic_function)), arg_type$weight)], + formals(statistic_wrapper) + ) + + # Step III.2: Include objects in statistic_wrapper enclosing environment + e <- new.env(parent = globalenv()) + assign_all(objects = c("discretize_qualitative_var", "get_through_parent_frame", "replace_variable_name_with_symbol", "is_error", "is_variable_name"), to = e, from = asNamespace("gustave")) + assign_all(objects = c("statistic_function", "arg_type", "arg_domain", "display_function"), to = e, from = environment()) + statistic_wrapper <- change_enclosing(statistic_wrapper, envir = e) + + structure(statistic_wrapper, class = c("function", "gustave_statistic_wrapper")) 
+ +} + +standard_display <- function(point, var, metadata, alpha){ + # TODO: If installed, use tibble to add more explicit column labels + output_df <- as.data.frame(metadata[!sapply(metadata, is.null)], stringsAsFactors = FALSE) + if(length(var) != length(point)) stop( + "The number of estimated variances does not match the number of point estimates. A specific display function could be needed.", + "\n", call. = FALSE + ) + output_df$est <- point + output_df$variance <- var + output_df$std <- sqrt(output_df$variance) + output_df$cv <- output_df$std * 100 / output_df$est + output_df$lower <- output_df$est - stats::qnorm(1 - alpha / 2) * output_df$std + output_df$upper <- output_df$est + stats::qnorm(1 - alpha / 2) * output_df$std + output_df +} +standard_display <- change_enclosing(standard_display, globalenv()) + + diff --git a/R/define_variance_wrapper.R b/R/define_variance_wrapper.R index ea96441..bdc68b4 100644 --- a/R/define_variance_wrapper.R +++ b/R/define_variance_wrapper.R @@ -1,321 +1,399 @@ - -#' Define a variance estimation wrapper - -#' @description Given a variance estimation \emph{function} (specific to a -#' survey), \code{define_variance_wrapper} defines a variance estimation -#' \emph{wrapper} easier to use (e.g. automatic domain estimation, -#' linearization). -#' -#' @param variance_function An R function, with input a data matrix and possibly -#' other arguments (e.g. parameters affecting the estimation of variance), -#' and output a numeric vector of estimated variances (or a list whose first -#' element is a numeric vector of estimated variances). -#' @param reference_id A vector containing the ids of all the responding units -#' of the survey. It is compared with \code{default$id} to check whether some -#' observations are missing in the survey file. Observations are reordered -#' according to \code{reference_id}. 
-#' @param default a named list specifying the default values for: \itemize{ -#' \item \code{id}: the name of the default identifying variable in the survey -#' file. It can also be an unevaluated expression (enclosed in \code{substitute()}) to be -#' evaluated within the survey file. -#' \item \code{weight}: the name of the default weight variable in the survey file. -#' It can also be an unevaluated expression (enclosed in \code{substitute()}) to be -#' evaluated within the survey file. -#' \item \code{stat}: the name of the default statistic to compute when none is specified. -#' It is set to \code{"total"} by default. -#' \item \code{alpha}: the default threshold for confidence interval derivation. -#' It is set to \code{0.05} by default. -#' } -#' @param objects_to_include A character vector indicating the name of -#' additional R objects to include within the variance wrapper. These objects -#' are to be used to carry out the variance estimation. -#' @param objects_to_include_from The environment to which the additional R -#' objects belong. -#' -#' @details Defining variance estimation wrappers is the \strong{key feature} of -#' the \code{gustave} package. -#' -#' Analytical variance estimation is often difficult to carry out by -#' non-specialists owing to the complexity of the underlying sampling -#' and estimation methodology. This complexity yields complex \emph{variance estimation -#' functions} which are most often only used by the sampling expert who -#' actually wrote them. 
A \emph{variance estimation wrapper} is an -#' intermediate function that is "wrapped around" the (complex) variance -#' estimation function in order to provide the non-specialist with -#' user-friendly features: \itemize{ \item checks for consistency between the -#' provided dataset and the survey characteristics \item factor discretization -#' \item domain estimation \item linearization of complex statistics (see -#' \code{\link[=linearization_wrapper_standard]{standard linearization wrappers}})} -#' -#' \code{define_variance_wrapper} allows the sampling expert to define a -#' variance estimation wrapper around a given variance estimation function and -#' set its default parameters. The produced variance estimation wrapper will -#' be stand-alone in the sense that it can contain additional data which would -#' \code{objects_to_include} and \code{objects_to_include_from} parameters). -#' -#' @return An R function that makes the estimation of variance based on the provided -#' variance function easier. Its parameters are: -#' \itemize{ -#' \item \code{data}: the survey data where the interest variables are stored -#' \item \code{...}: one or more calls to a linearization wrapper (see examples -#' and \code{\link[=linearization_wrapper_standard]{standard linearization wrappers}}) -#' \item \code{where}: a logical vector indicating a domain on which the variance -#' estimation is conducted -#' \item \code{by}: a qualitative variable whose levels are used to define domains -#' on which the variance estimation is conducted -#' \item \code{stat}: a character vector of size 1 indicating the linearization -#' wrapper to use when none is specified. Its default value depends on -#' the value of \code{default_stat} in \code{define_variance_wrapper} -#' \item \code{alpha}: a numeric vector of size 1 indicating the threshold -#' for confidence interval derivation. 
Its default value depends on -#' the value of \code{default_alpha} in \code{define_variance_wrapper} -#' \item \code{id}: a character vector of size 1 containing the name of -#' the identifying variable in the survey file. It can also be an -#' unevaluated expression (using \code{substitute()}) to be evaluated within -#' the survey file. Its default value depends on the value of -#' \code{default_id} in \code{define_variance_wrapper} -#' \item \code{envir}: an environment containing a binding to \code{data} -#' } -#' -#' @author Martin Chevalier -#' -#' @seealso \code{\link[=linearization_wrapper_standard]{standard linearization wrappers}} \code{\link{varDT}} -#' -#' @examples ### Example from the Information and communication technologies (ICT) survey -#' -#' # The subset of the (simulated) ICT survey has the following features: -#' # - stratified one-stage sampling design of 650 firms; -#' # - 612 responding firms, non-response correction through reweighting -#' # in homogeneous response groups based on economic sub-sector and turnover; -#' # - calibration on margins (number of firms and turnover broken down -#' # by economic sub-sector). 
-#' -#' # Step 1 : Definition of a variance function -#' -#' variance_function <- function(y){ -#' -#' # Calibration -#' y <- rescal(y, x = x) -#' -#' # Non-response -#' y <- add0(y, rownames = ict_sample$firm_id) -#' var_nr <- var_pois(y, pik = ict_sample$response_prob_est, w = ict_sample$w_sample) -#' -#' # Sampling -#' y <- y / ict_sample$response_prob_est -#' var_sampling <- var_srs(y, pik = 1 / ict_sample$w_sample, strata = ict_sample$division) -#' -#' var_sampling + var_nr -#' -#' } -#' -#' # With x the calibration variables matrix -#' x <- as.matrix(ict_survey[ -#' order(ict_survey$firm_id), -#' c(paste0("N_", 58:63), paste0("turnover_", 58:63)) -#' ]) -#' -#' # Test of the variance function -#' y <- as.matrix(ict_survey$speed_quanti) -#' rownames(y) <- ict_survey$firm_id -#' variance_function(y) -#' -#' # Step 2 : Definition of a variance wrapper -#' -#' variance_wrapper <- define_variance_wrapper( -#' variance_function = variance_function, -#' reference_id = ict_survey$firm_id, -#' default = list(id = "firm_id", weight = "w_calib"), -#' objects_to_include = c("x", "ict_sample") -#' ) -#' -#' # The objects "x" and "ict_sample" are embedded -#' # within the function variance_wrapper -#' ls(environment(variance_wrapper)) -#' # Note : variance_wrapper is a closure -#' # (http://adv-r.had.co.nz/Functional-programming.html#closures) -#' # As a consequence, the variance wrapper will work even if -#' # x is removed from globalenv() -#' rm(x) -#' -#' # Step 3 : Features of the variance wrapper -#' -#' # Better display of results -#' variance_wrapper(ict_survey, speed_quanti) -#' -#' # Mean linearization -#' variance_wrapper(ict_survey, mean(speed_quanti)) -#' # Ratio linearization -#' variance_wrapper(ict_survey, ratio(turnover, employees)) -#' -#' # Discretization of qualitative variables -#' variance_wrapper(ict_survey, speed_quali) -#' # On-the-fly recoding -#' variance_wrapper(ict_survey, speed_quali == "Between 2 and 10 Mbs") -#' -#' # 1-domain estimation -#' 
variance_wrapper(ict_survey, speed_quanti, where = division == "58") -#' # Multiple domains estimation -#' variance_wrapper(ict_survey, speed_quanti, by = division) -#' -#' # Multiple variables at a time -#' variance_wrapper(ict_survey, speed_quanti, big_data) -#' variance_wrapper(ict_survey, speed_quanti, mean(big_data)) -#' # Flexible syntax for where and by arguments -#' # (similar to the aes() function in ggplot2) -#' variance_wrapper(ict_survey, where = division == "58", -#' mean(speed_quanti), mean(big_data * 100) -#' ) -#' variance_wrapper(ict_survey, where = division == "58", -#' mean(speed_quanti), mean(big_data * 100, where = division == "61") -#' ) -#' variance_wrapper(ict_survey, where = division == "58", -#' mean(speed_quanti), mean(big_data * 100, where = NULL) -#' ) -#' -#' @export define_variance_wrapper -#' @import Matrix - -define_variance_wrapper <- function( - variance_function, reference_id, - default = list(stat = "total", alpha = 0.05), - objects_to_include = NULL, objects_to_include_from = parent.frame() -){ - - # TODO: add some sort of startup message on first run of the function - # whith the package version number and the github repo. Something like : - # "Variance wrapper generated by the gustave V.V on DD/MM/YYYY. See https://github.com/martinchevalier/gustave" for documentation and bug reports." 
- - # TODO: enable magrittr pipe %>% operations - - # Step 0 : Work with default argument - if(is.null(default$stat) && !("stat" %in% names(default))) default$stat <- "total" - if(is.null(default$alpha) && !("alpha" %in% names(default))) default$alpha <- 0.05 - - # Step 1 : Creating the variance estimation wrapper - variance_wrapper <- function( - data, ..., by = NULL, where = NULL, stat = NULL, alpha = NULL, - weight = NULL, id = NULL, display = TRUE, envir = parent.frame() - ){ - - if(!("package:Matrix" %in% search())) attachNamespace("Matrix") - - # Step 1.0: Retrieve information about the environments, - # the call and evaluate the data argument - evaluation_envir <- envir - execution_envir <- environment() - call <- match.call(expand.dots = TRUE) - substitute_data <- substitute(data) - eval_data <- eval(substitute_data, evaluation_envir) - - # Step 1.1 : Control identifiers - reference_id <- eval(reference_id) - id <- if(is.character(id)) eval_data[, id] else eval(id, eval_data) - in_reference_id_not_in_id <- setdiff(reference_id, id) - if(length(in_reference_id_not_in_id) > 0) - warning("Some observations from the survey appear to be missing. The variance estimation function may produce unexpected results.", call. = FALSE) - in_id_not_in_reference_id <- setdiff(id, reference_id) - if(length(in_id_not_in_reference_id) > 0) - stop("Some observations do not belong to the survey.", call. 
= FALSE) - - # Step 1.2 : Specify default values for stat, weight, by and where arguments - l <- eval(substitute(alist(...))) - l <- lapply(l, function(i){ - if(is.symbol(i) || !("gustave_linearization_wrapper" %in% class(eval(i[[1]])))) - i <- as.call(c(as.symbol(stat), i)) - i <- as.list(i) - for(j in environment(eval(i[[1]]))$arg_type$weight){ - if(!(j %in% names(i))) i[[j]] <- if(is.character(weight)) as.symbol(weight) else weight - } - if(!("by" %in% names(i))) i$by <- substitute(by, execution_envir) - if(!("where" %in% names(i))) i$where <- substitute(where, execution_envir) - as.call(i) - }) - # TODO: keep track of the non-automatically assigned weight arguments - # in order to display them in the call columns of the output (see - # define_linearization_wrapper, about row 44). - - # Step 1.3 : Call the linearization wrappers - labels <- if(!is.null(names(l))) names(l) else rep(NA, length(l)) - labels[labels %in% ""] <- NA - d <- unlist(lapply(seq_along(l), function(i){ - linearization_wrapper_call <- as.call(c(as.list(l[[i]]), list( - data = substitute_data, label = labels[i] - , evaluation_envir = evaluation_envir, execution_envir = execution_envir - ))) - eval(linearization_wrapper_call, envir = execution_envir) - }), recursive = FALSE) - if(is.null(d)) stop("No variable to estimate variance on.", call. 
= FALSE) - - # Step 1.4 : Build up the sparse matrix to be used in the estimation - d_matrix <- { - data <- lapply(d, function(k){ - t <- do.call(cbind, k$lin) - Matrix::sparseMatrix( - i = rep(k$metadata$bypos, NCOL(t)) - , j = rep(1:NCOL(t), each = NROW(t)), giveCsparse = FALSE - , x = c(t), dims = c(length(id), NCOL(t)), check = FALSE - ) - }) - data <- methods::as(Matrix::drop0(do.call(cbind, data)), "TsparseMatrix") - data@i <- as.integer(match(id, reference_id)[data@i + 1] - 1) - data@Dim <- c(length(reference_id), NCOL(data)) - data@Dimnames <- list(as.character(reference_id), NULL) - data - } - - # Step 1.5 : Call the variance estimation function - variance_function_args <- c( - list(d_matrix) - , lapply(names(formals(variance_function))[-1], get, envir = execution_envir) - ) - r <- suppressMessages(do.call(variance_function, variance_function_args)) - if(is.data.frame(r)) r <- as.matrix(r) - if(!is.list(r)) r <- list(var = r) - - # Step 1.6 Reorganize the results of the estimation - k <- 0; - d <- lapply(seq_along(d), function(i) c(d[[i]] - , list(var = lapply(d[[i]]$lin, function(j){ - tmp <- r[[1]][(k + 1):(k + NCOL(j))] - assign("k", (k + NCOL(j)), envir = execution_envir) - return(tmp) - })) - )) - - # Step 1.7 : Display the results if requested (the default) - if(display){ - d <- lapply(d, function(i) with(i, - display_function(var = var, metadata = metadata, alpha = alpha) - )) - names <- unique(do.call(base::c, lapply(d, names))) - d <- do.call(rbind, lapply(d, function(i){ - i[, setdiff(names, names(i))] <- NA - i[, names] - })) - d <- d[, sapply(d, function(i) !all(is.na(i)))] - rownames(d) <- NULL - d - }else invisible(d) - - } - - # Step 2 : Modify variance_wrapper arguments depending on the context - if(!is.null(default$id)) formals(variance_wrapper)$id <- substitute(default$id) - if(!is.null(default$weight)) formals(variance_wrapper)$weight <- substitute(default$weight) - if(!is.null(default$stat)) formals(variance_wrapper)$stat <- 
default$stat - if(!is.null(default$alpha)) formals(variance_wrapper)$alpha <- default$alpha - formals(variance_wrapper) <- c(formals(variance_wrapper), formals(variance_function)[names(formals(variance_function))[-1]]) - - # Step 3 : Include objects in variance_wrapper enclosing environment - e1 <- new.env(parent = globalenv()) - assign_all(objects = ls(asNamespace("gustave")), to = e1, from = asNamespace("gustave")) - e2 <- new.env(parent = e1) - assign_all(objects = c("variance_function", "reference_id"), to = e2, from = environment()) - assign_all(objects = objects_to_include, to = e2, from = objects_to_include_from) - variance_wrapper <- change_enclosing(variance_wrapper, envir = e2) - - structure(variance_wrapper, class = c("function", "gustave_variance_wrapper")) - -} - + +#' Define a variance estimation wrapper + +#' @description Given a variance estimation \emph{function} (specific to a +#' survey), \code{define_variance_wrapper} defines a variance estimation +#' \emph{wrapper} easier to use (e.g. automatic domain estimation, +#' linearization). +#' +#' @param variance_function An R function. It is the methodological workhorse of +#' the variance estimation: from a set of arguments including the variables +#' of interest (see below), it should return a vector of estimated variances. +#' See Details. +#' @param reference_id A vector containing the ids of all the responding units +#' of the survey. It can also be an unevaluated expression (enclosed in +#' \code{quote()}) to be evaluated within the execution environment of the wrapper. +#' It is compared with \code{default$id} (see below) to check whether +#' some observations are missing in the survey file. The matrix of variables +#' of interest passed on to \code{variance_function} has \code{reference_id} +#' as rownames and is ordered according to its values. +#' @param reference_weight A vector containing the reference weight of the survey. 
+#' It can also be an unevaluated expression (enclosed in \code{quote()}) to +#' be evaluated within the execution environment of the wrapper. +#' @param default_id A character vector of length 1, the name of the default +#' identifying variable in the survey file. It can also be an unevaluated +#' expression (enclosed in \code{quote()}) to be evaluated within the survey file. +#' @param technical_data A named list of technical data needed to perform +#' the variance estimation (e.g. sampling strata, first- or second-order +#' probabilities of inclusion, estimated response probabilities, calibration +#' variables). Its names should match the names of the corresponding arguments +#' in \code{variance_function}. +#' @param technical_param A named list of technical parameters used to control +#' some aspect of the variance estimation process (e.g. alternative methodology). +#' Its names should match the names of the corresponding arguments in \code{variance_function}. +#' @param objects_to_include (Advanced use) A character vector indicating the name of +#' additional R objects to include within the variance wrapper. + +#' +#' +#' @details Defining variance estimation wrappers is the \strong{key feature} of +#' the \code{gustave} package. It is the workhorse of the ready-to-use +#' \code{\link{qvar}} function and should be used directly to handle more complex +#' cases (e.g. surveys with several stages or balanced sampling). +#' +#' Analytical variance estimation is often difficult to carry out by +#' non-specialists owing to the complexity of the underlying sampling +#' and estimation methodology. This complexity yields complex \emph{variance +#' estimation functions} which are most often only used by the sampling expert +#' who actually wrote them. 
A \emph{variance estimation wrapper} is an +#' intermediate function that is "wrapped around" the (complex) variance +#' estimation function in order to provide the non-specialist with +#' user-friendly features (see examples): \itemize{ +#' \item calculation of complex statistics (see +#' \code{\link[=standard_statistic_wrapper]{standard statistic wrappers}}) +#' \item domain estimation +#' \item handy evaluation and factor discretization +#' } +#' +#' \code{define_variance_wrapper} allows the sampling expert to define a +#' variance estimation wrapper around a given variance estimation function and +#' set its default parameters. The produced variance estimation wrapper is +#' standalone in the sense that it contains all technical data necessary +#' to carry out the estimation (see \code{technical_data}). +#' +#' The arguments of the \code{variance_function} fall into three types: \itemize{ +#' \item the data argument (mandatory, only one allowed): the numerical matrix of +#' variables of interest to apply the variance estimation formula on +#' \item technical data arguments (optional, one or more allowed): technical +#' and methodological information used by the variance estimation function +#' (e.g. sampling strata, first- or second-order probabilities of inclusion, +#' estimated response probabilities, calibration variables) +#' \item technical parameters (optional, one or more allowed): non-data arguments +#' to be used to control some aspect of the variance estimation (e.g. alternative +#' methodology)} +#' +#' \code{technical_data} and \code{technical_param} are used to determine +#' which arguments of \code{variance_function} relate to technical information, +#' the only remaining argument is considered as the data argument. +#' +#' @return An R function that makes the estimation of variance based on the +#' provided variance function easier. Its parameters are: \itemize{ \item +#' \code{data}: one or more calls to a statistic wrapper (e.g. 
\code{total()}, +#' \code{mean()}, \code{ratio()}). See examples and +#' \code{\link[=standard_statistic_wrapper]{standard statistic wrappers}}) +#' \item \code{where}: a logical vector indicating a domain on which the +#' variance estimation is to be performed \item \code{by}: a qualitative +#' variable whose levels are used to define domains on which the variance +#' estimation is performed \item \code{alpha}: a numeric vector of length 1 +#' indicating the threshold for confidence interval derivation (\code{0.05} by +#' default) \item \code{display}: a logical vector of length 1 indicating +#' whether the result of the estimation should be displayed or not \item +#' \code{id}: a character vector of size 1 containing the name of the +#' identifying variable in the survey file. Its default value depends on the +#' value of \code{default_id} in \code{define_variance_wrapper} \item +#' \code{envir}: an environment containing a binding to \code{data}} +#' +#' @author Martin Chevalier +#' +#' @references Rao, J.N.K (1975), "Unbiased variance estimation for multistage designs", +#' \emph{Sankhya}, C n°37 +#' +#' @seealso \code{\link{qvar}}, \code{\link[=standard_statistic_wrapper]{standard statistic wrappers}}, \code{\link{varDT}} +#' +#' @examples ### Example from the Labour force survey (LFS) +#' +#' # The (simulated) Labour force survey (LFS) has the following characteristics: +#' # - first sampling stage: balanced sampling of 4 areas (each corresponding to +#' # about 120 dwellings) on first-order probability of inclusion (proportional to +#' # the number of dwellings in the area) and total annual income in the area.
+#' # - second sampling stage: in each sampled area, simple random sampling of 20 +#' # dwellings +#' # - neither non-response nor calibration +#' +#' # As this is a multi-stage sampling design with balanced sampling at the first +#' # stage, the qvar function does not apply. A variance wrapper can nonetheless +#' # be defined using the core define_variance_wrapper function. +#' +#' # Step 1 : Definition of the variance function and the corresponding technical data +#' +#' # In this context, the variance estimation function specific to the LFS +#' # survey can be defined as follows: +#' +#' var_lfs <- function(y, ind, dwel, area){ +#' +#' variance <- list() +#' +#' # Variance associated with the sampling of the dwellings +#' y <- sum_by(y, ind$id_dwel) +#' variance[["dwel"]] <- var_srs( +#' y = y, pik = dwel$pik_dwel, strata = dwel$id_area, +#' w = (1 / dwel$pik_area^2 - dwel$q_area) +#' ) +#' +#' # Variance associated with the sampling of the areas +#' y <- sum_by(y = y, by = dwel$id_area, w = 1 / dwel$pik_dwel) +#' variance[["area"]] <- varDT(y = y, precalc = area) +#' +#' Reduce(`+`, variance) +#' +#' } +#' +#' # where y is the matrix of variables of interest and ind, dwel and area the technical data: +#' +#' technical_data_lfs <- list() +#' +#' # Technical data at the area level +#' # The varDT function allows for the pre-calculation of +#' # most of the methodological quantities needed. 
+#' technical_data_lfs$area <- varDT( +#' y = NULL, +#' pik = lfs_samp_area$pik_area, +#' x = as.matrix(lfs_samp_area[c("pik_area", "income")]), +#' id = lfs_samp_area$id_area +#' ) +#' +#' # Technical data at the dwelling level +#' # In order to implement Rao (1975) formula for two-stage samples, +#' # we associate each dwelling with the diagonal term corresponding +#' # to its area in the first-stage variance estimator: +#' lfs_samp_dwel$q_area <- with(technical_data_lfs$area, setNames(diago, id))[lfs_samp_dwel$id_area] +#' technical_data_lfs$dwel <- lfs_samp_dwel[c("id_dwel", "pik_dwel", "id_area", "pik_area", "q_area")] +#' +#' # Technical data at the individual level +#' technical_data_lfs$ind <- lfs_samp_ind[c("id_ind", "id_dwel", "sampling_weight")] +#' +#' # Test of the variance function var_lfs +#' y <- matrix(as.numeric(lfs_samp_ind$unemp), ncol = 1, dimnames = list(lfs_samp_ind$id_ind)) +#' with(technical_data_lfs, var_lfs(y = y, ind = ind, dwel = dwel, area = area)) +#' +#' +#' # Step 2 : Definition of the variance wrapper +#' +#' # Call of define_variance_wrapper +#' precision_lfs <- define_variance_wrapper( +#' variance_function = var_lfs, +#' technical_data = technical_data_lfs, +#' reference_id = technical_data_lfs$ind$id_ind, +#' reference_weight = technical_data_lfs$ind$sampling_weight, +#' default_id = "id_ind" +#' ) +#' +#' # Test +#' precision_lfs(lfs_samp_ind, unemp) +#' +#' # The variance wrapper precision_lfs has the same features +#' # as variance wrappers produced by the qvar function (see +#' # qvar examples for more details). 
+#' +#' @import Matrix +#' @export + +define_variance_wrapper <- function(variance_function, + reference_id, + reference_weight, + default_id = NULL, + technical_data = NULL, + technical_param = NULL, + objects_to_include = NULL +){ + + # TODO: Add some sort of startup message on first run of the function (check what Matrix does) + sys_time <- Sys.time() + session_info <- utils::sessionInfo() + + # Step I: Controls + + # Missing arguments + is_missing <- c( + variance_function = missing(variance_function), + reference_id = missing(reference_id), + reference_weight = missing(reference_weight) + ) + if(any(is_missing)) stop( + "The following arguments are missing: ", + paste(names(which(is_missing)), collapse = ", "), "." + ) + + # Determine argument type + names_formals_variance_function <- names(formals(variance_function)) + names_technical_data <- names_else_NA(technical_data) + if(anyNA(names_technical_data)) stop("All elements of technical_data must be named.") + if(!all(names_technical_data %in% names_formals_variance_function)) + stop("All elements of technical_data must match an argument of variance_function.") + names_technical_param <- names_else_NA(technical_param) + if(anyNA(names_technical_param)) stop("All elements of technical_param must be named.") + if(!all(names_technical_param %in% names_formals_variance_function)) + stop("All elements of technical_param must match an argument of variance_function.") + names_remaining_args <- setdiff(names_formals_variance_function, c(names_technical_data, names_technical_param)) + if(length(names_remaining_args) == 0) stop( + "variance_function appears to have only technical arguments (identified with technical_data and technical_param).", + " It must also have a data argument (see the Details section in help)." + ) + if(length(names_remaining_args) > 1) stop( + "variance_function appears to have several data arguments (", paste(names_remaining_args, collapse = ", "), + ") where it should only have one. 
Use technical_data and technical_param to identify the technical arguments (see the Details section in help)." + ) + arg_type <- list(data = names_remaining_args, tech_data = names_technical_data, tech_param = names_technical_param) + + # Step II: Create the variance wrapper + variance_wrapper <- function( + data, ..., by = NULL, where = NULL, id, + alpha = 0.05, display = TRUE, envir = parent.frame() + ){ + + if(!("package:Matrix" %in% search())) attachNamespace("Matrix") + + # Step 1: Preliminary operations and controls + + # Step 1.1: Environments and missing arguments + evaluation_envir <- envir + execution_envir <- environment() + is_missing <- c( + data = missing(data), + id = missing(id) && is.symbol(formals()$id) + ) + if(any(is_missing)) stop( + "The following arguments are missing: ", + paste(names(which(is_missing)), collapse = ", "), "." + ) + + # Step 1.2 Evaluation + data <- eval(data, envir = evaluation_envir) + reference_id <- eval(reference_id) + id <- tryCatch( + eval(substitute(id), envir = execution_envir), + error = function(e) substitute(id, execution_envir) + ) + id <- if(is.character(id)) data[[id]] else eval(id, data) + reference_weight <- eval(reference_weight) + + # Step 1.3: Controls + in_reference_id_not_in_id <- setdiff(reference_id, id) + if(length(in_reference_id_not_in_id) > 0) + warn("Some observations from the survey appear to be missing. The variance estimation function may produce unexpected results.") + in_id_not_in_reference_id <- setdiff(id, reference_id) + if(length(in_id_not_in_reference_id) > 0){ + warn(length(in_id_not_in_reference_id), " observations do not match any responding units of the survey. 
They are discarded.") + data <- data[id %in% reference_id, ] + id <- id[id %in% reference_id] + } + if(!identical(match_id <- match(reference_id, id), seq_along(reference_id))){ + warn( + "The inputted id variable (id argument) appears not to match the reference ", + "id variable provided when the variance wrapper was defined: it is reordered ", + "and everything should be fine. Issues may nonetheless arise if part of the call ", + "is to be evaluated outside of the inputted data.frame (data argument)." + ) + data <- data[match_id, ] + } + + + # Step 2: Handling domains, qualitative variables and linearization + statistic_wrapper_list <- eval(substitute(alist(...))) + statistic_wrapper_label <- names_else_NA(statistic_wrapper_list) + data_as_list <- unlist(lapply(seq_along(statistic_wrapper_list), function(i){ + + call <- statistic_wrapper_list[[i]] + + # Add a statistic wrapper when none is spefified + if(is.symbol(call) || !is_statistic_wrapper(eval(call[[1]]))) + call <- as.call(c(quote(total), call)) + + # Evaluate the statistic wrapper + d <- eval(call) + if(is.null(d)) return(d) + + # Add labels + lapply(d, function(slice){ + slice$metadata$label <- statistic_wrapper_label[i] + slice + }) + + }), recursive = FALSE) + if(is.null(data_as_list)) stop("No variable to estimate variance on.") + + + # Step 3: Variance estimation + + # Step 3.1: Build up the sparse matrix of linearized variables to be used in the estimation + data_as_Matrix <- list( + lin = Matrix::sparseMatrix( + i = unlist(lapply(data_as_list, function(slice) rep(slice$metadata$row_number, length(slice$lin))), use.names = FALSE), + p = c(0, cumsum(unlist(lapply(data_as_list, function(slice) + rep(length(slice$metadata$row_number), length(slice$lin)) + ), use.names = FALSE))), + x = unlist(lapply(data_as_list, function(slice) do.call(base::c, slice$lin)), use.names = FALSE), + dims = c(length(reference_id), sum(sapply(lapply(data_as_list, `[[`, "lin"), length))), + dimnames = list(reference_id, 
NULL), + check = FALSE + ), + slice_number = unlist(lapply(seq_along(data_as_list), function(i) + rep(i, sapply(lapply(data_as_list, `[[`, "lin"), length)[i]) + ), use.names = FALSE) + ) + + # Step 3.2: Call the variance estimation function + variance_function_args <- c( + stats::setNames(list(data_as_Matrix$lin), arg_type$data), + stats::setNames(lapply(arg_type$tech_param, get, envir = execution_envir), arg_type$tech_param), + technical_data + ) + variance_function_result <- suppressMessages(do.call(variance_function, variance_function_args)) + + # Step 3.3: Test and reorganize variance_function results + is_list_variance_function_result <- is.list(variance_function_result) + if(!is_list_variance_function_result) variance_function_result <- list(var = variance_function_result) + if(!any(names(variance_function_result) == "var")) + stop("At least one output of variance_function should be named \"var\".") + if(!is.vector(variance_function_result$var)) + stop("The ", if(is_list_variance_function_result) "\"var\" " else NULL,"output of variance_function should be a vector.") + data_as_Matrix <- c(data_as_Matrix, variance_function_result) + + # Step 3.4: Reintroduce the "var" output of variance_function within data_as_list + data_as_list <- lapply(seq_along(data_as_list), function(i){ + slice <- data_as_list[[i]] + slice$var <- data_as_Matrix$var[data_as_Matrix$slice_number == i] + slice + }) + + + # Step 4: Display the results (if requested) + if(!display) return(invisible(list( + data_as_list = data_as_list, data_as_Matrix = data_as_Matrix + ))) + list_output_df <- lapply(data_as_list, function(i) with(i, display_function( + point = point, var = var, metadata = metadata[c("label", "call", "mod", "by", "n")], alpha = alpha + ))) + rbind_output_df(list_output_df) + + } + + # Step III: Finalize the variance wrapper + + # Step III.1: Modify variance wrapper arguments depending on the context + if(!is.null(default_id)) formals(variance_wrapper)$id <- default_id + + # 
Step III.2: Add variance_function technical parameters to variance_wrapper arguments + # (just after the ...) + add_technical_param_after <- match("...", names(formals(variance_wrapper))) + formals(variance_wrapper) <- c( + formals(variance_wrapper)[1:add_technical_param_after], + technical_param, + formals(variance_wrapper)[(add_technical_param_after + 1):length(formals(variance_wrapper))] + ) + + # Step III.3: Include objects in variance_wrapper enclosing environment + e1 <- new.env(parent = globalenv()) + assign_all(objects = ls(asNamespace("gustave")), to = e1, from = asNamespace("gustave")) + e2 <- new.env(parent = e1) + assign_all(objects = c( + "sys_time", "session_info", + "variance_function", "reference_id", "reference_weight", "technical_data", "arg_type" + ), to = e2, from = environment()) + assign_all(objects = objects_to_include, to = e2, from = parent.frame()) + variance_wrapper <- change_enclosing(variance_wrapper, envir = e2) + + structure(variance_wrapper, class = c("function", "gustave_variance_wrapper")) + +} diff --git a/R/linearization_wrapper_standard.R b/R/linearization_wrapper_standard.R deleted file mode 100644 index 206a222..0000000 --- a/R/linearization_wrapper_standard.R +++ /dev/null @@ -1,137 +0,0 @@ -#' Standard linearization wrappers -#' -#' @description Functions to be used within variance estimation -#' wrappers in order to perform on-the-fly linearizations (see Details). -#' -#' @param y A vector corresponding to the (sole) variable to estimate -#' variance on. If not numeric (character or factor), it is -#' automatically discretized. -#' @param num,num1,num2 Numerical vector(s) corresponding to the numerator(s) -#' to be used in the estimation. -#' @param denom,denom1,denom2 Numerical vector(s) corresponding to the denominator(s) -#' to be used in the estimation. -#' @param by Factor vector (character vectors are coerced to factors) whose levels are used -#' to break down the estimation by domains. 
-#' @param where Logical vector indicating the domain to perform variance estimation on. -#' @param ... Technical parameters passed on to helper functions -#' within the linearization wrapper. -#' -#' @details When the estimator is not the estimator of a total, the application of -#' analytical variance estimation formulae developed for the estimator of a total -#' is not straightforward (Deville, 1999). An asymptotically unbiased variance -#' estimator can nonetheless be obtained if the estimation of variance is performed -#' on a variable obtained from the original data through a linerization step. -#' -#' The \code{ratio}, \code{mean}, \code{diff_of_ratio} and -#' \code{ratio_of_ratio} functions implement the standard linearization -#' techniques respectively for the ratio, mean, difference of ratios and -#' ratio of ratios estimators, as presented for example in (Caron, 1998). -#' The \code{total} function does not perform any linearization -#' (as none is needed for the estimator of a total) and solely adds the technical -#' features required to use the linearization wrapper within the \code{\link[=define_variance_wrapper]{variance wrappers}}. -#' -#' @seealso \code{\link{define_variance_wrapper}} -#' -#' @references -#' Caron N. (1998), "Le logiciel Poulpe : aspects méthodologiques", \emph{Actes -#' des Journées de méthodologie statistique} \url{http://jms-insee.fr/jms1998s03_1/} -#' -#' Deville J.-C. 
(1999), "Variance estimation for complex statistics and -#' estimators: linearization and residual techniques", \emph{Survey Methodology}, -#' 25:193–203 -#' -#' @examples # See define_variance_wrapper examples -#' -#' @author Martin Chevalier -#' -#' @name linearization_wrapper_standard -#' @aliases total ratio mean diff_or_ratio ratio_of_ratio - -NULL - -#' @rdname linearization_wrapper_standard -total <- define_linearization_wrapper( - linearization_function = function(y, weight){ - na <- is.na(y) - y[na] <- 0 - total <- sum(y * weight) - list( - lin = list(y), - metadata = list(est = total, n = sum(!na)) - ) - }, - arg_type = list(data = "y" , weight = "weight"), - allow_factor = TRUE -) - -#' @rdname linearization_wrapper_standard -ratio <- define_linearization_wrapper( - linearization_function = function(num, denom, weight){ - na <- is.na(num) | is.na(denom) - num[na] <- 0 - denom[na] <- 0 - est_num <- sum(num * weight) - est_denom <- sum(denom * weight) - ratio <- est_num / est_denom - lin <- (num - ratio * denom ) / est_denom - list( - lin = list(lin), - metadata = list(est = ratio, n = sum(!na), est_num = est_num, est_denom = est_denom) - ) - }, - arg_type = list(data = c("num", "denom") , weight = "weight") -) - -#' @rdname linearization_wrapper_standard -mean <- define_linearization_wrapper( - linearization_function = function(y, weight){ - environment(ratio)$linearization_function(num = y, denom = rep(1, length(y)), weight = weight) - }, - arg_type = list(data = "y" , weight = "weight"), - allow_factor = TRUE -) - -#' @rdname linearization_wrapper_standard -diff_of_ratio <- define_linearization_wrapper( - linearization_function = function(num1, denom1, num2, denom2, weight){ - na <- is.na(num1) | is.na(denom1) | is.na(num2) | is.na(denom2) - num1[na] <- 0 - denom1[na] <- 0 - num2[na] <- 0 - denom2[na] <- 0 - ratio1 <- environment(ratio)$linearization_function(num = num1, denom = denom1, weight = weight) - ratio2 <- 
environment(ratio)$linearization_function(num = num2, denom = denom2, weight = weight) - lin <- ratio1$lin[[1]] - ratio2$lin[[1]] - est <- ratio1$metadata$est - ratio2$metadata$est - list( - lin = list(lin), - metadata = list(est = est, n = sum(!na)) - ) - }, - arg_type = list(data = c("num1", "denom1", "num2", "denom2") , weight = "weight") -) - -#' @rdname linearization_wrapper_standard -ratio_of_ratio <- define_linearization_wrapper( - linearization_function = function(num1, denom1, num2, denom2, weight){ - na <- is.na(num1) | is.na(denom1) | is.na(num2) | is.na(denom2) - num1[na] <- 0 - denom1[na] <- 0 - num2[na] <- 0 - denom2[na] <- 0 - est_num1 <- sum(num1 * weight) - est_denom1 <- sum(denom1 * weight) - est_num2 <- sum(num2 * weight) - est_denom2 <- sum(denom2 * weight) - est <- (est_num1 / est_denom1) / (est_num2 / est_denom2) - lin <- est * ( - (num1 / est_num1) - (num2 / est_num2) - - (denom1 / est_denom1) + (denom2 / est_denom2) - ) - list( - lin = list(lin), - metadata = list(est = est, n = sum(!na)) - ) - }, - arg_type = list(data = c("num1", "denom1", "num2", "denom2") , weight = "weight") -) diff --git a/R/qvar.R b/R/qvar.R new file mode 100644 index 0000000..ebfdf72 --- /dev/null +++ b/R/qvar.R @@ -0,0 +1,671 @@ + + + +#' Quickly perform a variance estimation in common cases +#' +#' @description \code{qvar} (for "quick variance estimation") is a function +#' performing analytical variance estimation in most common cases, that is: +#' \itemize{\item stratified simple random sampling \item non-response +#' correction (if any) through reweighting \item calibration (if any)} +#' +#' Used with \code{define = TRUE}, it defines a so-called variance wrapper, that +#' is a standalone ready-to-use function that can be applied to the survey dataset +#' without having to specify the methodological characteristics of the survey +#' (see \code{\link{define_variance_wrapper}}). 
+#' +#' @param data The \code{data.frame} containing all the technical information +#' required to prepare the variance estimation process (see other arguments +#' below). Note that this file should contain all the units sampled, +#' including the out-of-scope and non-responding units. If a variance +#' estimation is to be performed right away (when \code{define = FALSE}), +#' it should also contain the variables of interest. +#' @param ... One or more calls to a statistic wrapper (e.g. \code{total()}, +#' \code{mean()}, \code{ratio()}). See examples and +#' \code{\link[=standard_statistic_wrapper]{standard statistic wrappers}} +#' @param where A logical vector indicating a domain on which the variance +#' estimation is to be performed. +#' @param by A qualitative variable whose levels are used to define domains +#' on which the variance estimation is performed. +#' @param alpha A numeric vector of length 1 indicating the threshold +#' for confidence interval derivation (\code{0.05} by default). +#' @param display A logical vector of length 1 indicating whether +#' the result of the estimation should be displayed or not. +#' +#' @param id The identification variable of the units in \code{data}. +#' It should be unique for each row in \code{data} and not contain any +#' missing values. +#' @param dissemination_dummy A character vector of length 1, the name +#' of the logical variable in \code{data} indicating whether the unit +#' does appear in the disseminated file and should be used for point +#' estimates. It should not contain any missing values. +#' @param dissemination_weight A character vector of length 1, the name +#' of the numerical variable in \code{data} corresponding to the +#' dissemination weight of the survey. It should not contain any missing +#' values. +#' +#' @param sampling_weight A character vector of length 1, the name of the +#' numeric variable in \code{data} corresponding to the sampling weights +#' of the survey.
It should not contain any missing values. +#' @param strata A character vector of length 1, the name of the factor +#' variable in \code{data} whose levels match the stratification +#' used in the survey. Character variables are coerced to factor. +#' If defined, it should not contain any missing value. If \code{NULL}, +#' the variance estimation process does not take any stratification +#' into account. +#' +#' @param scope_dummy A character vector of length 1, the name of the logical +#' variable in \code{data} indicating whether the unit belongs to the +#' scope of the survey or not. Numerical variables are coerced to logical. +#' If defined, it should not contain any missing value. If \code{NULL}, +#' all units are supposed to be within the scope of the survey. +#' +#' @param nrc_weight A character vector of length 1, the name of the +#' numerical variable in \code{data} corresponding to the weights +#' after non-response correction. If defined, all responding units +#' should have a non-missing value. If \code{NULL}, all +#' units are supposed to be responding and the variance estimation +#' process does not include a second phase in order to take non-response +#' into account. +#' @param response_dummy A character vector of length 1, the name of the logical +#' variable in \code{data} indicating whether the unit is a responding +#' unit or not. Numerical variables are coerced to logical. \code{response_dummy} +#' should be defined as long as a \code{nrc_weight} is provided. All units +#' in the scope of the survey should have a non-missing value. +#' @param nrc_dummy A character vector of length 1, the name of +#' the logical variable in \code{data} indicating whether the +#' units did take part in the non-response correction process. +#' All units in the scope of the survey should have a non-missing +#' value.
+#' +#' @param calibration_weight A character vector of length 1, the name of the +#' numerical variable in \code{data} corresponding to the calibrated +#' weights. If defined, all responding units should have +#' a non-missing value. If \code{NULL}, the variance estimation +#' process does not take any calibration step into account. +#' @param calibration_dummy A character vector of length 1, the name of the logical +#' variable in \code{data} indicating whether the unit did take part +#' in the calibration process or not. Numerical variables are coerced to +#' logical. If defined, all responding units should have a non-missing +#' value. If \code{NULL}, calibration is supposed to have been performed +#' on all responding units. +#' @param calibration_var A character vector, the name of the variable(s) used in +#' the calibration process. Logical variables are coerced to numeric. +#' Character and factor variables are automatically discretized. +#' \code{calibration_var} should be defined as long as a \code{calibration_weight} is +#' provided. All units taking part in the calibration process should have +#' only non-missing values for all variables in \code{calibration_var}. +#' +#' @param define Logical vector of length 1. Should a variance wrapper +#' be defined instead of performing a variance estimation (see details +#' and examples)? +#' @param envir An environment containing a binding to \code{data}. +#' +#' @details \code{qvar} performs not only technical but also +#' methodological checks in order to ensure that the standard variance +#' estimation methodology does apply (e.g. equal probability of +#' inclusion within strata, number of units per stratum). +#' +#' Used with \code{define = TRUE}, the function returns a variance +#' estimation \emph{wrapper}, that is a ready-to-use function that +#' implements the described variance estimation methodology and +#' contains all necessary data to do so (see examples).
+#' +#' Note: To some extent, \code{qvar} is analogous to the \code{qplot} function +#' in the ggplot2 package, as it is an easier-to-use function for common +#' cases. More complex cases are to be handled by using the core functions of +#' the gustave package, e.g. \code{\link{define_variance_wrapper}}. +#' +#' @seealso \code{\link{define_variance_wrapper}}, \code{\link{standard_statistic_wrapper}} +#' +#' @examples ### Example from the Information and communication technologies (ICT) survey +#' +#' # The (simulated) Information and communication technologies (ICT) survey +#' # has the following characteristics: +#' # - stratified one-stage sampling design +#' # - non-response correction through reweighting in homogeneous response groups +#' # - calibration on margins. +#' +#' # The ict_survey data.frame is a (simulated) subset of the ICT +#' # survey file containing the variables of interest for the 612 +#' # responding firms. +#' +#' # The ict_sample data.frame is the (simulated) sample of 650 +#' # firms corresponding to the ict_survey file. It contains all +#' # technical information necessary to estimate a variance with +#' # the qvar() function. 
+#' +#' ## Methodological description of the survey +#' +#' # Direct call of qvar() +#' qvar( +#' +#' # Sample file +#' data = ict_sample, +#' +#' # Dissemination and identification information +#' dissemination_dummy = "dissemination", +#' dissemination_weight = "w_calib", +#' id = "firm_id", +#' +#' # Scope +#' scope_dummy = "scope", +#' +#' # Sampling design +#' sampling_weight = "w_sample", +#' strata = "strata", +#' +#' # Non-response correction +#' nrc_weight = "w_nrc", +#' response_dummy = "resp", +#' hrg = "hrg", +#' +#' # Calibration +#' calibration_weight = "w_calib", +#' calibration_var = c(paste0("N_", 58:63), paste0("turnover_", 58:63)), +#' +#' # Statistic(s) and variable(s) of interest +#' mean(employees) +#' +#' ) +#' +#' # Definition of a variance estimation wrapper +#' precision_ict <- qvar( +#' +#' # As before +#' data = ict_sample, +#' dissemination_dummy = "dissemination", +#' dissemination_weight = "w_calib", +#' id = "firm_id", +#' scope_dummy = "scope", +#' sampling_weight = "w_sample", +#' strata = "strata", +#' nrc_weight = "w_nrc", +#' response_dummy = "resp", +#' hrg = "hrg", +#' calibration_weight = "w_calib", +#' calibration_var = c(paste0("N_", 58:63), paste0("turnover_", 58:63)), +#' +#' # Replacing the variables of interest by define = TRUE +#' define = TRUE +#' +#' ) +#' +#' # Use of the variance estimation wrapper +#' precision_ict(ict_sample, mean(employees)) +#' +#' # The variance estimation wrapper can also be used on the survey file +#' precision_ict(ict_survey, mean(speed_quanti)) +#' +#' ## Features of the variance estimation wrapper +#' +#' # Several statistics in one call (with optional labels) +#' precision_ict(ict_survey, +#' "Mean internet speed in Mbps" = mean(speed_quanti), +#' "Turnover per employee" = ratio(turnover, employees) +#' ) +#' +#' # Domain estimation with where and by arguments +#' precision_ict(ict_survey, +#' mean(speed_quanti), +#' where = employees >= 50 +#' ) +#' precision_ict(ict_survey, +#' 
mean(speed_quanti), +#' by = division +#' ) +#' +#' # Domain may differ from one estimator to another +#' precision_ict(ict_survey, +#' "Mean turnover, firms with 50 employees or more" = mean(turnover, where = employees >= 50), +#' "Mean turnover, firms with 100 employees or more" = mean(turnover, where = employees >= 100) +#' ) +#' +#' # On-the-fly evaluation (e.g. discretization) +#' precision_ict(ict_survey, mean(speed_quanti > 100)) +#' +#' # Automatic discretization for qualitative (character or factor) variables +#' precision_ict(ict_survey, mean(speed_quali)) +#' +#' # Standard evaluation capabilities +#' variables_of_interest <- c("speed_quanti", "speed_quali") +#' precision_ict(ict_survey, mean(variables_of_interest)) +#' +#' # Integration with %>% and dplyr +#' library(magrittr) +#' library(dplyr) +#' ict_survey %>% +#' precision_ict("Internet speed above 100 Mbps" = mean(speed_quanti > 100)) %>% +#' select(label, est, lower, upper) +#' +#' @export + +qvar <- function(data, ..., by = NULL, where = NULL, + alpha = 0.05, display = TRUE, + id, dissemination_dummy, dissemination_weight, + sampling_weight, strata = NULL, + scope_dummy = NULL, + nrc_weight = NULL, response_dummy = NULL, nrc_dummy = NULL, + calibration_weight = NULL, calibration_dummy = NULL, calibration_var = NULL, + define = FALSE, envir = parent.frame() + # TODO: Add objects_to_include +){ + + # Step 1: Define the variance wrapper + call <- as.list(match.call())[-1] + call$envir <- envir + qvar_variance_wrapper <- do.call( + define_qvar_variance_wrapper, + call[names(call) %in% names(formals(define_qvar_variance_wrapper))] + ) + + # Step 2: Export the variance wrapper + if(define){ + note("As define = TRUE, a ready-to-use variance wrapper is (invisibly) returned.") + return(invisible(qvar_variance_wrapper)) + } + + # Step 3: Estimate variance + qvar_data <- data[data[, id] %in% environment(qvar_variance_wrapper)$reference_id, ] + call$data <- substitute(qvar_data) + call$envir <- 
environment() + do.call( + qvar_variance_wrapper, + call[names(call) == "" | names(call) %in% names(formals(qvar_variance_wrapper))] + ) + +} + + + +# Unexported (and undocumented) functions + +define_qvar_variance_wrapper <- function(data, id, dissemination_dummy, dissemination_weight, + sampling_weight, strata = NULL, + scope_dummy = NULL, + nrc_weight = NULL, response_dummy = NULL, nrc_dummy = NULL, + calibration_weight = NULL, calibration_var = NULL, calibration_dummy = NULL, + envir = parent.frame() +){ + + # Step 1: Control arguments consistency and display the welcome message ---- + + # Step 1.1: Arguments consistency + is_missing <- c( + data = missing(data), + id = missing(id), + dissemination_dummy = missing(dissemination_dummy), + dissemination_weight = missing(dissemination_weight), + sampling_weight = missing(sampling_weight) + ) + if(any(is_missing)) stop( + "The following arguments are missing: ", + paste(names(which(is_missing)), collapse = ", "), "." + ) + inconsistency <- list( + nrc_weight_but_no_response_dummy = !is.null(nrc_weight) && is.null(response_dummy), + resp_or_nrc_dummy_but_no_nrc_weight = is.null(nrc_weight) && (!is.null(response_dummy) || !is.null(nrc_dummy)), + calibration_weight_but_no_calibration_var = !is.null(calibration_weight) && is.null(calibration_var), + calibration_or_calibration_var_but_no_calibration_weight = is.null(calibration_weight) && (!is.null(calibration_dummy) || !is.null(calibration_var)) + ) + if(any(unlist(inconsistency))) stop( + "Some arguments are inconsistent:", + if(inconsistency$nrc_weight_but_no_response_dummy) + "\n - weights after non-response correction are provided (nrc_weight argument) but no variable indicating responding units (response_dummy argument)" else "", + if(inconsistency$resp_or_nrc_dummy_but_no_nrc_weight) + "\n - a variable indicating responding units and/or a variable indicating the units taking part in the non-response correction process are provided (response_dummy and nrc_dummy 
argument) but no weights after non-response correction (nrc_weight argument)." else "" , + if(inconsistency$calibration_weight_but_no_calibration_var) + "\n - calibrated weights are provided (calibration_weight argument) but no calibration variables (calibration_var argument)" else "" , + if(inconsistency$calibration_or_calibration_var_but_no_calibration_weight) + "\n - a variable indicating the units taking part in a calibration process and/or calibration variables are provided (calibration_dummy and calibration_var arguments) but no calibrated weights (calibration_weight argument)" else "" + ) + + # Step 1.2: Welcome message + message( + "Survey variance estimation with the gustave package", + "\n\nThe following features are taken into account:", + if(!is.null(strata)) "\n - stratified simple random sampling" else + "\n - simple random sampling WITHOUT stratification", + if(!is.null(scope_dummy)) "\n - out-of-scope units" else "", + if(!is.null(nrc_weight)) "\n - non-response correction through reweighting" else "", + if(!is.null(calibration_weight)) "\n - calibration on margins" else "", + "\n" + ) + + # Step 2: Control that arguments do exist and retrieve their value ---- + + # Step 2.1: Evaluation of all arguments + deparse_data <- deparse(substitute(data)) + data <- eval(substitute(data), envir = envir) + if(!is.data.frame(data)) stop("data argument must refer to a data.frame") + arg <- lapply(as.list(match.call())[-1], eval) + + # Step 2.2: Expected length + should_be_single_variable_name <- intersect(c( + "id", "dissemination_dummy", "dissemination_weight", + "sampling_weight", "strata", "scope_dummy", + "nrc_weight", "response_dummy", "nrc_dummy", + "calibration_weight", "calibration_dummy" + ), names(arg)) + should_be_variable_name_vector <- intersect(c("calibration_var"), names(arg)) + should_be_variable_name <- c(should_be_single_variable_name, should_be_variable_name_vector) + + # Step 2.3: Check whether arguments are character vectors and + # have the 
expected length + is_single_variable_name <- sapply( + arg[should_be_single_variable_name], + function(arg) is.null(arg) || is_variable_name(arg, max_length = 1) + ) + if(any(!is_single_variable_name)) stop( + "The following arguments do not refer to a variable name (character vector of length 1): ", + names(is_single_variable_name)[!is_single_variable_name] + ) + is_variable_name_vector <- sapply( + arg[should_be_variable_name_vector], + function(arg) is.null(arg) || is_variable_name(arg, max_length = Inf) + ) + if(any(!is_variable_name_vector)) stop( + "The following arguments do not refer to a vector of variable names: ", + names(is_variable_name_vector)[!is_variable_name_vector] + ) + + # Step 2.4: Check the presence of the variables in data + is_not_in_data <- lapply(should_be_variable_name, function(param){ + tmp <- variable_not_in_data(var = arg[[param]], data = data) + if(is.null(tmp)) return(NULL) + paste0("\n - ", param, " argument: ", paste0(tmp, collapse = " ")) + }) + if(length(unlist(is_not_in_data)) > 0) stop( + "Some variables do not exist in ", deparse_data, ": ", + unlist(is_not_in_data[!is.null(is_not_in_data)]) + ) + + # Step 2.5: Retrieve the value of the arguments + data <- data[order(data[[arg$id]]), ] + list2env(c( + lapply( + arg[should_be_single_variable_name], function(param) + stats::setNames(data[[param]], data[[arg$id]]) + ), + lapply(arg[should_be_variable_name_vector], function(param){ + tmp <- data[param] + row.names(tmp) <- data[[arg$id]] + tmp + }) + ), envir = environment()) + + + # Step 3: Control arguments value ---- + + # id + if(anyNA(id)) + stop("The id variable (", arg$id, ") should not contain any missing (NA) values.") + if(any(duplicated(id))) + stop("The id variable (", arg$id, ") should not contain any duplicated values.") + + # dissemination_dummy + if(is.numeric(dissemination_dummy)){ + note("The dissemination dummy variable (", arg$dissemination_dummy, ") is of type numeric. 
It is automatically coerced to logical.") + # TODO: Check whether there is only 1/0 values and do it for all dummies (check_dummy() function) + dissemination_dummy <- as.logical(dissemination_dummy) + } + if(!is.logical(dissemination_dummy)) + stop("The dissemination dummy variable (", arg$dissemination_dummy, ") should be of type logical or numeric.") + if(anyNA(dissemination_dummy)) + stop("The dissemination dummy variable (", arg$dissemination_dummy, ") should not contain any missing (NA) values.") + + # dissemination_weight + if(!is.numeric(dissemination_weight)) + stop("The dissemination weights (", arg$dissemination_weight, ") should be numeric.") + if(anyNA(dissemination_weight[dissemination_dummy])) stop( + "The dissemination weights (", arg$dissemination_weight, ") should not contain ", + "any missing (NA) values for disseminated units (", arg$dissemination_dummy, ")." + ) + + # sampling_weight + if(!is.numeric(sampling_weight)) + stop("The sampling weights (", arg$sampling_weight, ") should be numeric.") + if(anyNA(sampling_weight)) + stop("The sampling weights (", arg$sampling_weight, ") should not contain any missing (NA) values.") + + # strata + if(is.null(strata)) strata <- stats::setNames(factor(rep("1", length(id))), id) + if(!is.null(strata)){ + if(is.character(strata)){ + note("The strata variable (", arg$strata, ") is of type character. It is automatically coerced to factor.") + strata <- factor(strata) + } + if(!is.factor(strata)) + stop("The strata variable (", arg$strata, ") should be of type factor or character.") + if(anyNA(strata)) + stop("The strata variable (", arg$strata, ") should not contain any missing (NA) values.") + } + + # scope_dummy + if(is.null(scope_dummy)) scope_dummy <- rep(TRUE, length(id)) else{ + if(is.numeric(scope_dummy)){ + note("The scope dummy variable (", arg$scope_dummy, ") is of type numeric. 
It is automatically coerced to logical.") + scope_dummy <- as.logical(scope_dummy) + } + if(!is.logical(scope_dummy)) + stop("The scope dummy variable (", arg$scope_dummy, ") should be of type logical or numeric.") + if(anyNA(scope_dummy)) + stop("The scope dummy variable (", arg$scope_dummy, ") should not contain any missing (NA) values.") + disseminated_out_of_scope <- id[dissemination_dummy & !scope_dummy] + if(length(disseminated_out_of_scope) > 0) stop( + "The following units are out-of-scope (", arg$scope_dummy, ") but nonetheless disseminated (", + arg$dissemination_dummy, "): ", display_only_n_first(disseminated_out_of_scope), "." + ) + } + + # response_dummy + if(is.null(response_dummy)) response_dummy <- scope_dummy else{ + if(is.numeric(response_dummy)){ + note("The response dummy variable (", arg$response_dummy, ") is of type numeric. It is automatically coerced to logical.") + response_dummy <- as.logical(response_dummy) + } + if(!is.logical(response_dummy)) + stop("The response dummy variable (", arg$response_dummy, ") should be of type logical or numeric.") + if(anyNA(response_dummy)) + stop("The response dummy variable (", arg$response_dummy, ") should not contain any missing (NA) values.") + } + + # nrc_dummy + if(is.null(nrc_dummy)) nrc_dummy <- scope_dummy else{ + if(is.numeric(nrc_dummy)){ + note("The non-reponse correction dummy variable (", arg$nrc_dummy, ") is of type numeric. 
It is automatically coerced to logical.") + nrc_dummy <- as.logical(nrc_dummy) + } + if(!is.logical(nrc_dummy)) + stop("The non-reponse correction dummy variable (", arg$nrc_dummy, ") should be of type logical or numeric.") + if(anyNA(nrc_dummy)) + stop("The non-reponse correction dummy variable (", arg$nrc_dummy, ") should not contain any missing (NA) values.") + } + + # nrc_weight + if(!is.null(nrc_weight)){ + if(!is.numeric(nrc_weight)) + stop("The weights after non-response correction (", arg$nrc_weight, ") should be numeric.") + if(anyNA(nrc_weight[response_dummy %in% TRUE & nrc_dummy %in% TRUE])) stop( + "The weights after non-response correction (", arg$nrc_weight, ") should not contain any missing (NA) values ", + "for responding units (", arg$response_dummy, ") having taken part in the non-reponse correction process (", arg$nrc_dummy, ")." + ) + + } + + + # calibration_dummy + if(is.null(calibration_dummy) && !is.null(calibration_weight)) calibration_dummy <- response_dummy + if(!is.null(calibration_dummy)){ + if(is.numeric(calibration_dummy)){ + note("The dummy variable indicating the units used in the calibation process (", arg$calibration_dummy, ") is of type numeric. 
It is automatically coerced to logical.") + calibration_dummy <- as.logical(calibration_dummy) + } + if(!is.logical(calibration_dummy)) + stop("The dummy variable indicating the units used in the calibation process (", arg$calibration_dummy, ") should be of type logical or numeric.") + if(anyNA(calibration_dummy)) + stop("The dummy variable indicating the units used in the calibation process (", arg$calibration_dummy, ") should not contain any missing (NA) values.") + } + + # calibration_weight + if(!is.null(calibration_weight)){ + if(!is.numeric(calibration_weight)) + stop("The weights after calibration (", arg$calibration_weight, ") should be numeric.") + if(anyNA(calibration_weight[calibration_dummy %in% TRUE])) + stop("The weights after calibration (", arg$calibration_weight, ") should not contain any missing (NA) values for units used in the calibration process.") + } + + # calibration_var + if(!is.null(calibration_var)){ + calibration_var_quanti <- names(which(sapply(calibration_var, function(var) is.numeric(var) || is.logical(var)))) + calibration_var_quali <- names(which(sapply(calibration_var, function(var) is.factor(var) || is.character(var)))) + calibration_var_pb_type <- setdiff(arg$calibration_var, c(calibration_var_quanti, calibration_var_quali)) + if(length(calibration_var_pb_type) > 0) stop( + "The following calibration variables are neither quantitative (numeric, logical) nor qualitative (factor, character): ", + display_only_n_first(calibration_var_pb_type), "." + ) + if(length(calibration_var_quali) > 0) note( + "Note: The following calibration variables are qualitative (factor, character): ", + display_only_n_first(calibration_var_quali), ". They will be automatically discretized." 
+ ) + calibration_var_pb_NA <- names(which(sapply(calibration_var, function(var) anyNA(var[calibration_dummy %in% TRUE])))) + if(length(calibration_var_pb_NA) > 0) stop( + "The following calibration variables contain missing (NA) values for units used in the calibration process: ", + display_only_n_first(calibration_var_pb_NA, collapse = " "), "." + ) + } + + # Step 4: Define methodological quantities ---- + + samp_exclude <- stats::setNames(rep(FALSE, length(id)), id) + + # Logical controls + inconsistency <- list( + out_of_scope_and_responding = id[!scope_dummy & response_dummy] + ) + if(any(sapply(inconsistency, length) > 0)) stop( + "Some arguments are inconsistent:", + if(length(inconsistency$out_of_scope_and_responding) > 0) paste0( + "\n - the following units are classified both as out-of-scope units (", + arg$scope_dummy, " variable) and as responding units (", arg$response_dummy, + " variable): ", display_only_n_first(inconsistency$out_of_scope_and_responding), "." + ) + ) + + # Exclude strata with only one sampled unit + strata_with_one_sampled_unit <- + names(which(tapply(id[!samp_exclude], strata[!samp_exclude], length) == 1)) + if(length(strata_with_one_sampled_unit) > 0){ + warn( + "The following strata contain less than two sampled units: ", + display_only_n_first(strata_with_one_sampled_unit), ". ", + "They are excluded from the variance estimation process (but kept for point estimates)." 
+ ) + samp_exclude <- samp_exclude | as.character(strata) %in% strata_with_one_sampled_unit + } + + # Enforce equal probabilities in each stratum + sampling_weight_equal <- sampling_weight + strata_with_unequal_sampling_weight <- + names(which(tapply(sampling_weight_equal[!samp_exclude], strata[!samp_exclude], stats::sd) > 1e-6)) + if(length(strata_with_unequal_sampling_weight) > 0){ + # TODO: Enhance warning message when strata = NULL + warn( + "The following strata contain units whose sampling weights are not exactly equal: ", + display_only_n_first(strata_with_unequal_sampling_weight), ". ", + "The mean weight per stratum is used instead." + ) + sampling_weight_equal[!samp_exclude] <- + tapply(sampling_weight_equal, strata, base::mean)[as.character(strata[!samp_exclude])] + } + + # Reference id and reference weight + guessed_weight <- sampling_weight + if(!is.null(nrc_weight)) guessed_weight[response_dummy & nrc_dummy] <- nrc_weight[response_dummy & nrc_dummy] + if(!is.null(calibration_weight)) guessed_weight[calibration_dummy] <- calibration_weight[calibration_dummy] + guessed_weight_not_matching_dissemination_weight <- id[dissemination_dummy & guessed_weight != dissemination_weight] + if(length(guessed_weight_not_matching_dissemination_weight)) stop( + "The following units have a disseminated weight (", arg$dissemination_weight, + ") that does not match the one guessed from the survey description: ", + display_only_n_first(guessed_weight_not_matching_dissemination_weight), "." 
+ ) + reference_id <- id[dissemination_dummy] + reference_weight <- dissemination_weight[dissemination_dummy] + + # Sampling + samp <- list() + samp$id <- id + samp$exclude <- samp_exclude[samp$id] + samp$weight <- sampling_weight_equal[samp$id] + samp$strata <- strata[samp$id] + samp$precalc <- suppressMessages(with(samp, var_srs( + y = NULL, pik = 1 / weight[!exclude], strata = strata[!exclude] + ))) + samp <- samp[c("id", "exclude", "precalc")] + + # Non-reponse + if(!is.null(nrc_weight)){ + nrc <- list() + nrc$id <- id[response_dummy] + nrc$sampling_weight <- sampling_weight[nrc$id] + nrc$response_prob <- (sampling_weight / nrc_weight)[nrc$id] + }else nrc <- NULL + + # Calibration + if(!is.null(calibration_weight)){ + calib <- list() + calib$id <- id[response_dummy & calibration_dummy] + calib$weight <- calibration_weight[calib$id] + calib$var <- calibration_var[calib$id, , drop = FALSE] + calib$var[calibration_var_quanti] <- + lapply(calib$var[calibration_var_quanti], Matrix) + calib$var[calibration_var_quali] <- + lapply(calib$var[calibration_var_quali], discretize_qualitative_var) + calib$var <- do.call(cbind, calib$var) + # TODO: Handle the node stack overflow problem + calib$precalc <- res_cal(y = NULL, x = calib$var, w = calib$weight) + calib <- calib[c("id", "precalc")] + }else calib <- NULL + + + + # Step 5: Define the variance wrapper ---- + qvar_variance_wrapper <- define_variance_wrapper( + variance_function = qvar_variance_function, + reference_id = reference_id, + reference_weight = reference_weight, + default_id = arg$id, + technical_data = list(samp = samp, nrc = nrc, calib = calib) + ) + + qvar_variance_wrapper + +} + + +qvar_variance_function <- function(y, samp, nrc, calib){ + + var <- list() + + # Calibration + if(!is.null(calib)){ + y <- add_zero(y, calib$id, remove = FALSE) + y[calib$id, ] <- res_cal(y = y[calib$id, , drop = FALSE], precalc = calib$precalc) + } + + # Non-response + if(!is.null(nrc)){ + var[["nr"]] <- var_pois(y[nrc$id, , 
drop = FALSE], pik = nrc$response_prob, w = nrc$sampling_weight) + y[nrc$id, ] <- as.matrix(Diagonal(x = 1 / nrc$response_prob) %*% y[nrc$id, , drop = FALSE]) + } + + # Sampling + y <- add_zero(y, rownames = samp$id) + var[["sampling"]] <- var_srs(y = y[!samp$exclude, , drop = FALSE], precalc = samp$precalc) + + # Final summation + Reduce(`+`, var) + +} + + + diff --git a/R/standard_statistic_wrapper.R b/R/standard_statistic_wrapper.R new file mode 100644 index 0000000..162299d --- /dev/null +++ b/R/standard_statistic_wrapper.R @@ -0,0 +1,121 @@ +#' Standard statistic wrappers +#' +#' @description Functions to be used within variance estimation +#' wrappers in order to specify which statistic is to be estimated. +#' +#' @param y A vector corresponding to the variable to calculate the statistic on. +#' If not numeric (character or factor), it is automatically discretized. +#' @param num,num1,num2 Numerical vector(s) corresponding to the numerator(s) +#' to be used in the estimation. +#' @param denom,denom1,denom2 Numerical vector(s) corresponding to the denominator(s) +#' to be used in the estimation. +#' @param by Factor vector (character vectors are coerced to factors) whose levels are used +#' to break down the estimation by domains. +#' @param where Logical vector indicating the domain to perform variance estimation on. +#' +#' @details When the estimator is not the estimator of a total, the application of +#' analytical variance estimation formulae developed for the estimator of a total +#' is not straightforward (Deville, 1999). An asymptotically unbiased variance +#' estimator can nonetheless be obtained if the estimation of variance is performed +#' on a variable obtained from the original data through a linearization step. 
+#' +#' The \code{ratio}, \code{mean}, \code{diff_of_ratio} and \code{ratio_of_ratio} +#' functions produce the point estimate of the statistic and derive the +#' corresponding linearized variable which is later on passed on to the variance +#' estimation function within the variance estimation wrapper. +#' +#' Note: The \code{total} function does not perform any linearization +#' (as none is needed for the estimator of a total) and solely produces the +#' corresponding point estimator. +#' +#' @seealso \code{\link{define_statistic_wrapper}}, \code{\link{define_variance_wrapper}} +#' +#' @references +#' Caron N. (1998), "Le logiciel Poulpe : aspects méthodologiques", \emph{Actes +#' des Journées de méthodologie statistique} \url{http://jms-insee.fr/jms1998s03_1/} +#' +#' Deville J.-C. (1999), "Variance estimation for complex statistics and +#' estimators: linearization and residual techniques", \emph{Survey Methodology}, +#' 25:193–203 +#' +#' @examples # See qvar examples +#' +#' @author Martin Chevalier +#' +#' @name standard_statistic_wrapper +#' @aliases total ratio mean diff_of_ratio ratio_of_ratio + +NULL + +#' @rdname standard_statistic_wrapper +total <- define_statistic_wrapper( + statistic_function = function(y, weight){ + na <- is.na(y) + y[na] <- 0 + point <- sum(y * weight) + list(point = point, lin = y, n = sum(!na)) + }, + arg_type = list(data = "y" , weight = "weight") +) + +#' @rdname standard_statistic_wrapper +ratio <- define_statistic_wrapper( + statistic_function = function(num, denom, weight){ + na <- is.na(num) | is.na(denom) + num[na] <- 0 + denom[na] <- 0 + est_num <- sum(num * weight) + est_denom <- sum(denom * weight) + point <- est_num / est_denom + lin <- (num - point * denom ) / est_denom + list(point = point, lin = lin, n = sum(!na), est_num = est_num, est_denom = est_denom) + }, + arg_type = list(data = c("num", "denom") , weight = "weight") +) + +#' @rdname standard_statistic_wrapper +mean <- define_statistic_wrapper( + 
statistic_function = function(y, weight){ + environment(ratio)$statistic_function(num = y, denom = rep(1, length(y)), weight = weight) + }, + arg_type = list(data = "y" , weight = "weight") +) + +#' @rdname standard_statistic_wrapper +diff_of_ratio <- define_statistic_wrapper( + statistic_function = function(num1, denom1, num2, denom2, weight){ + na <- is.na(num1) | is.na(denom1) | is.na(num2) | is.na(denom2) + num1[na] <- 0 + denom1[na] <- 0 + num2[na] <- 0 + denom2[na] <- 0 + ratio1 <- environment(ratio)$statistic_function(num = num1, denom = denom1, weight = weight) + ratio2 <- environment(ratio)$statistic_function(num = num2, denom = denom2, weight = weight) + point <- ratio1$point - ratio2$point + lin <- ratio1$lin - ratio2$lin + list(point = point, lin = lin, n = sum(!na)) + }, + arg_type = list(data = c("num1", "denom1", "num2", "denom2") , weight = "weight") +) + +#' @rdname standard_statistic_wrapper +ratio_of_ratio <- define_statistic_wrapper( + statistic_function = function(num1, denom1, num2, denom2, weight){ + na <- is.na(num1) | is.na(denom1) | is.na(num2) | is.na(denom2) + num1[na] <- 0 + denom1[na] <- 0 + num2[na] <- 0 + denom2[na] <- 0 + est_num1 <- sum(num1 * weight) + est_denom1 <- sum(denom1 * weight) + est_num2 <- sum(num2 * weight) + est_denom2 <- sum(denom2 * weight) + point <- (est_num1 / est_denom1) / (est_num2 / est_denom2) + lin <- point * ( + (num1 / est_num1) - (num2 / est_num2) - + (denom1 / est_denom1) + (denom2 / est_denom2) + ) + list(point = point, lin = lin, n = sum(!na)) + }, + arg_type = list(data = c("num1", "denom1", "num2", "denom2") , weight = "weight") +) diff --git a/R/utils.R b/R/utils.R index 288f081..89f52ac 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,232 +1,367 @@ - - -#' Efficient by-group (weighted) summation -#' -#' @description \code{sumby()} performs an efficient and optionally weighted -#' by-group summation by using linear algebra and the Matrix package -#' capabilities. 
The by-group summation is performed through matrix cross-product -#' of the y parameter (coerced to a matrix if needed) with a (very) sparse -#' matrix built up using the by and the (optional) w parameters. -#' -#' Compared to base R, dplyr or data.table alternatives, this implementation -#' aims at being easier to use in a matrix-oriented context and can yield -#' efficiency gains when the number of columns becomes high. -#' -#' @param y A (sparse) vector, a (sparse) matrix or a data.frame. -#' The object to perform by-group summation on. -#' @param by The factor variable defining the by-groups. Character variables -#' are coerced to factors. -#' @param w The optional weight to be used in the summation. -#' @param na_rm Should NA values in y be removed (ie treated as 0 in the summation) ? -#' Similar to na.rm argument in \code{\link[base]{sum}}, but TRUE by default. -#' If FALSE, NA values in y produce NA values in the result. -#' @param keep_sparse When y is a sparse vector or a sparse matrix, should the result -#' also be sparse ? FALSE by default. As \code{\link[Matrix]{sparseVector-class}} does -#' not have a name attribute, when y is a sparseVector the result does not have any -#' name (and a warning is cast). -#' -#' @return A vector, a matrix or a data.frame depending on the type of y. If y is -#' sparse and keep_sparse is TRUE, then the result is also sparse (without names -#' when it is a sparse vector, see keep_sparse argument for details). 
-#' -#' @author Martin Chevalier -#' -#' @examples # Data generation -#' set.seed(1) -#' n <- 100 -#' p <- 10 -#' H <- 3 -#' y <- matrix(rnorm(n*p), ncol = p, dimnames = list(NULL, paste0("var", 1:10))) -#' y[1, 1] <- NA -#' by <- letters[sample.int(H, n, replace = TRUE)] -#' w <- rep(1, n) -#' w[by == "a"] <- 2 -#' -#' # Standard use -#' sumby(y, by) -#' -#' # Keeping the NAs -#' sumby(y, by, na_rm = FALSE) -#' -#' # With a weight -#' sumby(y, by, w = w) -#' -#' @export -#' @import Matrix - -sumby <- function(y, by, w = NULL, na_rm = TRUE, keep_sparse = FALSE){ - - # y <- V - - # Type of y - class_y <- class(y) - is_data.frame_y <- is.data.frame(y) - if(is_data.frame_y) y <- as.matrix(y) - is_sparse_y <- inherits(y, c("Matrix", "sparseVector")) - is_vector_y <- is.null(dim(y)) - is_numeric_y <- is.numeric(if(!is_sparse_y) y else y@x) - if(!is_numeric_y) stop("y is not numeric (or not entirely).") - if(!is_sparse_y | is_vector_y) y <- methods::as(y, "sparseMatrix") - - # Weight, NA in y - if(is.null(w)) w <- rep(1, NROW(y)) - if(!is.numeric(w)) stop("w is not numeric") - if(na_rm) y[is.na(y)] <- 0 - - # NA in by - NA_in_by <- is.na(by) - if(any(NA_in_by)){ - y <- y[!NA_in_by, , drop = FALSE] - by <- by[!NA_in_by] - w <- w[!NA_in_by] - } - - # Matrix cross-product - by <- as.factor(by) - x <- block_matrix(w, by)$y - colnames(x) <- levels(by) - r <- crossprod(x, y) - - # Type of r - if(!is_sparse_y | !keep_sparse){ - r <- if(is_vector_y) stats::setNames(as.vector(r), rownames(r)) else as.matrix(r) - }else{ - if(is_vector_y) warning("sparseVector can't have names, hence the output won't have names.") - r <- methods::as(r, class_y) - } - if(is_data.frame_y) r <- as.data.frame(r) - - r - -} - - -#' Expand a matrix or a data.frame with zeros based on rownames matching -#' -#' @description For a given two-dimensional object with rownames and a character -#' vector, \code{add0()} produces a corresponding object whose rownames match -#' the character vector, with zeros on 
the additional rows. -#' -#' This function is an easy-to-use and reliable way to reintroduce -#' non-responding units in the variance estimation process (after the -#' non-response phase is taken into account). -#' -#' @param y A (sparse) matrix or a data.frame. The object to add zeros to. -#' @param rownames A character vector (other types are coerced to character). -#' The character vector giving the rows of the produced object. -#' @param remove Should rows of y whose name do not appear in the rownames -#' argument be removed ? TRUE by default, a warning is shown when rows are -#' removed. -#' -#' @return A (sparse) matrix or data.frame depending on the type of y. -#' -#' @author Martin Chevalier -#' -#' @examples # Data generation -#' set.seed(1) -#' n <- 10 -#' p <- 2 -#' y <- matrix(1:(n*p), ncol = p, dimnames = list(sample(letters, n))) -#' y[c(3, 8, 12)] <- NA -#' rownames <- letters -#' -#' # Standard use -#' add0(y, rownames) -#' -#' # Use when rownames in y do not match -#' # any element in the rownames argument -#' rownames(y)[1:3] <- toupper(rownames(y)[1:3]) -#' add0(y, rownames) -#' add0(y, rownames, remove = FALSE) -#' -#' @import Matrix -#' @export -#' -add0 <- function(y, rownames, remove = TRUE){ - - # y <- m; rownames <- letters - - # Type of y - class_y <- class(y) - is_data.frame_y <- is.data.frame(y) - if(is_data.frame_y) y <- as.matrix(y) - if(is.null(dim(y))) - stop("y must be a (sparse) matrix or a data.frame.") - if(is.null(rownames(y))) - stop("y must have rownames in order to be used in add0().") - is_sparse_y <- inherits(y, c("Matrix", "sparseVector")) - is_numeric_y <- is.numeric(if(!is_sparse_y) y else y@x) - if(!is_numeric_y) stop("y is not numeric (or not entirely).") - - # Prepare rownames argument - rownames <- rownames[!is.na(rownames)] - rownames <- as.character(rownames) - - # Expand y with 0 in order to get an object whose rownames - # are the character argument rownames (in the same order) - compl <- setdiff(rownames, 
#' Efficient by-group (weighted) summation
#'
#' @description \code{sum_by} performs an efficient and optionally weighted
#' by-group summation by using linear algebra and the Matrix package
#' capabilities. The by-group summation is performed through matrix cross-product
#' of the \code{y} parameter (coerced to a matrix if needed) with a (very) sparse
#' matrix built up using the \code{by} and the (optional) \code{w} parameters.
#'
#' Compared to base R, dplyr or data.table alternatives, this implementation
#' aims at being easier to use in a matrix-oriented context and can yield
#' efficiency gains when the number of columns becomes high.
#'
#' @param y A (sparse) vector, a (sparse) matrix or a data.frame.
#' The object to perform by-group summation on.
#' @param by The factor variable defining the by-groups. Character variables
#' are coerced to factors.
#' @param w The optional row weights to be used in the summation.
#' @param na_rm Should \code{NA} values in \code{y} be removed (ie treated as 0 in the summation)?
#' Similar to \code{na.rm} argument in \code{\link[base]{sum}}, but \code{TRUE} by default.
#' If \code{FALSE}, \code{NA} values in \code{y} produce \code{NA} values in the result.
#' @param keep_sparse When \code{y} is a sparse vector or a sparse matrix, should the result
#' also be sparse? \code{FALSE} by default. As \code{\link[Matrix]{sparseVector-class}} does
#' not have a name attribute, when \code{y} is a sparseVector the result does not have any
#' name (and a warning is cast).
#'
#' @return A vector, a matrix or a data.frame depending on the type of \code{y}. If \code{y} is
#' sparse and \code{keep_sparse = TRUE}, then the result is also sparse (without names
#' when it is a sparse vector, see keep_sparse argument for details).
#'
#' @author Martin Chevalier
#'
#' @examples # Data generation
#' set.seed(1)
#' n <- 100
#' p <- 10
#' H <- 3
#' y <- matrix(rnorm(n*p), ncol = p, dimnames = list(NULL, paste0("var", 1:10)))
#' y[1, 1] <- NA
#' by <- letters[sample.int(H, n, replace = TRUE)]
#' w <- rep(1, n)
#' w[by == "a"] <- 2
#'
#' # Standard use
#' sum_by(y, by)
#'
#' # Keeping the NAs
#' sum_by(y, by, na_rm = FALSE)
#'
#' # With a weight
#' sum_by(y, by, w = w)
#'
#' @export
#' @import Matrix

sum_by <- function(y, by, w = NULL, na_rm = TRUE, keep_sparse = FALSE){

  # Record the type of y so that the result can be coerced back to it
  class_y <- class(y)
  is_data.frame_y <- is.data.frame(y)
  if(is_data.frame_y) y <- as.matrix(y)
  is_sparse_y <- inherits(y, c("Matrix", "sparseVector"))
  is_vector_y <- is.null(dim(y))
  is_numeric_y <- is.numeric(if(!is_sparse_y) y else y@x)
  if(!is_numeric_y) stop("y is not numeric (or not entirely).")
  # All computations are carried out on a sparse matrix
  if(!is_sparse_y | is_vector_y) y <- methods::as(y, "sparseMatrix")

  # Weight, NA in y
  if(is.null(w)) w <- rep(1, NROW(y))
  if(!is.numeric(w)) stop("w is not numeric")
  if(na_rm) y[is.na(y)] <- 0

  # Rows with NA in by are excluded from the summation altogether
  NA_in_by <- is.na(by)
  if(any(NA_in_by)){
    y <- y[!NA_in_by, , drop = FALSE]
    by <- by[!NA_in_by]
    w <- w[!NA_in_by]
  }

  # The by-group (weighted) sum is the cross-product of y with a sparse
  # indicator matrix carrying the weights (one column per by-group)
  by <- as.factor(by)
  x <- make_block(w, by)
  colnames(x) <- levels(by)
  r <- crossprod(x, y)

  # Coerce the result back to the type of y (unless keep_sparse applies)
  if(!is_sparse_y | !keep_sparse){
    r <- if(is_vector_y) stats::setNames(as.vector(r), rownames(r)) else as.matrix(r)
  }else{
    if(is_vector_y) warn("sparseVector can't have names, hence the output won't have names.")
    r <- methods::as(r, class_y)
  }
  if(is_data.frame_y) r <- as.data.frame(r)

  r

}


#' Expand a matrix or a data.frame with zeros based on rownames matching
#'
#' @description For a given two-dimensional object with rownames and a character
#' vector, \code{add_zero} produces a corresponding object whose rownames match
#' the character vector, with zeros on the additional rows.
#'
#' This function is an easy-to-use and reliable way to reintroduce
#' non-responding units in the variance estimation process (after the
#' non-response phase is taken into account).
#'
#' @param y A (sparse) matrix or a data.frame. The object to add zeros to.
#' @param rownames A character vector (other types are coerced to character).
#' The character vector giving the rows of the produced object.
#' @param remove Should rows of \code{y} whose name do not appear in the rownames
#' argument be removed? TRUE by default, a warning is shown when rows are
#' removed.
#'
#' @return A (sparse) matrix or data.frame depending on the type of \code{y}.
#'
#' @author Martin Chevalier
#'
#' @examples # Data generation
#' set.seed(1)
#' n <- 10
#' p <- 2
#' y <- matrix(1:(n*p), ncol = p, dimnames = list(sample(letters, n)))
#' y[c(3, 8, 12)] <- NA
#' rownames <- letters
#'
#' # Standard use
#' add_zero(y, rownames)
#'
#' # Use when rownames in y do not match
#' # any element in the rownames argument
#' rownames(y)[1:3] <- toupper(rownames(y)[1:3])
#' add_zero(y, rownames)
#' add_zero(y, rownames, remove = FALSE)
#'
#' @import Matrix
#' @export
#'
add_zero <- function(y, rownames, remove = TRUE){

  # Type of y (y must be two-dimensional, numeric and have rownames)
  class_y <- class(y)
  is_data.frame_y <- is.data.frame(y)
  if(is_data.frame_y) y <- as.matrix(y)
  if(is.null(dim(y)))
    stop("y must be a (sparse) matrix or a data.frame.")
  if(is.null(rownames(y)))
    stop("y must have rownames in order to be used in add_zero().")
  is_sparse_y <- inherits(y, c("Matrix", "sparseVector"))
  is_numeric_y <- is.numeric(if(!is_sparse_y) y else y@x)
  if(!is_numeric_y) stop("y is not numeric (or not entirely).")

  # Prepare rownames argument
  rownames <- rownames[!is.na(rownames)]
  rownames <- as.character(rownames)

  # Expand y with 0 in order to get an object whose rownames
  # are the character argument rownames (in the same order)
  compl <- setdiff(rownames, rownames(y))
  if(!is_sparse_y){
    r <- rbind(y, matrix(0, nrow = length(compl), ncol = NCOL(y), dimnames = list(compl)))
    if(is_data.frame_y) r <- as.data.frame(r)
  }else{
    r <- rbind(y, Matrix(0, nrow = length(compl), ncol = NCOL(y), dimnames = list(compl, NULL)))
    r <- methods::as(r, class_y)
  }

  # Remove rows that do not match any element in rownames
  # if remove is TRUE
  if(remove){
    if(length(setdiff(rownames(y), rownames)))
      warn("The name of some rows in y do not match any element in the rownames argument. These rows are removed from the result (use remove = FALSE to change this behaviour).")
    o <- rownames
  }else o <- order(rownames(r))

  r[o, , drop = FALSE]

}


# TODO: Export and document make_block()
# Build a block-diagonal expansion of y: one block of columns per level of by.
make_block <- function(y, by){

  # Step 1: Prepare the by argument
  by <- droplevels(as.factor(by))
  H <- length(levels(by))
  if(H == 1) return(y)

  # Step 2: Coerce y to a TsparseMatrix and remove NA values
  res <- coerce_to_TsparseMatrix(y)
  if(any(is.na(by))){
    na <- is.na(res@j)
    res@x <- res@x[!na]
    res@i <- res@i[!na]
    res@j <- res@j[!na]
  }

  # Step 3: Adjust the j and Dim slots in order to obtain the block matrix
  # (each non-zero cell is shifted to the column block of its by-group)
  p <- NCOL(res)
  res@Dimnames[2] <- list(NULL)
  res@j <- as.integer(((as.numeric(by) - 1) * p)[res@i + 1] + res@j)
  res@Dim <- c(res@Dim[1], as.integer(res@Dim[2] * H))

  # Step 4: Export the result with relevant attributes
  attr(res, "rowby") <- as.character(by)
  attr(res, "colby") <- as.character(rep(levels(by), each = p))
  res

}


# Unexported (and undocumented) functions

# From devtools (https://github.com/r-lib/devtools/blob/master/R/utils.r)
"%||%" <- function(a, b) if (!is.null(a)) a else b

# Coerce a vector or matrix to a Matrix::TsparseMatrix, preserving (dim)names.
coerce_to_TsparseMatrix <- function(y){
  if(is.null(dim(y))){
    names_y <- names(y)
    res <- Matrix::sparseMatrix(
      x = unname(y), i = seq_along(y), j = rep(1, length(y)), giveCsparse = FALSE
    )
    if(!is.null(names_y)) rownames(res) <- names_y
  }else if(!methods::is(y,"TsparseMatrix")){
    dimnames_y <- dimnames(y)
    res <- methods::as(y, "TsparseMatrix")
    if(!is.null(dimnames_y)) dimnames(res) <- dimnames_y
  }else res <- y
  res
}

# If y already has a block structure matching by (each column non-zero in at
# most one by-group), tag it with rowby/colby attributes; otherwise return NULL.
detect_block <- function(y, by){
  by <- droplevels(as.factor(by))
  y_bool <- coerce_to_TsparseMatrix(y) != 0
  by_bool <- make_block(rep(TRUE, NROW(y)), by)
  prod <- crossprod(by_bool, y_bool)
  prod_bool <- prod > 0
  if(!all(colSums(prod_bool) <= 1)) return(NULL)
  attr(y, "rowby") <- as.character(by)
  attr(y, "colby") <- rep(levels(by), NCOL(prod_bool))[as.vector(prod_bool)]
  y
}

# Rebuild FUN with a new enclosing environment (deparse/parse round-trip).
change_enclosing <- function(FUN, envir = environment(FUN)){
  eval(parse(text = deparse(FUN)), envir = envir)
}

# Copy objects from one environment to another; functions get their closure
# environments recursively copied as well (unless listed in not_closure).
assign_all <- function(objects, to, from = parent.frame(), not_closure = c(list(globalenv()), sys.frames())){
  for(n in objects){
    get_n <- get(n, from)
    if(!is.function(get_n)){
      assign(n, get_n, envir = to)
    }else{
      tmp <- new.env(parent = to)
      env_n <- environment(get_n)
      not_closure <- c(not_closure, from)
      is_closure <- !any(sapply(not_closure, identical, env_n))
      if(is_closure)
        assign_all(ls(env_n, all.names = TRUE), to = tmp, from = env_n, not_closure = not_closure)
      assign(n, change_enclosing(get_n, envir = tmp), envir = to)
    }
  }
}

# TRUE if evaluating expr raises an error.
is_error <- function(expr)
  inherits(try(expr, silent = TRUE), "try-error")

# TRUE if param is a character vector of at most max_length names, all of
# which (when data is provided) are variable names of data.
is_variable_name <- function(param, data = NULL, max_length = 1)
  is.character(param) &&
  (is.null(data) || length(setdiff(param, names(data))) == 0) &&
  length(param) > 0 && length(param) <= max_length

# Elements of var that are not variable names of data (NULL if none).
variable_not_in_data <- function(var, data){
  if(is.null(var)) return(NULL)
  tmp <- var[!(var %in% names(data))]
  if(length(tmp) == 0) return(NULL)
  tmp
}

# Turn character variable names found in arg_list into symbols (for later
# evaluation within data); vectors of names are expanded and recycled.
replace_variable_name_with_symbol <- function(arg_list, data, single = TRUE){
  # TODO: Allow consistent evaluation through parent frames
  # TODO: Handle the case of apparent name without match in data variable names
  tmp <- lapply(arg_list, function(a){
    if(is_error(a_eval <- eval(a, envir = data))){
      a_out <- list(a)
    }else if(is_variable_name(a_eval, data = data, max_length = Inf)){
      if(single && !is_variable_name(a_eval, data = data, max_length = 1))
        stop("Only single variable names are allowed for the by argument.")
      a_out <- lapply(a_eval, as.symbol)
    }else a_out <- list(a)
    a_out
  })
  if(!single){
    tmp_length <- sapply(tmp, length)
    if(!all(tmp_length %in% c(1, max(tmp_length))))
      stop("Some arguments have longer variable vectors than others.")
    tmp[tmp_length == 1] <-
      lapply(tmp[tmp_length == 1], `[`, rep(1, max(tmp_length)))
  }else if(length(tmp) == 1) tmp[1] <- tmp[[1]]
  tmp
}

# Immediate, call-less warning and a "Note: "-prefixed message.
warn <- function(...) warning(..., "\n", call. = FALSE, immediate. = TRUE)
note <- function(...) message("Note: ", ..., "\n")

is_statistic_wrapper <- function(x) inherits(x, "gustave_statistic_wrapper")

# Names of x, with NA for missing or empty names.
names_else_NA <- function(x){
  if(is.null(names(x))) rep(NA, length(x)) else{
    tmp <- names(x)
    tmp[tmp %in% ""] <- NA
    tmp
  }
}

# Sparse disjunctive (dummy) coding of a qualitative variable,
# propagating NA values row-wise.
discretize_qualitative_var <- function(var, logical = FALSE){
  var <- droplevels(as.factor(var))
  result <- Matrix(nrow = length(var), ncol = length(levels(var)))
  result[!is.na(var), ] <- Matrix::sparse.model.matrix(~ var - 1)
  result[is.na(var), ] <- NA
  if(!logical) result <- result * 1
  rownames(result) <- names(var)
  colnames(result) <- levels(var)
  result
}

# Walk up the call stack looking for an object named "execution_envir".
# NOTE(review): the loop condition uses || identical(baseenv(), ...) — this
# looks like it was meant to be && !identical(...); confirm against callers.
get_through_parent_frame <- function(x){
  n <- 0
  found <- NULL
  while(is.null(found) || identical(baseenv(), parent.frame(n))){
    n <- n + 1
    found <- get0("execution_envir", parent.frame(n))
  }
  found
}

# Collapse x for display, truncating after n elements with an "... and k more" tail.
display_only_n_first <- function(x,
                                 n = 10,
                                 collapse = ", ",
                                 final_text = paste0(" and ", length(x) - n, " more")
){
  if(length(x) <= n){
    paste(x, collapse = collapse)
  }else{
    paste0(paste(x[1:n], collapse = collapse), final_text)
  }
}

# rbind a list of data.frames with possibly differing columns (missing
# columns filled with NA, all-NA columns dropped from the result).
rbind_output_df <- function(list_output_df){
  names <- unique(do.call(base::c, lapply(list_output_df, names)))
  output_df <- do.call(rbind, lapply(list_output_df, function(i){
    i[, setdiff(names, names(i))] <- NA
    i[, names]
  }))
  output_df <- output_df[, sapply(output_df, function(i) !all(is.na(i)))]
  rownames(output_df) <- NULL
  output_df
}
# ---- R/variance_function.R ----

#' Linear Regression Residuals Calculation
#'
#' @description \code{res_cal} calculates linear regression residuals in an
#' efficient way : handling several dependent variables at a time, using
#' Matrix::TsparseMatrix capabilities and allowing for pre-calculation of
#' the matrix inverse.
#'
#' @param y A (sparse) numerical matrix of dependent variable(s).
#' @param x A (sparse) numerical matrix of independent variable(s).
#' @param w An optional numerical vector of row weights.
#' @param by An optional categorical vector (factor or character)
#' when residuals calculation is to be conducted within by-groups
#' (see Details).
#' @param precalc A list of pre-calculated results (see Details).
#' @param id A vector of identifiers of the units used in the calculation.
#' Useful when \code{precalc = TRUE} in order to assess whether the ordering of the
#' \code{y} data matrix matches the one used at the precalculation step.
#'
#' @details In the context of the \code{gustave} package, linear
#' regression residual calculation is solely used to take into account
#' the effect of calibration on variance estimation. Independent variables
#' are therefore most likely to be the same from one variance estimation
#' to another, hence the inversion of the matrix
#' \code{t(x) \%*\% Diagonal(x = w) \%*\% x} can be done once and for all
#' at a pre-calculation step.
#'
#' The parameters \code{y} and \code{precalc} determine whether a list of
#' pre-calculated data should be used in order to speed up the regression
#' residuals computation at execution time:
#' \itemize{
#' \item if \code{y} not \code{NULL} and \code{precalc} \code{NULL} :
#' on-the-fly calculation of the matrix inverse and the regression residuals
#' (no pre-calculation).
#' \item if \code{y} \code{NULL} and \code{precalc} \code{NULL} :
#' pre-calculation of the matrix inverse which is stored in a list of
#' pre-calculated data.
#' \item if \code{y} not \code{NULL} and \code{precalc} not \code{NULL} :
#' calculation of the regression residuals using the list of pre-calculated
#' data.
#' }
#'
#' The \code{by} parameter allows for calculation within by-groups : all
#' calculation are made separately for each by-group (when calibration was
#' conducted separately on several subsamples), but in an efficient way using
#' Matrix::TsparseMatrix capabilities (especially when the matrix inverse is
#' pre-calculated).
#'
#' @return \itemize{ \item if \code{y} is not \code{NULL} (calculation step) : a
#' numerical matrix with same structure (regular base::matrix or
#' Matrix::TsparseMatrix) and dimensions as \code{y}. \item if \code{y} is
#' \code{NULL} (pre-calculation step) : a list containing pre-calculated data.}
#'
#' @author Martin Chevalier
#'
#' @examples # Generating random data
#' set.seed(1)
#' n <- 100
#' H <- 5
#' y <- matrix(rnorm(2*n), nrow = n)
#' x <- matrix(rnorm(10*n), nrow = n)
#' by <- letters[sample(1:H, n, replace = TRUE)]
#'
#' # Direct calculation
#' res_cal(y, x)
#'
#' # Calculation with pre-calculated data
#' precalc <- res_cal(y = NULL, x)
#' res_cal(y, precalc = precalc)
#' identical(res_cal(y, x), res_cal(y, precalc = precalc))
#'
#' # Matrix::TsparseMatrix capability
#' require(Matrix)
#' X <- as(x, "TsparseMatrix")
#' Y <- as(y, "TsparseMatrix")
#' identical(res_cal(y, x), as.matrix(res_cal(Y, X)))
#'
#' # by parameter for within by-groups calculation
#' res_cal(Y, X, by = by)
#' identical(
#'   res_cal(Y, X, by = by)[by == "a", ],
#'   res_cal(Y[by == "a", ], X[by == "a", ])
#' )
#'
#' @export

res_cal <- function(y = NULL, x, w = NULL, by = NULL, precalc = NULL, id = NULL){

  if(is.null(precalc)){

    if(is.null(w)) w <- rep(1, NROW(x))

    # Taking the by into account: reuse an existing block structure when
    # possible, otherwise build the block-diagonal expansion of x
    x <- coerce_to_TsparseMatrix(x)
    if(!is.null(by)) x <- detect_block(x, by) %||% make_block(x, by)

    # Determine the inverse of the t(x) %*% x matrix while removing colinear variables
    while(TRUE){
      u <- crossprod(x, Matrix::Diagonal(x = w) %*% x)
      if(Matrix::rankMatrix(u, method = "qr") != NROW(u)){
        is_colinear <- as.vector(is.na(stats::lm.fit(x = as.matrix(u), y = rep(1, NROW(u)))$coef))
        if(any(is_colinear)) note("Some variables in x were discarded due to collinearity.")
        x <- x[, !is_colinear, drop = FALSE]
      }else break
    }
    inv <- solve(u)

  }else list2env(precalc, envir = environment())

  if(is.null(y)) return(list(x = x, w = w, inv = inv)) else {
    class_y <- class(y)
    dimnames_y <- dimnames(y)
    y <- coerce_to_TsparseMatrix(y)
    # Guard against a y matrix whose row ordering does not match the
    # ordering used at the pre-calculation step
    if(!is.null(precalc) && !is.null(id) && !is.null(rownames(y)) && !identical(as.character(id), rownames(y))) stop(
      "The names of the data matrix (y argument) do not match the reference id (id argument)."
    )
    # Residuals: e = y - x (x'Wx)^-1 x'Wy
    e <- y - x %*% ( inv %*% crossprod(x, Matrix::Diagonal(x = w) %*% y) )
    if(class(e) != class_y) e <- methods::as(e, class_y)
    dimnames(e) <- dimnames_y
    e
  }

}

# n <- 70000; p <- 10; q <- 150; H <- 6; y <- matrix(rnorm(n*p),ncol=p); x <- Matrix(rnorm(n*q)*(runif(n*q) > 0.98),ncol=q); w <- runif(n); by <- rep(1:H,n %/% H + 1)[1:n][sample.int(n)];
# precalc <- res_cal(y = NULL, x = x, w = w, by = by)
# microbenchmark(times = 10, res_cal(y, precalc = precalc), res_cal(y, x = x, w = w, by = by))

#' Variance approximation with Deville-Tillé (2005) formula
#'
#' @aliases varDT var_srs
#'
#' @description \code{varDT} estimates the variance of the estimator of a total
#' in the case of a balanced sampling design with equal or unequal probabilities
#' using Deville-Tillé (2005) formula. Without balancing variables, it falls back
#' to Deville's (1993) classical approximation. Without balancing variables and
#' with equal probabilities, it falls back to the classical Horvitz-Thompson
#' variance estimator for the total in the case of simple random sampling.
#' Stratification is natively supported.
#'
#' \code{var_srs} is a convenience wrapper for the (stratified) simple random
#' sampling case.
#'
#' @param y A (sparse) numerical matrix of the variable(s) whose variance of their total
#' is to be estimated.
#' @param pik A numerical vector of first-order inclusion probabilities.
+#' @param x An optional (sparse) numerical matrix of balancing variable(s). +#' @param strata An optional categorical vector (factor or character) when +#' variance estimation is to be conducted within strata. +#' @param w An optional numerical vector of row weights (see Details). +#' @param precalc A list of pre-calculated results (see Details). +#' @param id A vector of identifiers of the units used in the calculation. +#' Useful when \code{precalc = TRUE} in order to assess whether the ordering of the +#' \code{y} data matrix matches the one used at the pre-calculation step. +#' +#' @details \code{varDT} aims at being the workhorse of most variance estimation conducted +#' with the \code{gustave} package. It may be used to estimate the variance +#' of the estimator of a total in the case of (stratified) simple random sampling, +#' (stratified) unequal probability sampling and (stratified) balanced sampling. +#' The native integration of stratification based on Matrix::TsparseMatrix allows +#' for significant performance gains compared to higher level vectorizations +#' (\code{*apply} especially). +#' +#' Several time-consuming operations (e.g. collinearity-check, matrix +#' inversion) can be pre-calculated in order to speed up the estimation at +#' execution time. This is determined by the value of the parameters \code{y} +#' and \code{precalc}: \itemize{ \item if \code{y} not \code{NULL} and +#' \code{precalc} \code{NULL} : on-the-fly calculation (no pre-calculation). +#' \item if \code{y} \code{NULL} and \code{precalc} \code{NULL} : +#' pre-calculation whose results are stored in a list of pre-calculated data. +#' \item if \code{y} not \code{NULL} and \code{precalc} not \code{NULL} : +#' calculation using the list of pre-calculated data. } +#' +#' \code{w} is a row weight used at the final summation step. It is useful +#' when \code{varDT} or \code{var_srs} are used on the second stage of a +#' two-stage sampling design applying the Rao (1975) formula. 
+#' +#' @section Difference with \code{varest} from package \code{sampling}: +#' +#' \code{varDT} differs from \code{sampling::varest} in several ways: +#' \itemize{ \item The formula implemented in \code{varDT} is more general and +#' encompasses balanced sampling. \item Even in its reduced +#' form (without balancing variables), the formula implemented in \code{varDT} +#' slightly differs from the one implemented in \code{sampling::varest}. +#' Caron (1998, pp. 178-179) compares the two estimators +#' (\code{sampling::varest} implements V_2, \code{varDT} implements V_1). +#' \item \code{varDT} introduces several optimizations: \itemize{ \item +#' matrixwise operations allow to estimate variance on several interest +#' variables at once \item Matrix::TsparseMatrix capability and the native +#' integration of stratification yield significant performance gains. \item +#' the ability to pre-calculate some time-consuming operations speeds up the +#' estimation at execution time. } \item \code{varDT} does not natively +#' implements the calibration estimator (i.e. the sampling variance estimator +#' that takes into account the effect of calibration). In the context of the +#' \code{gustave} package, \code{\link{res_cal}} should be called before +#' \code{varDT} in order to achieve the same result.} +#' +#' @seealso \code{\link{res_cal}} +#' +#' @return \itemize{ \item if \code{y} is not \code{NULL} (calculation step) : +#' the estimated variances as a numerical vector of size the number of +#' columns of y. \item if \code{y} is \code{NULL} (pre-calculation step) : a list +#' containing pre-calculated data.} +#' +#' @author Martin Chevalier +#' +#' @references Caron N. (1998), "Le logiciel Poulpe : aspects méthodologiques", \emph{Actes +#' des Journées de méthodologie statistique} \url{http://jms-insee.fr/jms1998s03_1/} +#' Deville, J.-C. (1993), \emph{Estimation de la variance pour les enquêtes en +#' deux phases}, Manuscript, INSEE, Paris. 
+#' +#' Deville, J.-C., Tillé, Y. (2005), "Variance approximation under balanced +#' sampling", \emph{Journal of Statistical Planning and Inference}, 128, issue +#' 2 569-591 +#' +#' Rao, J.N.K (1975), "Unbiased variance estimation for multistage designs", +#' \emph{Sankhya}, C n°37 +#' +#' @examples library(sampling) +#' set.seed(1) +#' +#' # Simple random sampling case +#' N <- 1000 +#' n <- 100 +#' y <- rnorm(N)[as.logical(srswor(n, N))] +#' pik <- rep(n/N, n) +#' varDT(y, pik) +#' sampling::varest(y, pik = pik) +#' N^2 * (1 - n/N) * var(y) / n +#' +#' # Unequal probability sampling case +#' N <- 1000 +#' n <- 100 +#' pik <- runif(N) +#' s <- as.logical(UPsystematic(pik)) +#' y <- rnorm(N)[s] +#' pik <- pik[s] +#' varDT(y, pik) +#' varest(y, pik = pik) +#' # The small difference is expected (see Details). +#' +#' # Balanced sampling case +#' N <- 1000 +#' n <- 100 +#' pik <- runif(N) +#' x <- matrix(rnorm(N*3), ncol = 3) +#' s <- as.logical(samplecube(x, pik)) +#' y <- rnorm(N)[s] +#' pik <- pik[s] +#' x <- x[s, ] +#' varDT(y, pik, x) +#' +#' # Balanced sampling case (variable of interest +#' # among the balancing variables) +#' N <- 1000 +#' n <- 100 +#' pik <- runif(N) +#' y <- rnorm(N) +#' x <- cbind(matrix(rnorm(N*3), ncol = 3), y) +#' s <- as.logical(samplecube(x, pik)) +#' y <- y[s] +#' pik <- pik[s] +#' x <- x[s, ] +#' varDT(y, pik, x) +#' # As expected, the total of the variable of interest is perfectly estimated. 
+#' +#' # strata argument +#' n <- 100 +#' H <- 2 +#' pik <- runif(n) +#' y <- rnorm(n) +#' strata <- letters[sample.int(H, n, replace = TRUE)] +#' all.equal( +#' varDT(y, pik, strata = strata), +#' varDT(y[strata == "a"], pik[strata == "a"]) + varDT(y[strata == "b"], pik[strata == "b"]) +#' ) +#' +#' # precalc argument +#' n <- 1000 +#' H <- 50 +#' pik <- runif(n) +#' y <- rnorm(n) +#' strata <- sample.int(H, n, replace = TRUE) +#' precalc <- varDT(y = NULL, pik, strata = strata) +#' identical( +#' varDT(y, precalc = precalc), +#' varDT(y, pik, strata = strata) +#' ) +#' +#' @export + +varDT <- function(y = NULL, pik, x = NULL, strata = NULL, w = NULL, precalc = NULL, id = NULL){ + + if(is.null(precalc)){ + + if(any(pik <= 0 | pik > 1)) stop("All pik must be in ]0;1]") + + exh <- pik == 1 + if(any(exh)) note( + sum(exh), " units are exhaustive (pik = 1). They are discarded from the variance estimation process." + ) + pik <- pik[!exh] + + x <- if(!is.null(x)) coerce_to_TsparseMatrix(x)[!exh, , drop = FALSE] else pik + + # Stratification + if(!is.null(strata)){ + strata <- droplevels(as.factor(strata[!exh])) + if(any(tapply(strata, strata, length) == 1, na.rm = TRUE)) + stop("Some strata contain less than 2 samples units.") + x <- detect_block(x, strata) %||% make_block(x, strata) + colby <- attr(x, "colby") + rowby <- attr(x, "rowby") + }else{ + colby <- rep("1", NCOL(x)) + rowby <- rep("1", NROW(x)) + } + + # Determine A, ck and inv terms while removing colinear variables + n <- as.vector(tapply(rowby, rowby, length)[rowby]) + A <- t(x) %*% Diagonal(x = 1 / pik) + while(TRUE){ + p <- as.vector(tapply(colby, colby, length)[rowby]) + ck <- (1 - pik) * n / pmax(n - p, 1) + u <- tcrossprod(A %*% Matrix::Diagonal(x = ck), A) + if(Matrix::rankMatrix(u, method = "qr") != NROW(u)){ + # TODO: See if tol = 1e-12 in rankMatrix does not solve some issues + is_colinear <- as.vector(is.na(stats::lm.fit(x = as.matrix(u), y = rep(1, NROW(u)))$coef)) + if(any(is_colinear)) 
note("Some variables in x were discarded due to collinearity.") + A <- A[!is_colinear, , drop = FALSE] + colby <- colby[!is_colinear] + }else break + } + inv <- solve(u) + + }else list2env(precalc, envir = environment()) + + if(is.null(y)){ + # Diagonal term of the variance estimator + diago <- ck * (1 - colSums(A * (inv %*% A)) * ck)/pik^2 + names(diago) <- names(pik) + if(!is.null(w)) stop("w is not to be included in the precalculated data.") + return(list(id = id, pik = pik, exh = exh, A = A, ck = ck, inv = inv, diago = diago)) + }else{ + y <- coerce_to_TsparseMatrix(y) + if(!is.null(precalc) && !is.null(id) && !is.null(rownames(y)) && !identical(as.character(id), rownames(y))) stop( + "The names of the data matrix (y argument) do not match the reference id (id argument)." + ) + if(is.null(w)) w <- rep(1, length(pik)) + z <- Diagonal(x = 1 / pik) %*% y[!exh, , drop = FALSE] + zhat <- t(A) %*% inv %*% (A %*% Matrix::Diagonal(x = ck) %*% z) + return(Matrix::colSums(ck * w * (z - zhat)^2)) + } + +} + + + +#' @rdname varDT +#' @export +var_srs <- function(y, pik, strata = NULL, w = NULL, precalc = NULL){ + if(is.null(precalc) && !is.null(strata) && any(tapply(pik, strata, stats::sd) > 1e-6, na.rm = TRUE)) + stop("First-order inclusion probabilities are not equal (within strata if any).") + varDT(y = y, pik = pik, x = NULL, strata = strata, w = w, precalc = precalc) +} + + + +#' Variance estimator for a Poisson sampling design +#' +#' @description \code{var_pois} estimates the variance of the estimator +#' of a total for a Poisson sampling design. +#' +#' @param y A (sparse) numerical matrix of the variable(s) whose variance of their total +#' is to be estimated. +#' @param pik A numerical vector of first-order inclusion probabilities. +#' @param w An optional numerical vector of row weights (see Details). +#' +#' @details \code{w} is a row weight used at the final summation step. 
It is useful +#' when \code{var_pois} is used on the second stage of a two-stage sampling +#' design applying the Rao (1975) formula. +#' +#' @return The estimated variances as a numerical vector of size the number of +#' columns of \code{y}. +#' +#' @author Martin Chevalier +#' +#' @references Rao, J.N.K (1975), "Unbiased variance estimation for multistage designs", +#' \emph{Sankhya}, C n°37 + +#' @export +var_pois <- function(y, pik, w = rep(1, length(pik))){ + colSums(w * (1 - pik) * (y / pik)^2) +} + + +#' Sen-Yates-Grundy variance estimator +#' +#' @description \code{varSYG} computes the Sen-Yates-Grundy +#' variance estimator. +#' +#' @param y A (sparse) numerical matrix of the variable(s) whose variance of their total +#' is to be estimated. +#' @param pikl A numerical matrix of second-order inclusion probabilities. +#' @param precalc A list of pre-calculated results (analogous to the one used by +#' \code{\link{varDT}}). +#' +#' @details \code{varSYG} aims at being an efficient implementation of the +#' Sen-Yates-Grundy variance estimator for sampling designs with fixed sample +#' size. It should be especially useful when several variance estimations are +#' to be conducted, as it relies on (sparse) matrix linear algebra. +#' +#' Moreover, in order to be consistent with \code{\link{varDT}}, \code{varSYG} +#' has a \code{precalc} argument allowing for the re-use of intermediary +#' results calculated once and for all in a pre-calculation step (see +#' \code{\link{varDT}} for details). +#' +#' @section Difference with \code{varHT} from package \code{sampling}: +#' +#' \code{varSYG} differs from \code{sampling::varHT} in several ways: +#' \itemize{ \item The formula implemented in \code{varSYG} is solely +#' the Sen-Yates-Grundy estimator, which is the one calculated +#' by \code{varHT} when method = 2. 
+#' \item \code{varSYG} introduces several optimizations: \itemize{ \item +#' matrixwise operations allow to estimate variance on several interest +#' variables at once \item Matrix::TsparseMatrix capability yields significant +#' performance gains.}} +#' +#' @return \itemize{ \item if \code{y} is not \code{NULL} (calculation step) : a +#' numerical vector of size the number of columns of y. \item if \code{y} is +#' \code{NULL} (pre-calculation step) : a list containing pre-calculated data +#' (analogous to the one used by \code{\link{varDT}}).} +#' +#' @author Martin Chevalier +#' +#' @examples library(sampling) +#' set.seed(1) +#' +#' # Simple random sampling case +#' N <- 1000 +#' n <- 100 +#' y <- rnorm(N)[as.logical(srswor(n, N))] +#' pikl <- matrix(rep((n*(n-1))/(N*(N-1)), n*n), nrow = n) +#' diag(pikl) <- rep(n/N, n) +#' varSYG(y, pikl) +#' sampling::varHT(y = y, pikl = pikl, method = 2) + +#' @export + +varSYG <- function (y = NULL, pikl, precalc = NULL){ + if(is.null(precalc)){ + pik = diag(pikl) + delta <- 1 - pik %*% t(pik)/pikl + }else list2env(precalc, envir = environment()) + if(is.null(y)){ + diago <- -(1/pik^2) * rowSums(delta - diag(x = diag(delta))) + names(diago) <- row.names(pikl) + return(list(pikl = pikl, pik = pik, delta = delta, diago = diago)) + }else{ + var <- colSums((y/pik) * (delta %*% (y/pik)) - delta %*% (y/pik)^2) + return(var) + } +} + +# TODO: Add a varHT() estimator \ No newline at end of file diff --git a/README.md b/README.md index 89733e3..5ebcb80 100644 --- a/README.md +++ b/README.md @@ -1,112 +1,168 @@ -gustave -======= - -Gustave (Gustave: a User-oriented Statistical Toolkit for Analytical Variance Estimation) is an R package that provides a **toolkit for analytical variance estimation in survey sampling**. 
Apart from the implementation of standard variance estimators (Sen-Yates-Grundy, Deville-Tillé), its main feature is to help the methodologist produce easy-to-use variance estimation "wrappers", where systematic operations (linearization, domain estimation) are handled in a consistent and transparent way for the end user. - -## Install - -As gustave is not yet available on CRAN, for now the simplest way to install it on your computer is to use `devtools::install_github()`: - -``` -install.packages("devtools") -devtools::install_github("martinchevalier/gustave") -``` - -## Example - -In this example, we define a variance estimation wrapper adapted to the example data inspired by the Information and communication technology (ICT) survey. The subset of the (simulated) ICT survey has the following features: - -- stratified one-stage sampling design of 650 firms; -- 612 responding firms, non-response correction through reweighting in homogeneous response groups based on economic sub-sector and turnover; -- calibration on margins (number of firms and turnover broken down by economic sub-sector). 
- -### Step 1: Define the variance *function* - -In this context, the variance estimation *function* can be defined as follows: - -``` -# Definition of the variance function -variance_function <- function(y){ - - # Calibration - y <- rescal(y, x = x) - - # Non-response - y <- add0(y, rownames = ict_sample$firm_id) - var_nr <- var_pois(y, pik = ict_sample$response_prob_est, w = ict_sample$w_sample) - - # Sampling - y <- y / ict_sample$response_prob_est - var_sampling <- var_srs(y, pik = 1 / ict_sample$w_sample, strata = ict_sample$division) - - var_sampling + var_nr - -} - -# With x the calibration variables matrix -library(gustave) -x <- as.matrix(ict_survey[ - order(ict_survey$firm_id), - c(paste0("N_", 58:63), paste0("turnover_", 58:63)) -]) - -# Test of the variance function -y <- as.matrix(ict_survey$speed_quanti) -rownames(y) <- ict_survey$firm_id -variance_function(y) -``` - - -### Step 2: Define the variance *wrapper* - -The next step is the definition of a variance *wrapper*, which is easier to use than the variance function: - -``` -variance_wrapper <- define_variance_wrapper( - variance_function = variance_function, - reference_id = ict_survey$firm_id, - default = list(id = "firm_id", weight = "w_calib"), - objects_to_include = c("x", "ict_sample") -) -``` - -**Note** The objects `x` and `ict_sample` are embedded within the function `variance_wrapper()` (`variance_wrapper` is a [closure](http://adv-r.had.co.nz/Functional-programming.html#closures)) - -### Step 3: Features of the variance wrapper - -``` -# Better display of results -variance_wrapper(ict_survey, speed_quanti) - -# Mean linearization -variance_wrapper(ict_survey, mean(speed_quanti)) -# Ratio linearization -variance_wrapper(ict_survey, ratio(turnover, employees)) - -# Discretization of qualitative variables -variance_wrapper(ict_survey, speed_quali) -# On-the-fly recoding -variance_wrapper(ict_survey, speed_quali == "Between 2 and 10 Mbs") - -# 1-domain estimation 
-variance_wrapper(ict_survey, speed_quanti, where = division == "58") -# Multiple domains estimation -variance_wrapper(ict_survey, speed_quanti, by = division) - -# Multiple variables at a time -variance_wrapper(ict_survey, speed_quanti, big_data) -variance_wrapper(ict_survey, speed_quanti, mean(big_data)) -# Flexible syntax for where and by arguments -# (similar to the aes() function in ggplot2) -variance_wrapper(ict_survey, where = division == "58", - mean(speed_quanti), mean(big_data * 100) -) -variance_wrapper(ict_survey, where = division == "58", - mean(speed_quanti), mean(big_data * 100, where = division == "61") -) -variance_wrapper(ict_survey, where = division == "58", - mean(speed_quanti), mean(big_data * 100, where = NULL) -) -``` - - +gustave +======= + +Gustave (Gustave: a User-oriented Statistical Toolkit for Analytical Variance Estimation) is an R package that provides a **toolkit for analytical variance estimation in survey sampling**. + +Apart from the implementation of standard variance estimators (Sen-Yates-Grundy, Deville-Tillé), its main feature is to **help he methodologist produce easy-to-use variance estimation *wrappers***, where systematic operations (statistic linearization, domain estimation) are handled in a consistent and transparent way. + +The **ready-to-use variance estimation wrapper `qvar()`**, adapted for common cases (e.g. stratified simple random sampling, non-response correction through reweighting in homogeneous response groups, calibration), is also included. The core functions of the package (e.g. `define_variance_wrapper()`) are to be used for more complex cases. 
+ +## Install + +gustave is available on CRAN and can therefore be installed with the `install.packages()` function: + +``` +install.packages("gustave") +``` + +However, if you wish to install the latest version of gustave, you can use `devtools::install_github()` to install it directly from the [github.com repository](https://github.com/martinchevalier/gustave): + +``` +install.packages("devtools") +devtools::install_github("martinchevalier/gustave") +``` + +## Example + +In this example, we aim at estimating the variance of estimators computed using simulated data inspired from the Information and communication technology (ICT) survey. This survey has the following characteristics: + +- stratified one-stage sampling design; +- non-response correction through reweighting in homogeneous response groups based on economic sub-sector and turnover; +- calibration on margins (number of firms and turnover broken down by economic sub-sector). + +The ICT simulated data files are shipped with the gustave package: + +``` +library(gustave) +data(package = "gustave") +? ict_survey +``` + +### Methodological description of the survey + +A variance estimation can be perform in a single call of `qvar()`: +``` +qvar( + + # Sample file + data = ict_sample, + + # Dissemination and identification information + dissemination_dummy = "dissemination", + dissemination_weight = "w_calib", + id = "firm_id", + + # Scope + scope_dummy = "scope", + + # Sampling design + sampling_weight = "w_sample", + strata = "strata", + + # Non-response correction + nrc_weight = "w_nrc", + response_dummy = "resp", + hrg = "hrg", + + # Calibration + calibration_weight = "w_calib", + calibration_var = c(paste0("N_", 58:63), paste0("turnover_", 58:63)), + + # Statistic(s) and variable(s) of interest + mean(employees) + +) +``` + +The survey methodology description is however cumbersome when several variance estimations are to be conducted. 
As it does not change from one estimation to another, it could be defined once and for all and then re-used for all variance estimations. `qvar()` allows for this by defining a so-called variance *wrapper*, that is an easy-to-use function where the variance estimation methodology for the given survey is implemented and all the technical data used to do so included. + +``` +# Definition of the variance estimation wrapper precision_ict +precision_ict <- qvar( + + # As before + data = ict_sample, + dissemination_dummy = "dissemination", + dissemination_weight = "w_calib", + id = "firm_id", + scope_dummy = "scope", + sampling_weight = "w_sample", + strata = "strata", + nrc_weight = "w_nrc", + response_dummy = "resp", + hrg = "hrg", + calibration_weight = "w_calib", + calibration_var = c(paste0("N_", 58:63), paste0("turnover_", 58:63)), + + # Replacing the variables of interest by define = TRUE + define = TRUE + +) + +# Use of the variance estimation wrapper +precision_ict(ict_sample, mean(employees)) + +# The variance estimation wrapper can also be used on the survey file +precision_ict(ict_survey, mean(speed_quanti)) +``` + +### Features of the variance estimation wrapper + +The variance estimation *wrapper* is much easier-to-use than a standard variance estimation function: + +- several statistics in one call (with optional labels): + + ``` + precision_ict(ict_survey, + "Mean internet speed in Mbps" = mean(speed_quanti), + "Turnover per employee" = ratio(turnover, employees) + ) + ``` + +- domain estimation with where and by arguments + + ``` + precision_ict(ict_survey, + mean(speed_quanti), + where = employees >= 50 + ) + precision_ict(ict_survey, + mean(speed_quanti), + by = division + ) + + # Domain may differ from one estimator to another + precision_ict(ict_survey, + "Mean turnover, firms with 50 employees or more" = mean(turnover, where = employees >= 50), + "Mean turnover, firms with 100 employees or more" = mean(turnover, where = employees >= 100) + ) + ``` + 
+- handy variable evaluation + + ``` + # On-the-fly evaluation (e.g. discretization) + precision_ict(ict_survey, mean(speed_quanti > 100)) + + # Automatic discretization for qualitative (character or factor) variables + precision_ict(ict_survey, mean(speed_quali)) + + # Standard evaluation capabilities + variables_of_interest <- c("speed_quanti", "speed_quali") + precision_ict(ict_survey, mean(variables_of_interest)) + ``` + +- Integration with %>% and dplyr + + ``` + library(dplyr) + ict_survey %>% + precision_ict("Internet speed above 100 Mbps" = mean(speed_quanti > 100)) %>% + select(label, est, lower, upper) + ``` + +## Colophon + +This software is an [R](https://cran.r-project.org/) package developed with the [RStudio IDE](https://www.rstudio.com/) and the [devtools](https://CRAN.R-project.org/package=devtools), [roxygen2](https://CRAN.R-project.org/package=roxygen2) and [testthat](https://CRAN.R-project.org/package=testthat) packages. Much help was found in [R packages](http://r-pkgs.had.co.nz/) and [Advanced R](http://adv-r.had.co.nz/) both written by [Hadley Wickham](http://hadley.nz/). + +From the methodological point of view, this package is related to the [Poulpe SAS macro (in French)](http://jms-insee.fr/jms1998_programme/#1513415199356-a8a1bdde-becd) developed at the French statistical institute. From the implementation point of view, some inspiration was found in the [ggplot2](https://CRAN.R-project.org/package=ggplot2) package. The idea of developing an R package on this specific topic was stimulated by the [icarus](https://CRAN.R-project.org/package=icarus) package and its author. 
diff --git a/data/ict_pop.RData b/data/ict_pop.RData index 43a6e82..5ab982f 100644 Binary files a/data/ict_pop.RData and b/data/ict_pop.RData differ diff --git a/data/ict_sample.RData b/data/ict_sample.RData index f03920a..3693d5f 100644 Binary files a/data/ict_sample.RData and b/data/ict_sample.RData differ diff --git a/data/ict_survey.RData b/data/ict_survey.RData index 7b263cf..44038a6 100644 Binary files a/data/ict_survey.RData and b/data/ict_survey.RData differ diff --git a/data/lfs_samp_area.RData b/data/lfs_samp_area.RData new file mode 100644 index 0000000..8f9dcdd Binary files /dev/null and b/data/lfs_samp_area.RData differ diff --git a/data/lfs_samp_dwel.RData b/data/lfs_samp_dwel.RData new file mode 100644 index 0000000..45cca80 Binary files /dev/null and b/data/lfs_samp_dwel.RData differ diff --git a/data/lfs_samp_ind.RData b/data/lfs_samp_ind.RData new file mode 100644 index 0000000..c261ac5 Binary files /dev/null and b/data/lfs_samp_ind.RData differ diff --git a/man/add0.Rd b/man/add_zero.Rd similarity index 74% rename from man/add0.Rd rename to man/add_zero.Rd index 3e9617a..665ab36 100644 --- a/man/add0.Rd +++ b/man/add_zero.Rd @@ -1,52 +1,52 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{add0} -\alias{add0} -\title{Expand a matrix or a data.frame with zeros based on rownames matching} -\usage{ -add0(y, rownames, remove = TRUE) -} -\arguments{ -\item{y}{A (sparse) matrix or a data.frame. The object to add zeros to.} - -\item{rownames}{A character vector (other types are coerced to character). -The character vector giving the rows of the produced object.} - -\item{remove}{Should rows of y whose name do not appear in the rownames -argument be removed ? TRUE by default, a warning is shown when rows are -removed.} -} -\value{ -A (sparse) matrix or data.frame depending on the type of y. 
-} -\description{ -For a given two-dimensional object with rownames and a character - vector, \code{add0()} produces a corresponding object whose rownames match - the character vector, with zeros on the additional rows. - - This function is an easy-to-use and reliable way to reintroduce - non-responding units in the variance estimation process (after the - non-response phase is taken into account). -} -\examples{ -# Data generation -set.seed(1) -n <- 10 -p <- 2 -y <- matrix(1:(n*p), ncol = p, dimnames = list(sample(letters, n))) -y[c(3, 8, 12)] <- NA -rownames <- letters - -# Standard use -add0(y, rownames) - -# Use when rownames in y do not match -# any element in the rownames argument -rownames(y)[1:3] <- toupper(rownames(y)[1:3]) -add0(y, rownames) -add0(y, rownames, remove = FALSE) - -} -\author{ -Martin Chevalier -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{add_zero} +\alias{add_zero} +\title{Expand a matrix or a data.frame with zeros based on rownames matching} +\usage{ +add_zero(y, rownames, remove = TRUE) +} +\arguments{ +\item{y}{A (sparse) matrix or a data.frame. The object to add zeros to.} + +\item{rownames}{A character vector (other types are coerced to character). +The character vector giving the rows of the produced object.} + +\item{remove}{Should rows of \code{y} whose name do not appear in the rownames +argument be removed ? TRUE by default, a warning is shown when rows are +removed.} +} +\value{ +A (sparse) matrix or data.frame depending on the type of \code{y}. +} +\description{ +For a given two-dimensional object with rownames and a character + vector, \code{add_zero} produces a corresponding object whose rownames match + the character vector, with zeros on the additional rows. + + This function is an easy-to-use and reliable way to reintroduce + non-responding units in the variance estimation process (after the + non-response phase is taken into account). 
+} +\examples{ +# Data generation +set.seed(1) +n <- 10 +p <- 2 +y <- matrix(1:(n*p), ncol = p, dimnames = list(sample(letters, n))) +y[c(3, 8, 12)] <- NA +rownames <- letters + +# Standard use +add_zero(y, rownames) + +# Use when rownames in y do not match +# any element in the rownames argument +rownames(y)[1:3] <- toupper(rownames(y)[1:3]) +add_zero(y, rownames) +add_zero(y, rownames, remove = FALSE) + +} +\author{ +Martin Chevalier +} diff --git a/man/define_linearization_wrapper.Rd b/man/define_linearization_wrapper.Rd deleted file mode 100644 index 6f5d8fe..0000000 --- a/man/define_linearization_wrapper.Rd +++ /dev/null @@ -1,149 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/define_linearization_wrapper.R -\name{define_linearization_wrapper} -\alias{define_linearization_wrapper} -\title{Define a linearization wrapper} -\usage{ -define_linearization_wrapper(linearization_function, arg_type, - allow_factor = FALSE, arg_not_affected_by_domain = NULL, - display_function = standard_display) -} -\arguments{ -\item{linearization_function}{An R function with input the quantities -used in the linearization formula and with output a list with two -named element: \itemize{\item \code{lin}: a list of numerical vectors (most -of the time, only 1) which correspond to the value of the linearized -variable \item \code{metadata}: a list of metadata to be used by the display -function (see \code{display_function} argument), including (for the -standard display function) \code{est} for the point-estimate and -\code{n} for the number of observations used in the estimation.}} - -\item{arg_type}{A named list with three character vectors describing -the type of each argument of \code{linearization_function}: \itemize{ -\item \code{data}: data argument(s), numerical vector(s) to be used in the -linearization formula \item \code{weight}: weight argument, numerical vector -to be used as row weights in the linearization formula \item 
\code{param}: -parameters, non-data arguments (most of the time boolean) to be used to -control some aspect of the linearization formula}} - -\item{allow_factor}{A logical vector of length 1 (\code{FALSE} by default) -indicating whether factor variable are accepted as-is by the linearization -wrappers. This should be the case when the linearization function only has -one data argument (e.g. \code{total} or \code{mean} linearization formulae).} - -\item{arg_not_affected_by_domain}{A character vector indicating the (data) -arguments which should not be affected by domain-splitting. Such parameters -may appear in some complex linearization formula, for instance when the -At-Risk of Poverty Rate (ARPR) is estimated by region but with a poverty -line calculated at the national level.} - -\item{display_function}{An R function which produces, for each variance -estimation, the data.frame row to be displayed by the variance estimation -wrapper. It uses three arguments: -\itemize{\item \code{var} the estimated variance \item \code{metadata} the -metadata associated with the estimation, especially the one outputted by -\code{linearization_function} (e.g. \code{est}, \code{n}) \item \code{alpha} -the level for the construction of confidence intervals (at execution time, -its value is taken from the \code{alpha} argument of the variance wrapper.)} -The default display function (\code{standard_display}) uses standard metadata -to display usual variance indicator (variance, standard deviation, coefficient -of variation, confidence interval) broken down by linearization wrapper, domain -(if any) and level (if the variable is a factor, see argument \code{allow_factor}).} -} -\value{ -A function to be used within a variance estimation wrapper to perform - a specific linearization (see examples). Its formals are the ones of - \code{linearization_function} with the addition of \code{by} and \code{where} - (for domain estimation, set to \code{NULL} by default). 
-} -\description{ -Given a linearization \emph{function} (specific to an - estimator), \code{define_linearization_wrapper} defines a - linearization \emph{wrapper} to be used together with - \code{\link[=define_variance_wrapper]{variance estimation wrappers}} - in order to make variance estimation easier. - This function is intended for \strong{advanced use only} (see Details), - standard linearization wrappers are included in the gustave package (see - \code{\link[=linearization_wrapper_standard]{standard linearization wrappers}}) -} -\details{ -When the estimator is not the estimator of a total, the application of - analytical variance estimation formulae developed for the estimator of a total - is not straightforward (Deville, 1999). An asymptotically unbiased variance - estimator can nonetheless be obtained if the estimation of variance is performed - on a variable obtained from the original data through a linearization step. - - \code{define_linearization_wrapper} is the function used to create, given - a linearization \emph{function} implementing a given linearization - \emph{formula}, a linearization \emph{wrapper} which can be used together - with a variance wrapper. - - Linearization wrappers are quite flexible tools - to apply a variance function to an estimator requiring a linearization step - (e.g. all estimators except the estimator of a total) with virtually no - additional complexity for the end-user. To some extent, linearization wrappers - can be seen as ggplot2 \code{geom_} and \code{stat_} functions: they help - the end-user in writing down what he or she wants without having to go too - deep into the details of the corresponding layers. - - \code{\link[=linearization_wrapper_standard]{standard linearization wrappers}} - are included within the gustave package and automatically added - to the variance estimation wrappers. 
New linearization wrappers can be defined - using the \code{define_linearization_wrapper} and then explicitly added to the - variance estimation wrappers using the \code{objects_to_include} argument. -} -\examples{ -### Example from the Information and communication technologies (ICT) survey - -# The subset of the (simulated) ICT survey has the following features: -# - stratified one-stage sampling design of 650 firms; -# - 612 responding firms, non-response correction through reweighting -# in homogeneous response groups based on economic sub-sector and turnover; -# - calibration on margins (number of firms and turnover broken down -# by economic sub-sector). - -# Step 1 : Dummy variance wrapper -# Note : see define_variance_wrapper() for a more -# realistic variance function and examples. -variance_wrapper <- define_variance_wrapper( - variance_function = function(y) abs(colSums(y)), - reference_id = ict_survey$firm_id, - default = list(id = "firm_id", weight = "w_calib") -) -variance_wrapper(ict_survey, total(speed_quanti)) - -# Step 2 : Redefine the mean linearization wrapper -# The mean() linearization wrapper defined in the gustave -# package is bulit on top of the ratio() linearization wrapper. -variance_wrapper(ict_survey, mean(speed_quanti)) - -# Let's redefine it directly from the formula found for instance -# in (Caron, Deville, Sautory, 1998) and without handling NA -# values -mean2 <- define_linearization_wrapper( - linearization_function = function(y, weight){ - est <- sum(y * weight) / sum(weight) - lin <- (y - est) / sum(weight) - list( - lin = list(lin), - metadata = list(est = est, n = length(y)) - ) - }, - arg_type = list(data = "y", weight = "weight"), - allow_factor = TRUE -) -variance_wrapper(ict_survey, mean(speed_quanti), mean2(speed_quanti)) - -} -\references{ -Deville J.-C. 
(1999), "Variance estimation for complex statistics and - estimators: linearization and residual techniques", \emph{Survey Methodology}, - 25:193–203 -} -\seealso{ -\code{\link[=linearization_wrapper_standard]{standard linearization wrappers}} - \code{\link{define_variance_wrapper}} -} -\author{ -Martin Chevalier -} diff --git a/man/define_statistic_wrapper.Rd b/man/define_statistic_wrapper.Rd new file mode 100644 index 0000000..e8faee1 --- /dev/null +++ b/man/define_statistic_wrapper.Rd @@ -0,0 +1,143 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/define_statistic_wrapper.R +\name{define_statistic_wrapper} +\alias{define_statistic_wrapper} +\title{Define a statistic wrapper} +\usage{ +define_statistic_wrapper(statistic_function, arg_type, + arg_not_affected_by_domain = NULL, + display_function = standard_display) +} +\arguments{ +\item{statistic_function}{An R function specific to the statistic to +calculate. It should produce at least the point estimator and the +linearized variable associated with the statistic (see Details).} + +\item{arg_type}{A named list with three character vectors describing +the type of each argument of \code{statistic_function} (see Details).} + +\item{arg_not_affected_by_domain}{A character vector indicating the +arguments which should not be affected by domain-splitting. Such parameters +may appear in some complex linearization formula, for instance when the +At-Risk of Poverty Rate (ARPR) is estimated by region but with a poverty +line calculated at the national level.} + +\item{display_function}{An R function which produces, for each variance +estimation, the data.frame to be displayed by the variance estimation +wrapper. 
The default display function (\code{standard_display}) uses +standard metadata to display usual variance indicator (point estimate, +variance, standard deviation, coefficient of variation, confidence interval) +broken down by statistic wrapper, domain (if any) and level (if the variable +is a factor).} +} +\value{ +A function to be used within a variance estimation wrapper to estimate + a specific statistic (see examples). Its formals are the ones of + \code{statistic_function} with the addition of \code{by} and \code{where} + (for domain estimation, set to \code{NULL} by default). +} +\description{ +\code{define_statistic_wrapper} defines + statistic \emph{wrappers} to be used together with + \code{\link[=define_variance_wrapper]{variance estimation wrappers}}. + A statistic wrapper produces both the point estimator and the + linearized variable associated with a given statistic to estimate + variance on (Deville, 1999). \code{define_statistic_wrapper} is intended + for \strong{advanced use only}, standard statistic wrappers are included + in the gustave package (see \code{\link[=standard_statistic_wrapper]{standard statistic wrappers}}) +} +\details{ +When the statistic to estimate is not a total, the application of + analytical variance estimation formulae developed for the estimator of a total + is not straightforward (Deville, 1999). An asymptotically unbiased variance + estimator can nonetheless be obtained if the estimation of variance is performed + on a variable obtained from the original data through a linearization step. + + \code{define_statistic_wrapper} is the function used to create, for a + given statistic, an easy-to-use function which calculates both the point + estimator and the linearized variable associated with the statistic. 
These + operations are implemented by the \code{statistic_function}, which can have + any needed input (for example \code{num} and \code{denom} for a ratio + estimator) and should output a list with at least two named elements: \itemize{ + \item \code{point}: the point estimator of the statistic + \item \code{lin}: the linearized variable to be passed on to the variance + estimation formula. If several variables are to be associated with + the statistic, \code{lin} can be a list itself. + } + All other named elements in the output of \code{statistic_function} are + treated as metadata (that may be used later on by \code{display_function}). + + \code{arg_type} is a named list of three elements that describes the nature + of the argument of \code{statistic_function}: \itemize{ + \item \code{data}: data argument(s), numerical vector(s) to be used + to calculate the point estimate and the linearized variable associated + with the statistic + \item \code{weight}: weight argument, numerical vector to be used + as row weights + \item \code{param}: parameters, non-data arguments to be used to + control some aspect of the computation} + + Statistic wrappers are quite flexible tools to apply a variance function + to an estimator requiring a linearization step (e.g. all estimators except + the estimator of a total) with virtually no additional complexity for the + end-user. + + \code{\link[=standard_statistic_wrapper]{standard statistic wrappers}} + are included within the gustave package and automatically added + to the variance estimation wrappers. New statistic wrappers can be defined + using the \code{define_statistic_wrapper} and then explicitly added to the + variance estimation wrappers using the \code{objects_to_include} argument. 
+ + Note: To some extent, statistic wrappers can be seen as ggplot2 + \code{geom_} and \code{stat_} functions: they help the end-user in writing + down what he or she wants without having to go too deep into the details + of the corresponding layers. +} +\examples{ +### Example from the Information and communication technologies (ICT) survey + +# Let's define a variance wrapper for the ICT survey +# as in the examples of the qvar function: +precision_ict <- qvar( + data = ict_sample, + dissemination_dummy = "dissemination", + dissemination_weight = "w_calib", + id = "firm_id", + scope_dummy = "scope", + sampling_weight = "w_sample", + strata = "strata", + nrc_weight = "w_nrc", + response_dummy = "resp", + hrg = "hrg", + calibration_weight = "w_calib", + calibration_var = c(paste0("N_", 58:63), paste0("turnover_", 58:63)), + define = TRUE +) +precision_ict(ict_survey, mean(speed_quanti)) + +# Let's now redefine the mean statistic wrapper +mean2 <- define_statistic_wrapper( + statistic_function = function(y, weight){ + point <- sum(y * weight) / sum(weight) + lin <- (y - point) / sum(weight) + list(point = point, lin = lin, metadata = list(n = length(y))) + }, + arg_type = list(data = "y", weight = "weight") +) + +# mean2 can now be used inside precision_ict (and yields +# the same results as the mean statistic wrapper) +precision_ict(ict_survey, mean(speed_quanti), mean2(speed_quanti)) + +} +\references{ +Deville J.-C. 
(1999), "Variance estimation for complex statistics and + estimators: linearization and residual techniques", \emph{Survey Methodology}, + 25:193–203 +} +\seealso{ +\code{\link[=standard_statistic_wrapper]{standard statistic wrappers}}, \code{\link{define_variance_wrapper}} +} +\author{ +Martin Chevalier +} diff --git a/man/define_variance_wrapper.Rd b/man/define_variance_wrapper.Rd index f152e2d..2d8a8c3 100644 --- a/man/define_variance_wrapper.Rd +++ b/man/define_variance_wrapper.Rd @@ -1,194 +1,208 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/define_variance_wrapper.R -\name{define_variance_wrapper} -\alias{define_variance_wrapper} -\title{Define a variance estimation wrapper} -\usage{ -define_variance_wrapper(variance_function, reference_id, default = list(stat = - "total", alpha = 0.05), objects_to_include = NULL, - objects_to_include_from = parent.frame()) -} -\arguments{ -\item{variance_function}{An R function, with input a data matrix and possibly -other arguments (e.g. parameters affecting the estimation of variance), -and output a numeric vector of estimated variances (or a list whose first -element is a numeric vector of estimated variances).} - -\item{reference_id}{A vector containing the ids of all the responding units -of the survey. It is compared with \code{default$id} to check whether some -observations are missing in the survey file. Observations are reordered -according to \code{reference_id}.} - -\item{default}{a named list specifying the default values for: \itemize{ - \item \code{id}: the name of the default identifying variable in the survey - file. It can also be an unevaluated expression (enclosed in \code{substitute()}) to be - evaluated within the survey file. - \item \code{weight}: the name of the default weight variable in the survey file. - It can also be an unevaluated expression (enclosed in \code{substitute()}) to be - evaluated within the survey file. 
- \item \code{stat}: the name of the default statistic to compute when none is specified. - It is set to \code{"total"} by default. - \item \code{alpha}: the default threshold for confidence interval derivation. - It is set to \code{0.05} by default. -}} - -\item{objects_to_include}{A character vector indicating the name of -additional R objects to include within the variance wrapper. These objects -are to be used to carry out the variance estimation.} - -\item{objects_to_include_from}{The environment to which the additional R -objects belong.} -} -\value{ -An R function that makes the estimation of variance based on the provided -variance function easier. Its parameters are: - \itemize{ - \item \code{data}: the survey data where the interest variables are stored - \item \code{...}: one or more calls to a linearization wrapper (see examples - and \code{\link[=linearization_wrapper_standard]{standard linearization wrappers}}) - \item \code{where}: a logical vector indicating a domain on which the variance - estimation is conducted - \item \code{by}: a qualitative variable whose levels are used to define domains - on which the variance estimation is conducted - \item \code{stat}: a character vector of size 1 indicating the linearization - wrapper to use when none is specified. Its default value depends on - the value of \code{default_stat} in \code{define_variance_wrapper} - \item \code{alpha}: a numeric vector of size 1 indicating the threshold - for confidence interval derivation. Its default value depends on - the value of \code{default_alpha} in \code{define_variance_wrapper} - \item \code{id}: a character vector of size 1 containing the name of - the identifying variable in the survey file. It can also be an - unevaluated expression (using \code{substitute()}) to be evaluated within - the survey file. 
Its default value depends on the value of - \code{default_id} in \code{define_variance_wrapper} - \item \code{envir}: an environment containing a binding to \code{data} - } -} -\description{ -Given a variance estimation \emph{function} (specific to a - survey), \code{define_variance_wrapper} defines a variance estimation - \emph{wrapper} easier to use (e.g. automatic domain estimation, - linearization). -} -\details{ -Defining variance estimation wrappers is the \strong{key feature} of - the \code{gustave} package. - - Analytical variance estimation is often difficult to carry out by - non-specialists owing to the complexity of the underlying sampling - and estimation methodology. This complexity yields complex \emph{variance estimation - functions} which are most often only used by the sampling expert who - actually wrote them. A \emph{variance estimation wrapper} is an - intermediate function that is "wrapped around" the (complex) variance - estimation function in order to provide the non-specialist with - user-friendly features: \itemize{ \item checks for consistency between the - provided dataset and the survey characteristics \item factor discretization - \item domain estimation \item linearization of complex statistics (see - \code{\link[=linearization_wrapper_standard]{standard linearization wrappers}})} - - \code{define_variance_wrapper} allows the sampling expert to define a - variance estimation wrapper around a given variance estimation function and - set its default parameters. The produced variance estimation wrapper will - be stand-alone in the sense that it can contain additional data which would - \code{objects_to_include} and \code{objects_to_include_from} parameters). 
-} -\examples{ -### Example from the Information and communication technologies (ICT) survey - -# The subset of the (simulated) ICT survey has the following features: -# - stratified one-stage sampling design of 650 firms; -# - 612 responding firms, non-response correction through reweighting -# in homogeneous response groups based on economic sub-sector and turnover; -# - calibration on margins (number of firms and turnover broken down -# by economic sub-sector). - -# Step 1 : Definition of a variance function - -variance_function <- function(y){ - - # Calibration - y <- rescal(y, x = x) - - # Non-response - y <- add0(y, rownames = ict_sample$firm_id) - var_nr <- var_pois(y, pik = ict_sample$response_prob_est, w = ict_sample$w_sample) - - # Sampling - y <- y / ict_sample$response_prob_est - var_sampling <- var_srs(y, pik = 1 / ict_sample$w_sample, strata = ict_sample$division) - - var_sampling + var_nr - -} - -# With x the calibration variables matrix -x <- as.matrix(ict_survey[ - order(ict_survey$firm_id), - c(paste0("N_", 58:63), paste0("turnover_", 58:63)) -]) - -# Test of the variance function -y <- as.matrix(ict_survey$speed_quanti) -rownames(y) <- ict_survey$firm_id -variance_function(y) - -# Step 2 : Definition of a variance wrapper - -variance_wrapper <- define_variance_wrapper( - variance_function = variance_function, - reference_id = ict_survey$firm_id, - default = list(id = "firm_id", weight = "w_calib"), - objects_to_include = c("x", "ict_sample") -) - -# The objects "x" and "ict_sample" are embedded -# within the function variance_wrapper -ls(environment(variance_wrapper)) -# Note : variance_wrapper is a closure -# (http://adv-r.had.co.nz/Functional-programming.html#closures) -# As a consequence, the variance wrapper will work even if -# x is removed from globalenv() -rm(x) - -# Step 3 : Features of the variance wrapper - -# Better display of results -variance_wrapper(ict_survey, speed_quanti) - -# Mean linearization -variance_wrapper(ict_survey, 
mean(speed_quanti)) -# Ratio linearization -variance_wrapper(ict_survey, ratio(turnover, employees)) - -# Discretization of qualitative variables -variance_wrapper(ict_survey, speed_quali) -# On-the-fly recoding -variance_wrapper(ict_survey, speed_quali == "Between 2 and 10 Mbs") - -# 1-domain estimation -variance_wrapper(ict_survey, speed_quanti, where = division == "58") -# Multiple domains estimation -variance_wrapper(ict_survey, speed_quanti, by = division) - -# Multiple variables at a time -variance_wrapper(ict_survey, speed_quanti, big_data) -variance_wrapper(ict_survey, speed_quanti, mean(big_data)) -# Flexible syntax for where and by arguments -# (similar to the aes() function in ggplot2) -variance_wrapper(ict_survey, where = division == "58", - mean(speed_quanti), mean(big_data * 100) -) -variance_wrapper(ict_survey, where = division == "58", - mean(speed_quanti), mean(big_data * 100, where = division == "61") -) -variance_wrapper(ict_survey, where = division == "58", - mean(speed_quanti), mean(big_data * 100, where = NULL) -) - -} -\seealso{ -\code{\link[=linearization_wrapper_standard]{standard linearization wrappers}} \code{\link{varDT}} -} -\author{ -Martin Chevalier -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/define_variance_wrapper.R +\name{define_variance_wrapper} +\alias{define_variance_wrapper} +\title{Define a variance estimation wrapper} +\usage{ +define_variance_wrapper(variance_function, reference_id, reference_weight, + default_id = NULL, technical_data = NULL, technical_param = NULL, + objects_to_include = NULL) +} +\arguments{ +\item{variance_function}{An R function. It is the methodological workhorse of +the variance estimation: from a set of arguments including the variables +of interest (see below), it should return a vector of estimated variances. +See Details.} + +\item{reference_id}{A vector containing the ids of all the responding units +of the survey. 
It can also be an unevaluated expression (enclosed in +\code{quote()}) to be evaluated within the execution environment of the wrapper. +It is compared with \code{default$id} (see below) to check whether +some observations are missing in the survey file. The matrix of variables +of interest passed on to \code{variance_function} has \code{reference_id} +as rownames and is ordered according to its values.} + +\item{reference_weight}{A vector containing the reference weight of the survey. +It can also be an unevaluated expression (enclosed in \code{quote()}) to +be evaluated within the execution environment of the wrapper.} + +\item{default_id}{A character vector of length 1, the name of the default +identifying variable in the survey file. It can also be an unevaluated +expression (enclosed in \code{quote()}) to be evaluated within the survey file.} + +\item{technical_data}{A named list of technical data needed to perform +the variance estimation (e.g. sampling strata, first- or second-order +probabilities of inclusion, estimated response probabilities, calibration +variables). Its names should match the names of the corresponding arguments +in \code{variance_function}.} + +\item{technical_param}{A named list of technical parameters used to control +some aspect of the variance estimation process (e.g. alternative methodology). +Its names should match the names of the corresponding arguments in \code{variance_function}.} + +\item{objects_to_include}{(Advanced use) A character vector indicating the name of +additional R objects to include within the variance wrapper.} +} +\value{ +An R function that makes the estimation of variance based on the + provided variance function easier. Its parameters are: \itemize{ \item + \code{data}: one or more calls to a statistic wrapper (e.g. \code{total()}, + \code{mean()}, \code{ratio()}). 
See examples and + \code{\link[=standard_statistic_wrapper]{standard statistic wrappers}}) + \item \code{where}: a logical vector indicating a domain on which the + variance estimation is to be performed \item \code{by}: a qualitative + variable whose levels are used to define domains on which the variance + estimation is performed \item \code{alpha}: a numeric vector of length 1 + indicating the threshold for confidence interval derivation (\code{0.05} by + default) \item \code{display}: a logical vector of length 1 indicating + whether the result of the estimation should be displayed or not \item + \code{id}: a character vector of size 1 containing the name of the + identifying variable in the survey file. Its default value depends on the + value of \code{default_id} in \code{define_variance_wrapper} \item + \code{envir}: an environment containing a binding to \code{data}} +} +\description{ +Given a variance estimation \emph{function} (specific to a + survey), \code{define_variance_wrapper} defines a variance estimation + \emph{wrapper} easier to use (e.g. automatic domain estimation, + linearization). +} +\details{ +Defining variance estimation wrappers is the \strong{key feature} of + the \code{gustave} package. It is the workhorse of the ready-to-use + \code{\link{qvar}} function and should be used directly to handle more complex + cases (e.g. surveys with several stages or balanced sampling). + + Analytical variance estimation is often difficult to carry out by + non-specialists owing to the complexity of the underlying sampling + and estimation methodology. This complexity yields complex \emph{variance + estimation functions} which are most often only used by the sampling expert + who actually wrote them. 
A \emph{variance estimation wrapper} is an + intermediate function that is "wrapped around" the (complex) variance + estimation function in order to provide the non-specialist with + user-friendly features (see examples): \itemize{ + \item calculation of complex statistics (see + \code{\link[=standard_statistic_wrapper]{standard statistic wrappers}}) + \item domain estimation + \item handy evaluation and factor discretization + } + + \code{define_variance_wrapper} allows the sampling expert to define a + variance estimation wrapper around a given variance estimation function and + set its default parameters. The produced variance estimation wrapper is + standalone in the sense that it contains all technical data necessary + to carry out the estimation (see \code{technical_data}). + + The arguments of the \code{variance_function} fall into three types: \itemize{ + \item the data argument (mandatory, only one allowed): the numerical matrix of + variables of interest to apply the variance estimation formula on + \item technical data arguments (optional, one or more allowed): technical + and methodological information used by the variance estimation function + (e.g. sampling strata, first- or second-order probabilities of inclusion, + estimated response probabilities, calibration variables) + \item technical parameters (optional, one or more allowed): non-data arguments + to be used to control some aspect of the variance estimation (e.g. alternative + methodology)} + + \code{technical_data} and \code{technical_param} are used to determine + which arguments of \code{variance_function} relate to technical information, + the only remaining argument is considered as the data argument. 
+} +\examples{ +### Example from the Labour force survey (LFS) + +# The (simulated) Labour force survey (LFS) has the following characteristics: +# - first sampling stage: balanced sampling of 4 areas (each corresponding to +# about 120 dwellings) on first-order probability of inclusion (proportional to +# the number of dwellings in the area) and total annual income in the area. +# - second sampling stage: in each sampled area, simple random sampling of 20 +# dwellings +# - neither non-response nor calibration + +# As this is a multi-stage sampling design with balanced sampling at the first +# stage, the qvar function does not apply. A variance wrapper can nonetheless +# be defined using the core define_variance_wrapper function. + +# Step 1 : Definition of the variance function and the corresponding technical data + +# In this context, the variance estimation function specific to the LFS +# survey can be defined as follows: + +var_lfs <- function(y, ind, dwel, area){ + + variance <- list() + + # Variance associated with the sampling of the dwellings + y <- sum_by(y, ind$id_dwel) + variance[["dwel"]] <- var_srs( + y = y, pik = dwel$pik_dwel, strata = dwel$id_area, + w = (1 / dwel$pik_area^2 - dwel$q_area) + ) + + # Variance associated with the sampling of the areas + y <- sum_by(y = y, by = dwel$id_area, w = 1 / dwel$pik_dwel) + variance[["area"]] <- varDT(y = y, precalc = area) + + Reduce(`+`, variance) + +} + +# where y is the matrix of variables of interest and ind, dwel and area the technical data: + +technical_data_lfs <- list() + +# Technical data at the area level +# The varDT function allows for the pre-calculation of +# most of the methodological quantities needed. 
+technical_data_lfs$area <- varDT( + y = NULL, + pik = lfs_samp_area$pik_area, + x = as.matrix(lfs_samp_area[c("pik_area", "income")]), + id = lfs_samp_area$id_area +) + +# Technical data at the dwelling level +# In order to implement Rao (1975) formula for two-stage samples, +# we associate each dwelling with the diagonal term corresponding +# to its area in the first-stage variance estimator: +lfs_samp_dwel$q_area <- with(technical_data_lfs$area, setNames(diago, id))[lfs_samp_dwel$id_area] +technical_data_lfs$dwel <- lfs_samp_dwel[c("id_dwel", "pik_dwel", "id_area", "pik_area", "q_area")] + +# Technical data at the individual level +technical_data_lfs$ind <- lfs_samp_ind[c("id_ind", "id_dwel", "sampling_weight")] + +# Test of the variance function var_lfs +y <- matrix(as.numeric(lfs_samp_ind$unemp), ncol = 1, dimnames = list(lfs_samp_ind$id_ind)) +with(technical_data_lfs, var_lfs(y = y, ind = ind, dwel = dwel, area = area)) + + +# Step 2 : Definition of the variance wrapper + +# Call of define_variance_wrapper +precision_lfs <- define_variance_wrapper( + variance_function = var_lfs, + technical_data = technical_data_lfs, + reference_id = technical_data_lfs$ind$id_ind, + reference_weight = technical_data_lfs$ind$sampling_weight, + default_id = "id_ind" +) + +# Test +precision_lfs(lfs_samp_ind, unemp) + +# The variance wrapper precision_lfs has the same features +# as variance wrappers produced by the qvar function (see +# qvar examples for more details). 
+ +} +\references{ +Rao, J.N.K (1975), "Unbiased variance estimation for multistage designs", + \emph{Sankhya}, C n°37 +} +\seealso{ +\code{\link{qvar}}, \code{\link[=standard_statistic_wrapper]{standard statistic wrappers}}, \code{\link{varDT}} +} +\author{ +Martin Chevalier +} diff --git a/man/ict_pop.Rd b/man/ict_pop.Rd index cb403ae..430e993 100644 --- a/man/ict_pop.Rd +++ b/man/ict_pop.Rd @@ -1,27 +1,28 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\docType{data} -\name{ict_pop} -\alias{ict_pop} -\title{Sampling frame of the Information and communication technologies (ICT) -survey in the information and communication sector (NACE rev 2 J section)} -\format{A data frame with 7670 observations and 4 variables: -\describe{ - \item{firm_id}{identifier of the firm} - \item{division}{identifier of the economic sub-sector} - \item{employees}{number of employees} - \item{turnover}{firm turnover, in thousand euros} -}} -\usage{ -ict_pop -} -\description{ -A (simulated) dataset containing basic identification information and -auxiliary variables for the sampling of the Information and communication -technologies (ICT) survey in the information and communication sector -(NACE rev 2 J section). 
-} -\seealso{ -\code{\link{ict_sample}} \code{\link{ict_survey}} -} -\keyword{datasets} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{ict_pop} +\alias{ict_pop} +\title{Sampling frame of the Information and communication technologies (ICT) +survey} +\format{A data frame with 7670 observations and 5 variables: +\describe{ + \item{firm_id}{identifier of the firm} + \item{division}{identifier of the economic sub-sector} + \item{employees}{number of employees} + \item{turnover}{firm turnover, in thousand euros} + \item{strata}{stratification variable} +}} +\usage{ +ict_pop +} +\description{ +A (simulated) dataset containing basic identification information and +auxiliary variables for the sampling of the Information and communication +technologies (ICT) survey in the information and communication sector +(NACE rev 2 J section). +} +\seealso{ +\code{\link{qvar}}, \code{\link{ict_sample}}, \code{\link{ict_survey}} +} +\keyword{datasets} diff --git a/man/ict_sample.Rd b/man/ict_sample.Rd index 225c5d2..130102a 100644 --- a/man/ict_sample.Rd +++ b/man/ict_sample.Rd @@ -1,31 +1,38 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\docType{data} -\name{ict_sample} -\alias{ict_sample} -\title{Sample of the Information and communication technologies (ICT) -survey in the information and communication sector (NACE rev 2 J section)} -\format{A data frame with 650 observations and 8 variables: -\describe{ - \item{firm_id}{identifier of the firm} - \item{division}{identifier of the economic sub-sector} - \item{employees}{number of employees} - \item{turnover}{firm turnover, in euros} - \item{w_sample}{sampling weight} - \item{resp}{boolean indicating whether the firm did respond to the survey or not} - \item{hrg}{homogeneous response group used for the unit non-response correction} - \item{response_prob_est}{response probability of the unit estimated using homogeneous response 
groups} - \item{w_nr}{weight after unit non-response correction} -}} -\usage{ -ict_sample -} -\description{ -A (simulated) dataset containing sampling information about the sample -of the Information and communication technologies (ICT) -survey in the information and communication sector (NACE rev 2 J section) -} -\seealso{ -\code{\link{ict_pop}} \code{\link{ict_survey}} -} -\keyword{datasets} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{ict_sample} +\alias{ict_sample} +\title{Sample of the Information and communication technologies (ICT) +survey} +\format{A data frame with 650 observations and 8 variables: +\describe{ + \item{firm_id}{identifier of the firm} + \item{division}{identifier of the economic sub-sector} + \item{employees}{number of employees} + \item{turnover}{firm turnover, in euros} + \item{strata}{stratification variable} + \item{w_sample}{sampling weight} + \item{scope}{boolean indicating whether the firm did belong to the scope of the survey or not} + \item{resp}{boolean indicating whether the firm did respond to the survey or not} + \item{nrc}{boolean indicating whether the firm did take part in the non-response correction process or not} + \item{hrg}{homogeneous response group used for the non-response correction} + \item{response_prob_est}{response probability of the unit estimated using homogeneous response groups} + \item{w_nrc}{weight after unit non-response correction} + \item{calib}{boolean indicating whether the firm was integrated in the calibration process or not (\code{TRUE} for all responding units)} + \item{N_58, N_59, N_60, N_61, N_62, N_63, turnover_58, turnover_59, turnover_60, turnover_61, turnover_62, turnover_63}{calibration variables (number of firms and turnover broken down by economic sub-sector)} + \item{w_calib}{calibrated weight} + \item{dissemination}{boolean indicating whether the unit appears in the dissemination file} +}} +\usage{ +ict_sample +} 
+\description{ +A (simulated) dataset containing sampling information about the sample +of the Information and communication technologies (ICT) +survey in the information and communication sector (NACE rev 2 J section) +} +\seealso{ +\code{\link{qvar}}, \code{\link{ict_pop}}, \code{\link{ict_survey}} +} +\keyword{datasets} diff --git a/man/ict_survey.Rd b/man/ict_survey.Rd index 80478ec..dfb461c 100644 --- a/man/ict_survey.Rd +++ b/man/ict_survey.Rd @@ -1,33 +1,30 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\docType{data} -\name{ict_survey} -\alias{ict_survey} -\title{Survey data of the Information and communication technologies (ICT) -survey in the information and communication sector (NACE rev 2 J section)} -\format{A data frame with 612 observations and 25 variables: -\describe{ - \item{firm_id}{identifier of the firm} - \item{division}{identifier of the economic sub-sector} - \item{employees}{number of employees} - \item{turnover}{firm turnover, in euros} - \item{w_sample}{sampling weight} - \item{w_nr}{weight after unit non-response correction} - \item{N_58, N_59, N_60, N_61, N_62, N_63, turnover_58, turnover_59, turnover_60, turnover_61, turnover_62, turnover_63}{calibration variables (number of firms and turnover broken down by economic sub-sector)} - \item{w_calib}{calibrated weight} - \item{speed_quanti, speed_quanti_NA}{internet connection speed of the firm in Mbits, without or with missing values} - \item{speed_quali, speed_quali_NA}{internet connection speed of the firm recoded in classes, without or with missing values} - \item{big_data, big_data_NA}{use of big data analytics within the firm, without or with missing values} -}} -\usage{ -ict_survey -} -\description{ -A (simulated) dataset containing calibration and survey variables of the respondents -to the Information and communication technologies (ICT) -survey in the information and communication sector (NACE rev 2 J section) -} -\seealso{ 
-\code{\link{ict_pop}} \code{\link{ict_sample}} -} -\keyword{datasets} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{ict_survey} +\alias{ict_survey} +\title{Survey data of the Information and communication technologies (ICT) +survey} +\format{A data frame with 612 observations and 11 variables: +\describe{ + \item{firm_id}{identifier of the firm} + \item{division}{identifier of the economic sub-sector} + \item{employees}{number of employees} + \item{turnover}{firm turnover, in euros} + \item{w_calib}{calibrated weight} + \item{speed_quanti, speed_quanti_NA}{internet connection speed of the firm in Mbps, without or with missing values} + \item{speed_quali, speed_quali_NA}{internet connection speed of the firm recoded in classes, without or with missing values} + \item{big_data, big_data_NA}{use of big data analytics within the firm, without or with missing values} +}} +\usage{ +ict_survey +} +\description{ +A (simulated) dataset containing calibration and survey variables of the respondents +to the Information and communication technologies (ICT) +survey in the information and communication sector (NACE rev 2 J section) +} +\seealso{ +\code{\link{qvar}}, \code{\link{ict_pop}}, \code{\link{ict_sample}} +} +\keyword{datasets} diff --git a/man/lfs_samp_area.Rd b/man/lfs_samp_area.Rd new file mode 100644 index 0000000..f2c3c72 --- /dev/null +++ b/man/lfs_samp_area.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{lfs_samp_area} +\alias{lfs_samp_area} +\title{Sample of areas in the Labour force survey} +\format{A data frame with 4 observations and 3 variables: +\describe{ + \item{id_area}{identifier of the area} + \item{income}{total annual income of the area in thousand euros (from income registry)} + \item{pik_area}{first-order inclusion probability of the area (proportional to the number of dwellings in the area)} +}} +\usage{ 
+lfs_samp_area +} +\description{ +A (simulated) dataset containing information about 4 geographical +areas (about 120 dwellings each) sampled for the labour force survey. +} +\seealso{ +\code{\link{define_variance_wrapper}}, \code{\link{lfs_samp_dwel}}, \code{\link{lfs_samp_ind}} +} +\keyword{datasets} diff --git a/man/lfs_samp_dwel.Rd b/man/lfs_samp_dwel.Rd new file mode 100644 index 0000000..bdb250e --- /dev/null +++ b/man/lfs_samp_dwel.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{lfs_samp_dwel} +\alias{lfs_samp_dwel} +\title{Sample of dwellings in the Labour force survey} +\format{A data frame with 80 observations and 6 variables: +\describe{ + \item{id_dwel}{identifier of the dwelling} + \item{id_area}{identifier of the area} + \item{income}{total annual income of the dwelling in thousand euros (from income registry)} + \item{pik_area}{first-order inclusion probability of the area} + \item{pik_dwel}{first-order inclusion probability of the dwelling within the area (20 dwelling sampled per area)} + \item{pik}{first-order inclusion probability of the dwelling} +}} +\usage{ +lfs_samp_dwel +} +\description{ +A (simulated) dataset containing information about 80 dwellings +sampled for the Labour force survey (in the 4 areas described +in \code{\link{lfs_samp_area}}). 
+} +\seealso{ +\code{\link{define_variance_wrapper}}, \code{\link{lfs_samp_area}}, \code{\link{lfs_samp_ind}} +} +\keyword{datasets} diff --git a/man/lfs_samp_ind.Rd b/man/lfs_samp_ind.Rd new file mode 100644 index 0000000..dedcbcb --- /dev/null +++ b/man/lfs_samp_ind.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{lfs_samp_ind} +\alias{lfs_samp_ind} +\title{Sample of individuals in the Labour force survey} +\format{A data frame with 157 observations and 5 variables: +\describe{ + \item{id_ind}{identifier of the individual} + \item{id_dwel}{identifier of the dwelling} + \item{income}{total annual income of the individual in thousand euros (from income registry)} + \item{unemp}{unemployment status} + \item{sampling_weight}{sampling weight of the individual (inverse of the first-order inclusion probability of the dwelling)} +}} +\usage{ +lfs_samp_ind +} +\description{ +A (simulated) dataset containing information about 157 individuals +sampled for the Labour force survey (all members of the 80 dwellings +described in \code{\link{lfs_samp_dwel}}). It also contains the +unemployment status extracted from the survey file (no non-response). 
+} +\seealso{ +\code{\link{define_variance_wrapper}}, \code{\link{lfs_samp_area}}, \code{\link{lfs_samp_dwel}} +} +\keyword{datasets} diff --git a/man/qvar.Rd b/man/qvar.Rd new file mode 100644 index 0000000..0971cce --- /dev/null +++ b/man/qvar.Rd @@ -0,0 +1,266 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/qvar.R +\name{qvar} +\alias{qvar} +\title{Quickly perform a variance estimation in common cases} +\usage{ +qvar(data, ..., by = NULL, where = NULL, alpha = 0.05, + display = TRUE, id, dissemination_dummy, dissemination_weight, + sampling_weight, strata = NULL, scope_dummy = NULL, + nrc_weight = NULL, response_dummy = NULL, nrc_dummy = NULL, + calibration_weight = NULL, calibration_dummy = NULL, + calibration_var = NULL, define = FALSE, envir = parent.frame()) +} +\arguments{ +\item{data}{The \code{data.frame} containing all the technical information +required to prepare the variance estimation process (see other arguments +below). Note that this file should contain all the units sampled, +including the out-of-scope and non-responding units. If a variance +estimation is to be performed right away (when \code{define = FALSE}), +it should also contain the variables of interest.} + +\item{...}{One or more calls to a statistic wrapper (e.g. \code{total()}, +\code{mean()}, \code{ratio()}). See examples and +\code{\link[=standard_statistic_wrapper]{standard statistic wrappers}}} + +\item{by}{A qualitative variable whose levels are used to define domains +on which the variance estimation is performed.} + +\item{where}{A logical vector indicating a domain on which the variance +estimation is to be performed.} + +\item{alpha}{A numeric vector of length 1 indicating the threshold +for confidence interval derivation (\code{0.05} by default).} + +\item{display}{A logical vector of length 1 indicating whether +the result of the estimation should be displayed or not.} + +\item{id}{The identification variable of the units in \code{data}. 
+It should be unique for each row in \code{data} and not contain any +missing values.} + +\item{dissemination_dummy}{A character vector of length 1, the name +of the logical variable in \code{data} indicating whether the unit +does appear in the disseminated file and should be used for point +estimates. It should not contain any missing values.} + +\item{dissemination_weight}{A character vector of length 1, the name +of the numerical variable in \code{data} corresponding to the +dissemination weight of the survey. It should not contain any missing +values.} + +\item{sampling_weight}{A character vector of length 1, the name of the +numeric variable in \code{data} corresponding to the sampling weights +of the survey. It should not contain any missing values.} + +\item{strata}{A character vector of length 1, the name of the factor +variable in \code{data} whose level match the stratification +used in the survey. Character variables are coerced to factor. +If defined, it should not contain any missing value. If \code{NULL}, +the variance estimation process does not take any stratification +into account.} + +\item{scope_dummy}{A character vector of length 1, the name of the logical +variable in \code{data} indicating whether the unit belongs to the +scope of the survey or not. Numerical variables are coerced to logical. +If defined, it should not contain any missing value. If \code{NULL}, +all units are supposed to be within the scope of the survey.} + +\item{nrc_weight}{A character vector of length 1, the name of the +numerical variable in \code{data} corresponding to the weights +after non-response correction. If defined, all responding units +should have a non-missing value. 
 If \code{NULL}, all +units are supposed to be responding and the variance estimation +process does not include a second phase in order to take non-response +into account.} + +\item{response_dummy}{A character vector of length 1, the name of the logical +variable in \code{data} indicating whether the unit is a responding +unit or not. Numerical variables are coerced to logical. \code{response_dummy} +should be defined as long as a \code{nrc_weight} is provided. All units +in the scope of the survey should have a non-missing value.} + +\item{nrc_dummy}{A character vector of length 1, the name of +the logical variable in \code{data} indicating whether the +units did take part in the non-response correction process. +All units in the scope of the survey should have a non-missing +value.} + +\item{calibration_weight}{A character vector of length 1, the name of the +numerical variable in \code{data} corresponding to the calibrated +weights. If defined, all responding units should have +a non-missing value. If \code{NULL}, the variance estimation +process does not take any calibration step into account.} + +\item{calibration_dummy}{A character vector of length 1, the name of the logical +variable in \code{data} indicating whether the unit did take part +in the calibration process or not. Numerical variables are coerced to +logical. If defined, all responding units should have a non-missing +value. If \code{NULL}, calibration is supposed to have been performed +on all responding units.} + +\item{calibration_var}{A character vector, the name of the variable(s) used in +the calibration process. Logical variables are coerced to numeric. +Character and factor variables are automatically discretized. +\code{calibration_var} should be defined as long as a \code{calibration_weight} is +provided. All units taking part in the calibration process should have +only non-missing values for all variables in \code{calibration_var}.} + +\item{define}{Logical vector of length 1. 
Should a variance wrapper +be defined instead of performing a variance estimation (see details +and examples)?} + +\item{envir}{An environment containing a binding to \code{data}.} +} +\description{ +\code{qvar} (for "quick variance estimation") is a function + performing analytical variance estimation in most common cases, that is: + \itemize{\item stratified simple random sampling \item non-response + correction (if any) through reweighting \item calibration (if any)} + +Used with \code{define = TRUE}, it defines a so-called variance wrapper, that +is a standalone ready-to-use function that can be applied to the survey dataset +without having to specify the methodological characteristics of the survey +(see \code{\link{define_variance_wrapper}}). +} +\details{ +\code{qvar} performs not only technical but also + methodological checks in order to ensure that the standard variance + estimation methodology does apply (e.g. equal probability of + inclusion within strata, number of units per stratum). + + Used with \code{define = TRUE}, the function returns a variance + estimation \emph{wrapper}, that is a ready-to-use function that + implements the described variance estimation methodology and + contains all necessary data to do so (see examples). + + Note: To some extent, \code{qvar} is analogous to the \code{qplot} function + in the ggplot2 package, as it is an easier-to-use function for common + cases. More complex cases are to be handled by using the core functions of + the gustave package, e.g. \code{\link{define_variance_wrapper}}. +} +\examples{ +### Example from the Information and communication technologies (ICT) survey + +# The (simulated) Information and communication technologies (ICT) survey +# has the following characteristics: +# - stratified one-stage sampling design +# - non-response correction through reweighting in homogeneous response groups +# - calibration on margins. 
+ +# The ict_survey data.frame is a (simulated) subset of the ICT +# survey file containing the variables of interest for the 612 +# responding firms. + +# The ict_sample data.frame is the (simulated) sample of 650 +# firms corresponding to the ict_survey file. It contains all +# technical information necessary to estimate a variance with +# the qvar() function. + +## Methodological description of the survey + +# Direct call of qvar() +qvar( + + # Sample file + data = ict_sample, + + # Dissemination and identification information + dissemination_dummy = "dissemination", + dissemination_weight = "w_calib", + id = "firm_id", + + # Scope + scope_dummy = "scope", + + # Sampling design + sampling_weight = "w_sample", + strata = "strata", + + # Non-response correction + nrc_weight = "w_nrc", + response_dummy = "resp", + hrg = "hrg", + + # Calibration + calibration_weight = "w_calib", + calibration_var = c(paste0("N_", 58:63), paste0("turnover_", 58:63)), + + # Statistic(s) and variable(s) of interest + mean(employees) + +) + +# Definition of a variance estimation wrapper +precision_ict <- qvar( + + # As before + data = ict_sample, + dissemination_dummy = "dissemination", + dissemination_weight = "w_calib", + id = "firm_id", + scope_dummy = "scope", + sampling_weight = "w_sample", + strata = "strata", + nrc_weight = "w_nrc", + response_dummy = "resp", + hrg = "hrg", + calibration_weight = "w_calib", + calibration_var = c(paste0("N_", 58:63), paste0("turnover_", 58:63)), + + # Replacing the variables of interest by define = TRUE + define = TRUE + +) + +# Use of the variance estimation wrapper +precision_ict(ict_sample, mean(employees)) + +# The variance estimation wrapper can also be used on the survey file +precision_ict(ict_survey, mean(speed_quanti)) + +## Features of the variance estimation wrapper + +# Several statistics in one call (with optional labels) +precision_ict(ict_survey, + "Mean internet speed in Mbps" = mean(speed_quanti), + "Turnover per employee" = 
ratio(turnover, employees) +) + +# Domain estimation with where and by arguments +precision_ict(ict_survey, + mean(speed_quanti), + where = employees >= 50 +) +precision_ict(ict_survey, + mean(speed_quanti), + by = division +) + +# Domain may differ from one estimator to another +precision_ict(ict_survey, + "Mean turnover, firms with 50 employees or more" = mean(turnover, where = employees >= 50), + "Mean turnover, firms with 100 employees or more" = mean(turnover, where = employees >= 100) +) + +# On-the-fly evaluation (e.g. discretization) +precision_ict(ict_survey, mean(speed_quanti > 100)) + +# Automatic discretization for qualitative (character or factor) variables +precision_ict(ict_survey, mean(speed_quali)) + +# Standard evaluation capabilities +variables_of_interest <- c("speed_quanti", "speed_quali") +precision_ict(ict_survey, mean(variables_of_interest)) + +# Integration with \%>\% and dplyr +library(magrittr) +library(dplyr) +ict_survey \%>\% + precision_ict("Internet speed above 100 Mbps" = mean(speed_quanti > 100)) \%>\% + select(label, est, lower, upper) + +} +\seealso{ +\code{\link{define_variance_wrapper}}, \code{\link{standard_statistic_wrapper}} +} diff --git a/man/rescal.Rd b/man/res_cal.Rd similarity index 66% rename from man/rescal.Rd rename to man/res_cal.Rd index c0f7f30..dab6ee1 100644 --- a/man/rescal.Rd +++ b/man/res_cal.Rd @@ -1,115 +1,102 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/variance_function.R -\name{rescal} -\alias{rescal} -\title{Linear Regression Residuals Calculation} -\usage{ -rescal(y = NULL, x, w = NULL, by = NULL, collinearity.check = NULL, - precalc = NULL) -} -\arguments{ -\item{y}{A numerical matrix of dependent variable(s). May be a -Matrix::TsparseMatrix.} - -\item{x}{A numerical matrix of independent variable(s). 
May be a -Matrix::TsparseMatrix.} - -\item{w}{An optional numerical vector of row weights.} - -\item{by}{An optional categorical vector (factor or character) -when residuals calculation is to be conducted within by-groups -(see Details).} - -\item{collinearity.check}{A boolean (\code{TRUE} or \code{FALSE}) or -\code{NULL} indicating whether to perform a check for collinearity or -not (see Details).} - -\item{precalc}{A list of pre-calculated results (see Details).} -} -\value{ -\itemize{ \item if \code{y} is not \code{NULL} (calculation step) : a - numerical matrix with same structure (regular base::matrix or - Matrix::TsparseMatrix) and dimensions as \code{y}. \item if \code{y} is - \code{NULL} (pre-calculation step) : a list containing pre-calculated data: - \itemize{ \item \code{x}: the numerical matrix of independent variables. - \item \code{w}: the numerical vector of row weights (vector of 1 by - default). \item \code{inv}: the inverse of \code{t(x) \%*\% - Matrix::Diagonal(x = w) \%*\% x} } } -} -\description{ -\code{rescal} calculates linear regression residuals in an -efficient way : handling several dependent variables at a time, using -Matrix::TsparseMatrix capabilities and allowing for pre-calculation of -the matrix inverse. -} -\details{ -In the context of the \code{gustave} package, linear -regression residual calculation is solely used to take into account -the effect of calibration on variance estimation. Independent variables -are therefore most likely to be the same from one variance estimation -to another, hence the inversion of the matrix -\code{t(x) \%*\% Diagonal(x = w) \%*\% x} can be done once and for all -at a pre-calculation step. 
- -The parameters \code{y} and \code{precalc} determine whether a list of -pre-calculated data should be used in order to speed up the regression -residuals computation at execution time: -\itemize{ - \item if \code{y} not \code{NULL} and \code{precalc} \code{NULL} : - on-the-fly calculation of the matrix inverse and the regression residuals - (no pre-calculation). - \item if \code{y} \code{NULL} and \code{precalc} \code{NULL} : - pre-calculation of the matrix inverse which is stored in a list of - pre-calculated data. - \item if \code{y} not \code{NULL} and \code{precalc} not \code{NULL} : - calculation of the regression residuals using the list of pre-calculated - data. -} - -The \code{by} parameter allows for calculation within by-groups : all -calculation are made separately for each by-group (when calibration was -conducted separately on several subsamples), but in an efficient way using -Matrix::TsparseMatrix capabilities (especially when the matrix inverse is -pre-calculated). - -If \code{collinearity.check} is \code{NULL}, a test for collinearity in the -independent variables (\code{x}) is conducted if and only if \code{det(t(x) -\%*\% x) == 0}. 
-} -\examples{ -# Generating random data -set.seed(1) -n <- 100 -H <- 5 -y <- matrix(rnorm(2*n), nrow = n) -x <- matrix(rnorm(10*n), nrow = n) -by <- letters[sample(1:H, n, replace = TRUE)] - -# Direct calculation -rescal(y, x) - -# Calculation with pre-calculated data -precalc <- rescal(y = NULL, x) -rescal(y, precalc = precalc) -identical(rescal(y, x), rescal(y, precalc = precalc)) - -# Collinearity check -rescal(y, cbind(x, x[, 1]), collinearity.check = TRUE) - -# Matrix::TsparseMatrix capability -require(Matrix) -X <- as(x, "TsparseMatrix") -Y <- as(y, "TsparseMatrix") -rescal(Y, X) - -# by parameter for within by-groups calculation -rescal(Y, X, by = by) -identical( - rescal(Y, X, by = by)[by == "a", ] - , rescal(Y[by == "a", ], X[by == "a", ]) -) - -} -\author{ -Martin Chevalier -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/variance_function.R +\name{res_cal} +\alias{res_cal} +\title{Linear Regression Residuals Calculation} +\usage{ +res_cal(y = NULL, x, w = NULL, by = NULL, precalc = NULL, + id = NULL) +} +\arguments{ +\item{y}{A (sparse) numerical matrix of dependent variable(s).} + +\item{x}{A (sparse) numerical matrix of independent variable(s).} + +\item{w}{An optional numerical vector of row weights.} + +\item{by}{An optional categorical vector (factor or character) +when residuals calculation is to be conducted within by-groups +(see Details).} + +\item{precalc}{A list of pre-calculated results (see Details).} + +\item{id}{A vector of identifiers of the units used in the calculation. +Useful when \code{precalc = TRUE} in order to assess whether the ordering of the +\code{y} data matrix matches the one used at the precalculation step.} +} +\value{ +\itemize{ \item if \code{y} is not \code{NULL} (calculation step) : a + numerical matrix with same structure (regular base::matrix or + Matrix::TsparseMatrix) and dimensions as \code{y}. 
\item if \code{y} is + \code{NULL} (pre-calculation step) : a list containing pre-calculated data.} +} +\description{ +\code{res_cal} calculates linear regression residuals in an +efficient way : handling several dependent variables at a time, using +Matrix::TsparseMatrix capabilities and allowing for pre-calculation of +the matrix inverse. +} +\details{ +In the context of the \code{gustave} package, linear +regression residual calculation is solely used to take into account +the effect of calibration on variance estimation. Independent variables +are therefore most likely to be the same from one variance estimation +to another, hence the inversion of the matrix +\code{t(x) \%*\% Diagonal(x = w) \%*\% x} can be done once and for all +at a pre-calculation step. + +The parameters \code{y} and \code{precalc} determine whether a list of +pre-calculated data should be used in order to speed up the regression +residuals computation at execution time: +\itemize{ + \item if \code{y} not \code{NULL} and \code{precalc} \code{NULL} : + on-the-fly calculation of the matrix inverse and the regression residuals + (no pre-calculation). + \item if \code{y} \code{NULL} and \code{precalc} \code{NULL} : + pre-calculation of the matrix inverse which is stored in a list of + pre-calculated data. + \item if \code{y} not \code{NULL} and \code{precalc} not \code{NULL} : + calculation of the regression residuals using the list of pre-calculated + data. +} + +The \code{by} parameter allows for calculation within by-groups : all +calculation are made separately for each by-group (when calibration was +conducted separately on several subsamples), but in an efficient way using +Matrix::TsparseMatrix capabilities (especially when the matrix inverse is +pre-calculated). 
+} +\examples{ +# Generating random data +set.seed(1) +n <- 100 +H <- 5 +y <- matrix(rnorm(2*n), nrow = n) +x <- matrix(rnorm(10*n), nrow = n) +by <- letters[sample(1:H, n, replace = TRUE)] + +# Direct calculation +res_cal(y, x) + +# Calculation with pre-calculated data +precalc <- res_cal(y = NULL, x) +res_cal(y, precalc = precalc) +identical(res_cal(y, x), res_cal(y, precalc = precalc)) + +# Matrix::TsparseMatrix capability +require(Matrix) +X <- as(x, "TsparseMatrix") +Y <- as(y, "TsparseMatrix") +identical(res_cal(y, x), as.matrix(res_cal(Y, X))) + +# by parameter for within by-groups calculation +res_cal(Y, X, by = by) +identical( + res_cal(Y, X, by = by)[by == "a", ], + res_cal(Y[by == "a", ], X[by == "a", ]) +) + +} +\author{ +Martin Chevalier +} diff --git a/man/linearization_wrapper_standard.Rd b/man/standard_statistic_wrapper.Rd similarity index 52% rename from man/linearization_wrapper_standard.Rd rename to man/standard_statistic_wrapper.Rd index 249ad38..1757796 100644 --- a/man/linearization_wrapper_standard.Rd +++ b/man/standard_statistic_wrapper.Rd @@ -1,82 +1,74 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/linearization_wrapper_standard.R -\name{linearization_wrapper_standard} -\alias{linearization_wrapper_standard} -\alias{total} -\alias{ratio} -\alias{mean} -\alias{diff_or_ratio} -\alias{ratio_of_ratio} -\alias{total} -\alias{ratio} -\alias{mean} -\alias{diff_of_ratio} -\alias{ratio_of_ratio} -\title{Standard linearization wrappers} -\usage{ -total(y, by = NULL, where = NULL, ...) - -ratio(num, denom, by = NULL, where = NULL, ...) - -mean(y, by = NULL, where = NULL, ...) - -diff_of_ratio(num1, denom1, num2, denom2, by = NULL, where = NULL, ...) - -ratio_of_ratio(num1, denom1, num2, denom2, by = NULL, where = NULL, ...) -} -\arguments{ -\item{y}{A vector corresponding to the (sole) variable to estimate -variance on. 
If not numeric (character or factor), it is -automatically discretized.} - -\item{by}{Factor vector (character vectors are coerced to factors) whose levels are used -to break down the estimation by domains.} - -\item{where}{Logical vector indicating the domain to perform variance estimation on.} - -\item{...}{Technical parameters passed on to helper functions -within the linearization wrapper.} - -\item{num, num1, num2}{Numerical vector(s) corresponding to the numerator(s) -to be used in the estimation.} - -\item{denom, denom1, denom2}{Numerical vector(s) corresponding to the denominator(s) -to be used in the estimation.} -} -\description{ -Functions to be used within variance estimation - wrappers in order to perform on-the-fly linearizations (see Details). -} -\details{ -When the estimator is not the estimator of a total, the application of - analytical variance estimation formulae developed for the estimator of a total - is not straightforward (Deville, 1999). An asymptotically unbiased variance - estimator can nonetheless be obtained if the estimation of variance is performed - on a variable obtained from the original data through a linerization step. - - The \code{ratio}, \code{mean}, \code{diff_of_ratio} and - \code{ratio_of_ratio} functions implement the standard linearization - techniques respectively for the ratio, mean, difference of ratios and - ratio of ratios estimators, as presented for example in (Caron, 1998). - The \code{total} function does not perform any linearization - (as none is needed for the estimator of a total) and solely adds the technical - features required to use the linearization wrapper within the \code{\link[=define_variance_wrapper]{variance wrappers}}. -} -\examples{ -# See define_variance_wrapper examples - -} -\references{ -Caron N. (1998), "Le logiciel Poulpe : aspects méthodologiques", \emph{Actes - des Journées de méthodologie statistique} \url{http://jms-insee.fr/jms1998s03_1/} - - Deville J.-C. 
(1999), "Variance estimation for complex statistics and - estimators: linearization and residual techniques", \emph{Survey Methodology}, - 25:193–203 -} -\seealso{ -\code{\link{define_variance_wrapper}} -} -\author{ -Martin Chevalier -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/standard_statistic_wrapper.R +\name{standard_statistic_wrapper} +\alias{standard_statistic_wrapper} +\alias{total} +\alias{ratio} +\alias{mean} +\alias{diff_of_ratio} +\alias{ratio_of_ratio} +\title{Standard statistic wrappers} +\usage{ +total(y, by = NULL, where = NULL) + +ratio(num, denom, by = NULL, where = NULL) + +mean(y, by = NULL, where = NULL) + +diff_of_ratio(num1, denom1, num2, denom2, by = NULL, where = NULL) + +ratio_of_ratio(num1, denom1, num2, denom2, by = NULL, where = NULL) +} +\arguments{ +\item{y}{A vector corresponding to the variable to calculate the statistic on. +If not numeric (character or factor), it is automatically discretized.} + +\item{by}{Factor vector (character vectors are coerced to factors) whose levels are used +to break down the estimation by domains.} + +\item{where}{Logical vector indicating the domain to perform variance estimation on.} + +\item{num, num1, num2}{Numerical vector(s) corresponding to the numerator(s) +to be used in the estimation.} + +\item{denom, denom1, denom2}{Numerical vector(s) corresponding to the denominator(s) +to be used in the estimation.} + +\description{ +Functions to be used within variance estimation + wrappers in order to specify which statistic is to be estimated. +} +\details{ +When the estimator is not the estimator of a total, the application of + analytical variance estimation formulae developed for the estimator of a total + is not straightforward (Deville, 1999). An asymptotically unbiased variance + estimator can nonetheless be obtained if the estimation of variance is performed + on a variable obtained from the original data through a linearization step. 
+ + The \code{ratio}, \code{mean}, \code{diff_of_ratio} and \code{ratio_of_ratio} + functions produce the point estimate of the statistic and derive the + corresponding linearized variable which is later on passed on to the variance + estimation function within the variance estimation wrapper. + + Note: The \code{total} function does not perform any linearization + (as none is needed for the estimator of a total) and solely produces the + corresponding point estimator. +} +\examples{ +# See qvar examples + +} +\references{ +Caron N. (1998), "Le logiciel Poulpe : aspects méthodologiques", \emph{Actes + des Journées de méthodologie statistique} \url{http://jms-insee.fr/jms1998s03_1/} + + Deville J.-C. (1999), "Variance estimation for complex statistics and + estimators: linearization and residual techniques", \emph{Survey Methodology}, + 25:193–203 +} +\seealso{ +\code{\link{define_statistic_wrapper}}, \code{\link{define_variance_wrapper}} +} +\author{ +Martin Chevalier +} diff --git a/man/sumby.Rd b/man/sum_by.Rd similarity index 50% rename from man/sumby.Rd rename to man/sum_by.Rd index f6e81f7..1b6cf9a 100644 --- a/man/sumby.Rd +++ b/man/sum_by.Rd @@ -1,67 +1,67 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{sumby} -\alias{sumby} -\title{Efficient by-group (weighted) summation} -\usage{ -sumby(y, by, w = NULL, na_rm = TRUE, keep_sparse = FALSE) -} -\arguments{ -\item{y}{A (sparse) vector, a (sparse) matrix or a data.frame. -The object to perform by-group summation on.} - -\item{by}{The factor variable defining the by-groups. Character variables -are coerced to factors.} - -\item{w}{The optional weight to be used in the summation.} - -\item{na_rm}{Should NA values in y be removed (ie treated as 0 in the summation) ? -Similar to na.rm argument in \code{\link[base]{sum}}, but TRUE by default. 
-If FALSE, NA values in y produce NA values in the result.} - -\item{keep_sparse}{When y is a sparse vector or a sparse matrix, should the result -also be sparse ? FALSE by default. As \code{\link[Matrix]{sparseVector-class}} does -not have a name attribute, when y is a sparseVector the result does not have any -name (and a warning is cast).} -} -\value{ -A vector, a matrix or a data.frame depending on the type of y. If y is -sparse and keep_sparse is TRUE, then the result is also sparse (without names -when it is a sparse vector, see keep_sparse argument for details). -} -\description{ -\code{sumby()} performs an efficient and optionally weighted -by-group summation by using linear algebra and the Matrix package -capabilities. The by-group summation is performed through matrix cross-product -of the y parameter (coerced to a matrix if needed) with a (very) sparse -matrix built up using the by and the (optional) w parameters. - -Compared to base R, dplyr or data.table alternatives, this implementation -aims at being easier to use in a matrix-oriented context and can yield -efficiency gains when the number of columns becomes high. -} -\examples{ -# Data generation -set.seed(1) -n <- 100 -p <- 10 -H <- 3 -y <- matrix(rnorm(n*p), ncol = p, dimnames = list(NULL, paste0("var", 1:10))) -y[1, 1] <- NA -by <- letters[sample.int(H, n, replace = TRUE)] -w <- rep(1, n) -w[by == "a"] <- 2 - -# Standard use -sumby(y, by) - -# Keeping the NAs -sumby(y, by, na_rm = FALSE) - -# With a weight -sumby(y, by, w = w) - -} -\author{ -Martin Chevalier -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{sum_by} +\alias{sum_by} +\title{Efficient by-group (weighted) summation} +\usage{ +sum_by(y, by, w = NULL, na_rm = TRUE, keep_sparse = FALSE) +} +\arguments{ +\item{y}{A (sparse) vector, a (sparse) matrix or a data.frame. +The object to perform by-group summation on.} + +\item{by}{The factor variable defining the by-groups. 
Character variables +are coerced to factors.} + +\item{w}{The optional row weights to be used in the summation.} + +\item{na_rm}{Should \code{NA} values in \code{y} be removed (ie treated as 0 in the summation) ? +Similar to \code{na.rm} argument in \code{\link[base]{sum}}, but \code{TRUE} by default. +If \code{FALSE}, \code{NA} values in \code{y} produce \code{NA} values in the result.} + +\item{keep_sparse}{When \code{y} is a sparse vector or a sparse matrix, should the result +also be sparse ? \code{FALSE} by default. As \code{\link[Matrix]{sparseVector-class}} does +not have a name attribute, when \code{y} is a sparseVector the result does not have any +name (and a warning is cast).} +} +\value{ +A vector, a matrix or a data.frame depending on the type of \code{y}. If \code{y} is +sparse and \code{keep_sparse = TRUE}, then the result is also sparse (without names +when it is a sparse vector, see keep_sparse argument for details). +} +\description{ +\code{sum_by} performs an efficient and optionally weighted +by-group summation by using linear algebra and the Matrix package +capabilities. The by-group summation is performed through matrix cross-product +of the \code{y} parameter (coerced to a matrix if needed) with a (very) sparse +matrix built up using the \code{by} and the (optional) \code{w} parameters. + +Compared to base R, dplyr or data.table alternatives, this implementation +aims at being easier to use in a matrix-oriented context and can yield +efficiency gains when the number of columns becomes high. 
+} +\examples{ +# Data generation +set.seed(1) +n <- 100 +p <- 10 +H <- 3 +y <- matrix(rnorm(n*p), ncol = p, dimnames = list(NULL, paste0("var", 1:10))) +y[1, 1] <- NA +by <- letters[sample.int(H, n, replace = TRUE)] +w <- rep(1, n) +w[by == "a"] <- 2 + +# Standard use +sum_by(y, by) + +# Keeping the NAs +sum_by(y, by, na_rm = FALSE) + +# With a weight +sum_by(y, by, w = w) + +} +\author{ +Martin Chevalier +} diff --git a/man/varDT.Rd b/man/varDT.Rd index 1f77e68..2fccca0 100644 --- a/man/varDT.Rd +++ b/man/varDT.Rd @@ -1,195 +1,186 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/variance_function.R -\name{varDT} -\alias{varDT} -\alias{var_srs} -\alias{var_srs} -\title{Variance approximation with Deville-Tillé (2005) formula} -\usage{ -varDT(y = NULL, pik, x = NULL, strata = NULL, w = NULL, - collinearity.check = NULL, precalc = NULL) - -var_srs(y, pik, strata = NULL, w = NULL, precalc = NULL) -} -\arguments{ -\item{y}{A numerical matrix of the variable(s) whose variance of their total -is to be estimated. May be a Matrix::TsparseMatrix.} - -\item{pik}{A numerical vector of first-order inclusion probabilities.} - -\item{x}{An optional numerical matrix of balancing variable(s). May be a -Matrix::TsparseMatrix.} - -\item{strata}{An optional categorical vector (factor or character) when -variance estimation is to be conducted within strata.} - -\item{w}{An optional numerical vector of row weights (see Details).} - -\item{collinearity.check}{A boolean (\code{TRUE} or \code{FALSE}) or -\code{NULL} indicating whether to perform a check for collinearity or not -(see Details).} - -\item{precalc}{A list of pre-calculated results (see Details).} -} -\value{ -\itemize{ \item if \code{y} is not \code{NULL} (calculation step) : - the estimated variances as a numerical vector of size the number of - columns of y. 
\item if \code{y} is \code{NULL} (pre-calculation step) : a list - containing pre-calculated data: \itemize{ \item \code{pik}: the numerical vector - of first-order inclusion probabilities. \item \code{A}: the numerical matrix - denoted A in (Deville, Tillé, 2005). \item \code{ck}: the numerical vector denoted - ck2 in (Deville, Tillé, 2005). \item \code{inv}: the inverse of \code{A \%*\% - Matrix::Diagonal(x = ck) \%*\% t(A)} \item \code{diago}: the diagonal term - of the variance estimator } } -} -\description{ -\code{varDT} estimates the variance of the estimator of a total - in the case of a balanced sampling design with equal or unequal probabilities. - Without balancing variables, it falls back to Deville's (1993) classical - approximation. Without balancing variables and with equal probabilities, it - falls back to the classical Horvitz-Thompson variance estimator for the total in - the case of simple random sampling. Stratification is natively supported. - - \code{var_srs} is a convenience wrapper for the (stratified) simple random - sampling case. -} -\details{ -\code{varDT} aims at being the workhorse of most variance estimation conducted - with the \code{gustave} package. It may be used to estimate the variance - of the estimator of a total in the case of (stratified) simple random sampling, - (stratified) unequal probability sampling and (stratified) balanced sampling. - The native integration of stratification based on Matrix::TsparseMatrix allows - for significant performance gains compared to higher level vectorizations - (\code{*apply} especially). - - Several time-consuming operations (e.g. collinearity-check, matrix - inversion) can be pre-calculated in order to speed up the estimation at - execution time. This is determined by the value of the parameters \code{y} - and \code{precalc}: \itemize{ \item if \code{y} not \code{NULL} and - \code{precalc} \code{NULL} : on-the-fly calculation (no pre-calculation). 
- \item if \code{y} \code{NULL} and \code{precalc} \code{NULL} : - pre-calculation whose results are stored in a list of pre-calculated data. - \item if \code{y} not \code{NULL} and \code{precalc} not \code{NULL} : - calculation using the list of pre-calculated data. } - - If \code{collinearity.check} is \code{NULL}, a test for collinearity in the - independent variables (\code{x}) is conducted only if \code{det(t(x) \%*\% - x) == 0)}. - - \code{w} is a row weight used at the final summation step. It is useful - when \code{varDT} or \code{var_srs} are used on the second stage of a - two-stage sampling design applying the Rao (1975) formula. -} -\section{Difference with \code{varest} from package \code{sampling}}{ - - - \code{varDT} differs from \code{sampling::varest} in several ways: - \itemize{ \item The formula implemented in \code{varDT} is more general and - encompasses balanced sampling. \item Even in its reduced - form (without balancing variables), the formula implemented in \code{varDT} - slightly differs from the one implemented in \code{sampling::varest}. - Caron, Deville and Sautory (1998, pp. 7-8) compares the two estimators - (\code{sampling::varest} implements V_2, \code{varDT} implements V_1). - \item \code{varDT} introduces several optimizations: \itemize{ \item - matrixwise operations allow to estimate variance on several interest - variables at once \item Matrix::TsparseMatrix capability and the native - integration of stratification yield significant performance gains. \item - the ability to pre-calculate some time-consuming operations speeds up the - estimation at execution time. } \item \code{varDT} does not natively - implements the calibration estimator (i.e. the sampling variance estimator - that takes into account the effect of calibration). 
In the context of the - \code{gustave} package, \code{\link{rescal}} could be called before - \code{varDT} in order to achieve the same result.} -} - -\examples{ -library(sampling) -set.seed(1) - -# Simple random sampling case -N <- 1000 -n <- 100 -y <- rnorm(N)[as.logical(srswor(n, N))] -pik <- rep(n/N, n) -varDT(y, pik) -sampling::varest(y, pik = pik) -N^2 * (1 - n/N) * var(y) / n - -# Unequal probability sampling case -N <- 1000 -n <- 100 -pik <- runif(N) -s <- as.logical(UPsystematic(pik)) -y <- rnorm(N)[s] -pik <- pik[s] -varDT(y, pik) -varest(y, pik = pik) -# The small difference is expected (see above). - -# Balanced sampling case -N <- 1000 -n <- 100 -pik <- runif(N) -x <- matrix(rnorm(N*3), ncol = 3) -s <- as.logical(samplecube(x, pik)) -y <- rnorm(N)[s] -pik <- pik[s] -x <- x[s, ] -varDT(y, pik, x) - -# Balanced sampling case (variable of interest -# among the balancing variables) -N <- 1000 -n <- 100 -pik <- runif(N) -y <- rnorm(N) -x <- cbind(matrix(rnorm(N*3), ncol = 3), y) -s <- as.logical(samplecube(x, pik)) -y <- y[s] -pik <- pik[s] -x <- x[s, ] -varDT(y, pik, x) -# As expected, the total of the variable of interest is perfectly estimated. - -# strata argument -n <- 100 -H <- 2 -pik <- runif(n) -y <- rnorm(n) -strata <- letters[sample.int(H, n, replace = TRUE)] -all.equal( - varDT(y, pik, strata = strata) - , varDT(y[strata == "a"], pik[strata == "a"]) + varDT(y[strata == "b"], pik[strata == "b"]) -) - -# precalc argument -n <- 1000 -H <- 50 -pik <- runif(n) -y <- rnorm(n) -strata <- sample.int(H, n, replace = TRUE) -precalc <- varDT(y = NULL, pik, strata = strata) -identical( - varDT(y, precalc = precalc) - , varDT(y, pik, strata = strata) -) - -} -\references{ -Caron N., Deville J.-C., Sautory O. (1998), \emph{Estimation de - précision de données issues d'enquêtes : document méthodologique sur le - logiciel POULPE}, Insee working paper, n°9806 - - Deville, J.-C. 
(1993), \emph{Estimation de la variance pour les enquêtes en - deux phases}, Manuscript, INSEE, Paris. - - Deville, J.-C., Tillé, Y. (2005), "Variance approximation under balanced - sampling", \emph{Journal of Statistical Planning and Inference}, 128, issue - 2 569-591 - - Rao, J.N.K (1975), "Unbiased variance estimation for multistage designs", - \emph{Sankhya}, C n°37 -} -\author{ -Martin Chevalier -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/variance_function.R +\name{varDT} +\alias{varDT} +\alias{var_srs} +\title{Variance approximation with Deville-Tillé (2005) formula} +\usage{ +varDT(y = NULL, pik, x = NULL, strata = NULL, w = NULL, + precalc = NULL, id = NULL) + +var_srs(y, pik, strata = NULL, w = NULL, precalc = NULL) +} +\arguments{ +\item{y}{A (sparse) numerical matrix of the variable(s) whose variance of their total +is to be estimated.} + +\item{pik}{A numerical vector of first-order inclusion probabilities.} + +\item{x}{An optional (sparse) numerical matrix of balancing variable(s).} + +\item{strata}{An optional categorical vector (factor or character) when +variance estimation is to be conducted within strata.} + +\item{w}{An optional numerical vector of row weights (see Details).} + +\item{precalc}{A list of pre-calculated results (see Details).} + +\item{id}{A vector of identifiers of the units used in the calculation. +Useful when \code{precalc = TRUE} in order to assess whether the ordering of the +\code{y} data matrix matches the one used at the pre-calculation step.} +} +\value{ +\itemize{ \item if \code{y} is not \code{NULL} (calculation step) : + the estimated variances as a numerical vector of size the number of + columns of y. 
\item if \code{y} is \code{NULL} (pre-calculation step) : a list + containing pre-calculated data.} +} +\description{ +\code{varDT} estimates the variance of the estimator of a total + in the case of a balanced sampling design with equal or unequal probabilities + using Deville-Tillé (2005) formula. Without balancing variables, it falls back + to Deville's (1993) classical approximation. Without balancing variables and + with equal probabilities, it falls back to the classical Horvitz-Thompson + variance estimator for the total in the case of simple random sampling. + Stratification is natively supported. + + \code{var_srs} is a convenience wrapper for the (stratified) simple random + sampling case. +} +\details{ +\code{varDT} aims at being the workhorse of most variance estimation conducted + with the \code{gustave} package. It may be used to estimate the variance + of the estimator of a total in the case of (stratified) simple random sampling, + (stratified) unequal probability sampling and (stratified) balanced sampling. + The native integration of stratification based on Matrix::TsparseMatrix allows + for significant performance gains compared to higher level vectorizations + (\code{*apply} especially). + + Several time-consuming operations (e.g. collinearity-check, matrix + inversion) can be pre-calculated in order to speed up the estimation at + execution time. This is determined by the value of the parameters \code{y} + and \code{precalc}: \itemize{ \item if \code{y} not \code{NULL} and + \code{precalc} \code{NULL} : on-the-fly calculation (no pre-calculation). + \item if \code{y} \code{NULL} and \code{precalc} \code{NULL} : + pre-calculation whose results are stored in a list of pre-calculated data. + \item if \code{y} not \code{NULL} and \code{precalc} not \code{NULL} : + calculation using the list of pre-calculated data. } + + \code{w} is a row weight used at the final summation step. 
It is useful + when \code{varDT} or \code{var_srs} are used on the second stage of a + two-stage sampling design applying the Rao (1975) formula. +} +\section{Difference with \code{varest} from package \code{sampling}}{ + + + \code{varDT} differs from \code{sampling::varest} in several ways: + \itemize{ \item The formula implemented in \code{varDT} is more general and + encompasses balanced sampling. \item Even in its reduced + form (without balancing variables), the formula implemented in \code{varDT} + slightly differs from the one implemented in \code{sampling::varest}. + Caron (1998, pp. 178-179) compares the two estimators + (\code{sampling::varest} implements V_2, \code{varDT} implements V_1). + \item \code{varDT} introduces several optimizations: \itemize{ \item + matrixwise operations allow to estimate variance on several interest + variables at once \item Matrix::TsparseMatrix capability and the native + integration of stratification yield significant performance gains. \item + the ability to pre-calculate some time-consuming operations speeds up the + estimation at execution time. } \item \code{varDT} does not natively + implements the calibration estimator (i.e. the sampling variance estimator + that takes into account the effect of calibration). In the context of the + \code{gustave} package, \code{\link{res_cal}} should be called before + \code{varDT} in order to achieve the same result.} +} + +\examples{ +library(sampling) +set.seed(1) + +# Simple random sampling case +N <- 1000 +n <- 100 +y <- rnorm(N)[as.logical(srswor(n, N))] +pik <- rep(n/N, n) +varDT(y, pik) +sampling::varest(y, pik = pik) +N^2 * (1 - n/N) * var(y) / n + +# Unequal probability sampling case +N <- 1000 +n <- 100 +pik <- runif(N) +s <- as.logical(UPsystematic(pik)) +y <- rnorm(N)[s] +pik <- pik[s] +varDT(y, pik) +varest(y, pik = pik) +# The small difference is expected (see Details). 
+ +# Balanced sampling case +N <- 1000 +n <- 100 +pik <- runif(N) +x <- matrix(rnorm(N*3), ncol = 3) +s <- as.logical(samplecube(x, pik)) +y <- rnorm(N)[s] +pik <- pik[s] +x <- x[s, ] +varDT(y, pik, x) + +# Balanced sampling case (variable of interest +# among the balancing variables) +N <- 1000 +n <- 100 +pik <- runif(N) +y <- rnorm(N) +x <- cbind(matrix(rnorm(N*3), ncol = 3), y) +s <- as.logical(samplecube(x, pik)) +y <- y[s] +pik <- pik[s] +x <- x[s, ] +varDT(y, pik, x) +# As expected, the total of the variable of interest is perfectly estimated. + +# strata argument +n <- 100 +H <- 2 +pik <- runif(n) +y <- rnorm(n) +strata <- letters[sample.int(H, n, replace = TRUE)] +all.equal( + varDT(y, pik, strata = strata), + varDT(y[strata == "a"], pik[strata == "a"]) + varDT(y[strata == "b"], pik[strata == "b"]) +) + +# precalc argument +n <- 1000 +H <- 50 +pik <- runif(n) +y <- rnorm(n) +strata <- sample.int(H, n, replace = TRUE) +precalc <- varDT(y = NULL, pik, strata = strata) +identical( + varDT(y, precalc = precalc), + varDT(y, pik, strata = strata) +) + +} +\references{ +Caron N. (1998), "Le logiciel Poulpe : aspects méthodologiques", \emph{Actes + des Journées de méthodologie statistique} \url{http://jms-insee.fr/jms1998s03_1/} + Deville, J.-C. (1993), \emph{Estimation de la variance pour les enquêtes en + deux phases}, Manuscript, INSEE, Paris. + + Deville, J.-C., Tillé, Y. 
(2005), "Variance approximation under balanced + sampling", \emph{Journal of Statistical Planning and Inference}, 128, issue + 2 569-591 + + Rao, J.N.K (1975), "Unbiased variance estimation for multistage designs", + \emph{Sankhya}, C n°37 +} +\seealso{ +\code{\link{res_cal}} +} +\author{ +Martin Chevalier +} diff --git a/man/varSYG.Rd b/man/varSYG.Rd index d56fb98..c6bf9f4 100644 --- a/man/varSYG.Rd +++ b/man/varSYG.Rd @@ -1,68 +1,67 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/variance_function.R -\name{varSYG} -\alias{varSYG} -\title{Sen-Yates-Grundy variance estimator} -\usage{ -varSYG(y = NULL, pikl, precalc = NULL) -} -\arguments{ -\item{y}{A numerical matrix of the variable(s) whose variance of their total -is to be estimated. May be a Matrix::TsparseMatrix.} - -\item{pikl}{A numerical matrix of second-order inclusion probabilities.} - -\item{precalc}{A list of pre-calculated results (analogous to the one used by -\code{\link{varDT}}).} -} -\value{ -\itemize{ \item if \code{y} is not \code{NULL} (calculation step) : a - numerical vector of size the number of columns of y. \item if \code{y} is - \code{NULL} (pre-calculation step) : a list containing pre-calculated data - (analogous to the one used by \code{\link{varDT}}).} -} -\description{ -\code{varSYG} computes the Sen-Yates-Grundy -variance estimator (valid under the assumption that the sampling -design is of fixed size). -} -\details{ -\code{varSYG} aims at being an efficient implementation of the - Sen-Yates-Grundy variance estimator for sampling designs with fixed sample - size. It should be especially useful when several variance estimations are - to be conducted, as it relies on (sparse) matrix linear algebra. - - Moreover, in order to be consistent with \code{\link{varDT}}, \code{varSYG} - has a \code{precalc} argument allowing for the re-use of intermediary - results calculated once and for all in a pre-calculation step (see - \code{\link{varDT}} for details). 
-} -\section{Difference with \code{varHT} from package \code{sampling}}{ - - - \code{varSYG} differs from \code{sampling::varHT} in several ways: - \itemize{ \item The formula implemented in \code{varSYG} is solely - the Sen-Yates-Grundy estimator, which is the one calculated - by \code{varHT} when method = 2. - \item \code{varSYG} introduces several optimizations: \itemize{ \item - matrixwise operations allow to estimate variance on several interest - variables at once \item Matrix::TsparseMatrix capability yields significant - performance gains.}} -} - -\examples{ -library(sampling) -set.seed(1) - -# Simple random sampling case -N <- 1000 -n <- 100 -y <- rnorm(N)[as.logical(srswor(n, N))] -pikl <- matrix(rep((n*(n-1))/(N*(N-1)), n*n), nrow = n) -diag(pikl) <- rep(n/N, n) -varSYG(y, pikl) -sampling::varHT(y = y, pikl = pikl, method = 2) -} -\author{ -Martin Chevalier (Insee, French Statistical Institute) -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/variance_function.R +\name{varSYG} +\alias{varSYG} +\title{Sen-Yates-Grundy variance estimator} +\usage{ +varSYG(y = NULL, pikl, precalc = NULL) +} +\arguments{ +\item{y}{A (sparse) numerical matrix of the variable(s) whose variance of their total +is to be estimated.} + +\item{pikl}{A numerical matrix of second-order inclusion probabilities.} + +\item{precalc}{A list of pre-calculated results (analogous to the one used by +\code{\link{varDT}}).} +} +\value{ +\itemize{ \item if \code{y} is not \code{NULL} (calculation step) : a + numerical vector of size the number of columns of y. \item if \code{y} is + \code{NULL} (pre-calculation step) : a list containing pre-calculated data + (analogous to the one used by \code{\link{varDT}}).} +} +\description{ +\code{varSYG} computes the Sen-Yates-Grundy +variance estimator. +} +\details{ +\code{varSYG} aims at being an efficient implementation of the + Sen-Yates-Grundy variance estimator for sampling designs with fixed sample + size. 
It should be especially useful when several variance estimations are + to be conducted, as it relies on (sparse) matrix linear algebra. + + Moreover, in order to be consistent with \code{\link{varDT}}, \code{varSYG} + has a \code{precalc} argument allowing for the re-use of intermediary + results calculated once and for all in a pre-calculation step (see + \code{\link{varDT}} for details). +} +\section{Difference with \code{varHT} from package \code{sampling}}{ + + + \code{varSYG} differs from \code{sampling::varHT} in several ways: + \itemize{ \item The formula implemented in \code{varSYG} is solely + the Sen-Yates-Grundy estimator, which is the one calculated + by \code{varHT} when method = 2. + \item \code{varSYG} introduces several optimizations: \itemize{ \item + matrixwise operations allow to estimate variance on several interest + variables at once \item Matrix::TsparseMatrix capability yields significant + performance gains.}} +} + +\examples{ +library(sampling) +set.seed(1) + +# Simple random sampling case +N <- 1000 +n <- 100 +y <- rnorm(N)[as.logical(srswor(n, N))] +pikl <- matrix(rep((n*(n-1))/(N*(N-1)), n*n), nrow = n) +diag(pikl) <- rep(n/N, n) +varSYG(y, pikl) +sampling::varHT(y = y, pikl = pikl, method = 2) +} +\author{ +Martin Chevalier +} diff --git a/man/var_pois.Rd b/man/var_pois.Rd index 0cc93ca..419f310 100644 --- a/man/var_pois.Rd +++ b/man/var_pois.Rd @@ -1,36 +1,36 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/variance_function.R -\name{var_pois} -\alias{var_pois} -\title{Variance estimator for a Poisson sampling design} -\usage{ -var_pois(y, pik, w = NULL) -} -\arguments{ -\item{y}{A numerical matrix of the variable(s) whose variance of their total -is to be estimated. 
May be a Matrix::TsparseMatrix.} - -\item{pik}{A numerical vector of first-order inclusion probabilities.} - -\item{w}{An optional numerical vector of row weights (see Details).} -} -\value{ -The estimated variances as a numerical vector of size the number of - columns of y. -} -\description{ -\code{var_pois} estimates the variance of the estimator -of a total for a Poisson sampling design. -} -\details{ -\code{w} is a row weight used at the final summation step. It is useful - when \code{var_pois} is used on the second stage of a two-stage sampling - design applying the Rao (1975) formula. -} -\references{ -Rao, J.N.K (1975), "Unbiased variance estimation for multistage designs", - \emph{Sankhya}, C n°37 -} -\author{ -Martin Chevalier -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/variance_function.R +\name{var_pois} +\alias{var_pois} +\title{Variance estimator for a Poisson sampling design} +\usage{ +var_pois(y, pik, w = rep(1, length(pik))) +} +\arguments{ +\item{y}{A (sparse) numerical matrix of the variable(s) whose variance of their total +is to be estimated.} + +\item{pik}{A numerical vector of first-order inclusion probabilities.} + +\item{w}{An optional numerical vector of row weights (see Details).} +} +\value{ +The estimated variances as a numerical vector of size the number of + columns of \code{y}. +} +\description{ +\code{var_pois} estimates the variance of the estimator +of a total for a Poisson sampling design. +} +\details{ +\code{w} is a row weight used at the final summation step. It is useful + when \code{var_pois} is used on the second stage of a two-stage sampling + design applying the Rao (1975) formula. 
+} +\references{ +Rao, J.N.K (1975), "Unbiased variance estimation for multistage designs", + \emph{Sankhya}, C n°37 +} +\author{ +Martin Chevalier +} diff --git a/tests/testthat.R b/tests/testthat.R index 8643c29..79de22f 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,4 +1,4 @@ -library(testthat) -library(gustave) - -test_check("gustave") +library(testthat) +library(gustave) + +test_check("gustave") diff --git a/tests/testthat/test_data.R b/tests/testthat/test_data.R index 1ce3394..45c3a9d 100644 --- a/tests/testthat/test_data.R +++ b/tests/testthat/test_data.R @@ -1,16 +1,16 @@ - -context("data") - -test_that("non-response correction and calibration went well", { - expect_equal(sum(ict_sample$w_sample), sum(ict_sample$w_nr[ict_sample$resp])) - expect_equal(sum(ict_sample$w_sample), sum(ict_survey$w_nr)) - expect_equal(sum(ict_sample$w_sample), sum(ict_survey$w_calib)) - expect_equal( - tapply(ict_pop$turnover, ict_pop$division, sum), - tapply(ict_survey$turnover * ict_survey$w_calib, ict_survey$division, sum) - ) - expect_equal( - tapply(ict_pop$firm_id, ict_pop$division, length), - tapply(ict_survey$w_calib, ict_survey$division, sum) - ) + +context("data") + +test_that("example data are OK", { + expect_equal(sum(ict_sample$w_sample[ict_sample$scope]), sum(ict_sample$w_nrc[ict_sample$resp])) + expect_true(all(ict_sample$response_prob_est <= 1, na.rm = TRUE)) + expect_equal(sum(ict_sample$w_sample), sum(ict_sample$w_calib)) + expect_equal( + tapply(ict_pop$turnover, ict_pop$division, sum), + tapply(ict_sample$turnover * ict_sample$w_calib, ict_sample$division, sum) + ) + expect_equal( + tapply(ict_pop$firm_id, ict_pop$division, length), + tapply(ict_sample$w_calib, ict_sample$division, sum) + ) }) \ No newline at end of file diff --git a/tests/testthat/test_define_statistic_wrapper.R b/tests/testthat/test_define_statistic_wrapper.R new file mode 100644 index 0000000..91120a7 --- /dev/null +++ b/tests/testthat/test_define_statistic_wrapper.R @@ -0,0 
+1,135 @@ + +context("define_statistic_wrapper") + +variance_wrapper <- define_variance_wrapper( + variance_function = function(y) abs(colSums(y)), + reference_id = ict_survey$firm_id, + reference_weight = ict_survey$w_calib, + default_id = "firm_id" +) + +test_that("standard and non-standard evaluation yields the same results", { + + expect_identical( + variance_wrapper(ict_survey, speed_quanti), + variance_wrapper(ict_survey, "speed_quanti") + ) + expect_identical( + variance_wrapper(ict_survey, mean(speed_quanti)), + variance_wrapper(ict_survey, mean("speed_quanti")) + ) + + assign(x = "var", value = "speed_quanti", envir = globalenv()) + expect_identical( + variance_wrapper(ict_survey, mean(speed_quanti)), + variance_wrapper(ict_survey, mean(var)) + ) + + assign(x = "var", value = c("speed_quanti", "speed_quali"), envir = globalenv()) + expect_identical( + variance_wrapper(ict_survey, mean(speed_quanti), mean(speed_quali)), + variance_wrapper(ict_survey, mean(var)) + ) + speed_quanti2 <- ict_survey$speed_quanti + expect_identical( + variance_wrapper(ict_survey, mean(speed_quanti))[ - 1], + variance_wrapper(ict_survey, mean(speed_quanti2))[ - 1] + ) + + assign(x = "num", value = "turnover", envir = globalenv()) + assign(x = "denom", value = "employees", envir = globalenv()) + expect_identical( + variance_wrapper(ict_survey, ratio(turnover, employees)), + variance_wrapper(ict_survey, ratio(num, denom)) + ) + assign(x = "num", value = c("turnover", "employees"), envir = globalenv()) + assign(x = "denom", value = c("employees", "turnover"), envir = globalenv()) + expect_identical( + variance_wrapper(ict_survey, ratio(turnover, employees), ratio(employees, turnover)), + variance_wrapper(ict_survey, ratio(num, denom)) + ) + + expect_identical( + variance_wrapper(ict_survey, total(speed_quanti), by = division), + variance_wrapper(ict_survey, total(speed_quanti, by = division)) + ) + expect_identical( + variance_wrapper(ict_survey, total(speed_quanti))[, -1], + 
variance_wrapper(ict_survey, total(speed_quanti, by = NULL), by = division)[, -1] + ) + expect_identical( + variance_wrapper(ict_survey, total(speed_quanti), by = division), + variance_wrapper(ict_survey, total(speed_quanti), by = "division") + ) + expect_identical( + variance_wrapper(ict_survey, total(speed_quanti, by = division)), + variance_wrapper(ict_survey, total(speed_quanti, by = "division")) + ) + + ict_survey$domain <- ict_survey$division == "59" + expect_identical( + variance_wrapper(ict_survey, total(speed_quanti), where = division == "59")[, -1], + variance_wrapper(ict_survey, total(speed_quanti), where = "domain")[, -1] + ) + rm(ict_survey) + +}) + +test_that("non-standard evaluation works when a character vector with same name exists outside of data", { + speed_quanti <- "blabla" + expect_error( + variance_wrapper(ict_survey, speed_quanti), + regexp = NA + ) +}) + + + + +# Define a new statistic_wrapper and include it in the variance_wrapper + +total2 <- define_statistic_wrapper( + statistic_function = function(y, w, w2){ + na <- is.na(y) + y[na] <- 0 + point <- sum(y * w) + list(point = point, lin = y, metadata = list(n = sum(!na))) + }, + arg_type = list(data = "y" , weight = c("w", "w2")), + arg_not_affected_by_domain = "w2" +) + +variance_wrapper <- define_variance_wrapper( + variance_function = function(y) abs(colSums(y)), + reference_id = ict_survey$firm_id, + reference_weight = ict_survey$w_calib, + default_id = "firm_id", + objects_to_include = "total2" +) +rm(total2) + + + +# Define a statistic_wrapper that produces two linearized variables +total3 <- define_statistic_wrapper( + statistic_function = function(y, w){ + na <- is.na(y) + y[na] <- 0 + point <- sum(y * w) + list(point = point, lin = list(y, y), metadata = list(n = sum(!na))) + }, + arg_type = list(data = "y" , weight = "w") +) +variance_wrapper <- define_variance_wrapper( + variance_function = function(y) abs(colSums(y)), + reference_id = ict_survey$firm_id, + reference_weight = 
ict_survey$w_calib, + default_id = "firm_id", + objects_to_include = "total3" +) +test_that("a statistical wrapper producing more than one linearized variables is handled correctly", { + expect_error( + variance_wrapper(ict_survey, total3(speed_quanti)), + regexp = "The number of estimated variances does not match the number of point estimates. A specific display function could be needed." + ) +}) \ No newline at end of file diff --git a/tests/testthat/test_define_variance_wrapper.R b/tests/testthat/test_define_variance_wrapper.R index 8be2e80..5ffee03 100644 --- a/tests/testthat/test_define_variance_wrapper.R +++ b/tests/testthat/test_define_variance_wrapper.R @@ -1,84 +1,285 @@ - - -context("define_variance_wrapper") - -test_that("variance_wrapper can be defined in globalenv()", { - expect_error( - variance_wrapper <<- define_variance_wrapper( - variance_function = function(y) abs(colSums(y)), - reference_id = ict_survey$firm_id, - default = list(id = "firm_id", weight = "w_calib", stat = "mean") - ), - regexp = NA) - expect_error(variance_wrapper(ict_survey, speed_quanti), regexp = NA) -}) - - -test_that("varwrap_test can be defined in another function", { - expect_error({ - preparation_function <- function(){ - a <- 1 - define_variance_wrapper( - variance_function = function(y) abs(colSums(y)) + 1, - reference_id = ict_survey$firm_id, - default = list(id = "firm_id", weight = "w_calib", stat = "mean"), - objects_to_include = "a" - ) - } - variance_wrapper2 <<- preparation_function() - }, regexp = NA) - expect_error(variance_wrapper2(ict_survey, speed_quanti), regexp = NA) - expect_equal( - variance_wrapper(ict_survey, speed_quanti)$variance + 1, - variance_wrapper2(ict_survey, speed_quanti)$variance - ) -}) - - -test_that("variance_wrapper works in common situations", { - expect_error(variance_wrapper(ict_survey, speed_quanti), regexp = NA) - expect_error(variance_wrapper(ict_survey, speed_quanti_NA), regexp = NA) - expect_error(variance_wrapper(ict_survey, 
speed_quali), regexp = NA) - expect_error(variance_wrapper(ict_survey, speed_quali_NA), regexp = NA) - expect_error(variance_wrapper(ict_survey, big_data), regexp = NA) - expect_error(variance_wrapper(ict_survey, big_data_NA), regexp = NA) - expect_error(variance_wrapper(ict_survey, speed_quanti, by = division), regexp = NA) - expect_error(variance_wrapper(ict_survey, big_data, by = speed_quali_NA), regexp = NA) - expect_error(variance_wrapper(ict_survey, big_data, NULL), regexp = NA) -}) - - -test_that("expected error messages do appear", { - expect_error(variance_wrapper(ict_survey), "No variable to estimate variance on.") -}) - - -test_that("point estimates do match by-hand estimators", { - expect_equal( - variance_wrapper(ict_survey, total(speed_quanti_NA))$est, - sum(ict_survey$speed_quanti_NA * ict_survey$w_calib, na.rm = TRUE) - ) - expect_equal( - variance_wrapper(ict_survey, mean(speed_quanti_NA))$est, - weighted.mean(ict_survey$speed_quanti_NA, ict_survey$w_calib, na.rm = TRUE) - ) - expect_equal( - variance_wrapper(ict_survey, mean(speed_quanti_NA), by = division)$est, - as.vector(sapply(split(ict_survey, ict_survey$division), function(x) - weighted.mean(x$speed_quanti_NA, x$w_calib, na.rm = TRUE) - )) - ) - expect_equal( - variance_wrapper(ict_survey, total(speed_quali))$est, - as.vector(tapply(ict_survey$w_calib, ict_survey$speed_quali, sum)) - ) -}) - -test_that("estimated values do match reference values", { - expect_equal(variance_wrapper(ict_survey, speed_quanti_NA)$est, 32.80242, tolerance = 1e-4) - expect_equal(variance_wrapper(ict_survey, speed_quanti_NA)$variance, 0.230266, tolerance = 1e-7) - expect_equal(variance_wrapper(ict_survey, speed_quali_NA)$est, c(0.02834094, 0.32141225, 0.39763176, 0.13490418, 0.11771088), tolerance = 1e-8) - expect_equal(variance_wrapper(ict_survey, speed_quali_NA)$variance, c(0.0004749795, 0.0008654360, 0.0012040079, 0.0013058744, 0.0012385490), tolerance = 1e-8) - expect_equal(variance_wrapper(ict_survey, 
big_data_NA, by = speed_quali_NA)$est, c(0.00000000, 0.01416254, 0.00000000, 0.28487253, 0.30151894), tolerance = 1e-8) - expect_equal(variance_wrapper(ict_survey, big_data_NA, by = speed_quali_NA)$variance, c(0.0000000000, 0.0003726230, 0.0000000000, 0.0003226717, 0.0015129483), tolerance = 1e-8) -}) + + +context("define_variance_wrapper") + +test_that("common error messages do work", { + expect_error( + define_variance_wrapper(), + regexp = "The following arguments are missing: variance_function, reference_id, reference_weight." + ) + expect_error( + define_variance_wrapper(variance_function = function(y) abs(colSums(y))), + regexp = "The following arguments are missing: reference_id, reference_weight." + ) + expect_error( + define_variance_wrapper( + variance_function = function(y) abs(colSums(y)), + reference_id = "firm_id" + ), + regexp = "The following arguments are missing: reference_weight." + ) +}) + +test_that("variance_wrapper can be defined in globalenv()", { + expect_error({ + variance_wrapper <- define_variance_wrapper( + variance_function = function(y) abs(colSums(y)), + reference_id = ict_survey$firm_id, + reference_weight = ict_survey$w_calib, + default_id = "firm_id" + ) + variance_wrapper(ict_survey, speed_quanti) + }, regexp = NA) +}) + +test_that("common error messages do work", { + variance_wrapper <- define_variance_wrapper( + variance_function = function(y) abs(col.tableSums(y)), + reference_id = ict_survey$firm_id, + reference_weight = ict_survey$w_calib, + default_id = "firm_id" + ) + expect_error(variance_wrapper(), regexp = "The following arguments are missing: data.") + variance_wrapper <- define_variance_wrapper( + variance_function = function(y) abs(colSums(y)), + reference_id = ict_survey$firm_id, + reference_weight = ict_survey$w_calib + ) + expect_error(variance_wrapper(), regexp = "The following arguments are missing: data, id.") + expect_error(variance_wrapper(ict_survey), regexp = "The following arguments are missing: id.") +}) 
+ +# TODO: Add tests about technical_data and technical_param + +test_that("variance_wrapper can be defined in another function", { + expect_error({ + preparation_function <- function(){ + a <- 1 + define_variance_wrapper( + variance_function = function(y, a) abs(colSums(y)) + a, + reference_id = ict_survey$firm_id, + reference_weight = ict_survey$w_calib, + technical_data = list(a = a), + default_id = "firm_id" + ) + } + variance_wrapper2 <- preparation_function() + variance_wrapper2(ict_survey, speed_quanti) + }, regexp = NA) + variance_wrapper <- define_variance_wrapper( + variance_function = function(y) abs(colSums(y)), + reference_id = ict_survey$firm_id, + reference_weight = ict_survey$w_calib, + default_id = "firm_id" + ) + expect_equal( + variance_wrapper(ict_survey, speed_quanti)$variance + 1, + variance_wrapper2(ict_survey, speed_quanti)$variance + ) +}) + +test_that("variance_wrapper may use a reference_id and a reference_weights specified as an unevaluated expression", { + expect_error({ + reference_id_list <- list(firm = ict_survey$firm_id) + reference_weight_list <- list(firm = ict_survey$w_calib) + variance_wrapper <- define_variance_wrapper( + variance_function = function(y, level) abs(colSums(y)), + reference_id = quote(reference_id_list[[level]]), + reference_weight = quote(reference_weight_list[[level]]), + default_id = "firm_id", + technical_param = list(level = "firm"), + objects_to_include = c("reference_id_list", "reference_weight_list") + ) + rm(reference_id_list, reference_weight_list) + variance_wrapper(ict_survey, speed_quanti) + }, regexp = NA) +}) + +test_that("variance_wrapper may use a default id specified as an unevaluated expression", { + expect_error({ + variance_wrapper <- define_variance_wrapper( + variance_function = function(y) abs(colSums(y)), + reference_id = ict_survey$firm_id, + reference_weight = ict_survey$w_calib, + default_id = quote(paste0(firm_id, "")) + ) + variance_wrapper(ict_survey, speed_quanti) + }, regexp = NA) 
+ expect_error({ + default_id_list <- list(firm = quote(paste0(firm_id, ""))) + variance_wrapper <- define_variance_wrapper( + variance_function = function(y, level) abs(colSums(y)), + reference_id = ict_survey$firm_id, + reference_weight = ict_survey$w_calib, + default_id = quote(default_id_list[[level]]), + technical_param = list(level = "firm"), + objects_to_include = "default_id_list" + ) + rm(default_id_list) + variance_wrapper(ict_survey, speed_quanti) + }, regexp = NA) +}) + + +test_that("a variance wrapper may be applied on the sample file raising a warning", { + expect_warning({ + variance_wrapper <- define_variance_wrapper( + variance_function = function(y) abs(colSums(y)), + reference_id = ict_survey$firm_id, + reference_weight = ict_survey$w_calib, + default_id = "firm_id" + ) + variance_wrapper(ict_sample, turnover) + }, regexp = "observations do not match any responding units of the survey.") +}) + + +test_that("variance estimation is not affected by the sorting of the survey file", { + variance_wrapper <- define_variance_wrapper( + variance_function = function(y) abs(colSums(y)), + reference_id = ict_survey$firm_id, + reference_weight = ict_survey$w_calib, + default_id = "firm_id" + ) + expect_warning( + variance_wrapper(ict_survey[NROW(ict_survey):1, ], turnover), + regexp = "The inputted id variable \\(id argument\\) appears not to match" + ) + expect_equal( + variance_wrapper(ict_survey, turnover), + suppressWarnings(variance_wrapper(ict_survey[NROW(ict_survey):1, ], turnover)) + ) +}) + +test_that("NULL is handled correctly", { + variance_wrapper <- define_variance_wrapper( + variance_function = function(y) abs(colSums(y)), + reference_id = ict_survey$firm_id, + reference_weight = ict_survey$w_calib, + default_id = "firm_id" + ) + expect_error({ + sapply(2016:2017, function(annee){ + variance_wrapper(ict_survey, if(annee == 2016) turnover else NULL) + }) + }, + regexp = "No variable to estimate variance on." 
+ ) +}) + + +test_that("variance_wrapper cannot work if the output of variance_function is not properly specified", { + expect_error({ + variance_wrapper <- define_variance_wrapper( + variance_function = function(y) list(blabla = abs(colSums(y))), + reference_id = ict_survey$firm_id, + reference_weight = ict_survey$w_calib, + default_id = "firm_id" + ) + variance_wrapper(ict_survey, speed_quanti) + }, regexp = "At least one output of variance_function should be named \"var\".") + expect_error({ + variance_wrapper <- define_variance_wrapper( + variance_function = function(y) matrix(abs(colSums(y)), nrow = 1), + reference_id = ict_survey$firm_id, + reference_weight = ict_survey$w_calib, + default_id = "firm_id" + ) + variance_wrapper(ict_survey, speed_quanti) + }, regexp = "The output of variance_function should be a vector.") + expect_error({ + variance_wrapper <- define_variance_wrapper( + variance_function = function(y) list(var = matrix(abs(colSums(y)), nrow = 1)), + reference_id = ict_survey$firm_id, + reference_weight = ict_survey$w_calib, + default_id = "firm_id" + ) + variance_wrapper(ict_survey, speed_quanti) + }, regexp = "The \"var\" output of variance_function should be a vector.") +}) + +test_that("variance_wrapper works when used on a data.table or a tibble", { + variance_wrapper <- define_variance_wrapper( + variance_function = function(y) abs(colSums(y)), + reference_id = ict_survey$firm_id, + reference_weight = ict_survey$w_calib, + default_id = "firm_id" + ) + expect_error(variance_wrapper(data.table::as.data.table(ict_survey), speed_quanti), regexp = NA) + expect_error(variance_wrapper(tibble::as.tibble(ict_survey), speed_quanti_NA), regexp = NA) +}) + + +test_that("variance_wrapper works in common situations", { + variance_wrapper <- define_variance_wrapper( + variance_function = function(y) abs(colSums(y)), + reference_id = ict_survey$firm_id, + reference_weight = ict_survey$w_calib, + default_id = "firm_id" + ) + 
expect_error(variance_wrapper(ict_survey, speed_quanti), regexp = NA) + expect_error(variance_wrapper(ict_survey, speed_quanti_NA), regexp = NA) + expect_error(variance_wrapper(ict_survey, speed_quali), regexp = NA) + expect_error(variance_wrapper(ict_survey, speed_quali_NA), regexp = NA) + expect_error(variance_wrapper(ict_survey, big_data), regexp = NA) + expect_error(variance_wrapper(ict_survey, big_data_NA), regexp = NA) + expect_error(variance_wrapper(ict_survey, speed_quanti, by = division), regexp = NA) + expect_error(variance_wrapper(ict_survey, big_data, by = speed_quali_NA), regexp = NA) + expect_error(variance_wrapper(ict_survey, big_data, NULL), regexp = NA) +}) + + +test_that("expected error messages do appear", { + variance_wrapper <- define_variance_wrapper( + variance_function = function(y) abs(colSums(y)), + reference_id = ict_survey$firm_id, + reference_weight = ict_survey$w_calib, + default_id = "firm_id" + ) + expect_error(variance_wrapper(ict_survey), "No variable to estimate variance on.") +}) + + +test_that("point estimates do match by-hand estimators", { + variance_wrapper <- define_variance_wrapper( + variance_function = function(y) abs(colSums(y)), + reference_id = ict_survey$firm_id, + reference_weight = ict_survey$w_calib, + default_id = "firm_id" + ) + variance_wrapper(ict_survey, total(speed_quali_NA))$est + expect_equal( + variance_wrapper(ict_survey, total(speed_quanti_NA))$est, + sum(ict_survey$speed_quanti_NA * ict_survey$w_calib, na.rm = TRUE) + ) + expect_equal( + variance_wrapper(ict_survey, mean(speed_quanti_NA))$est, + weighted.mean(ict_survey$speed_quanti_NA, ict_survey$w_calib, na.rm = TRUE) + ) + expect_equal( + variance_wrapper(ict_survey, mean(speed_quanti_NA), by = division)$est, + as.vector(sapply(split(ict_survey, ict_survey$division), function(x) + weighted.mean(x$speed_quanti_NA, x$w_calib, na.rm = TRUE) + )) + ) + expect_equal( + variance_wrapper(ict_survey, total(speed_quali))$est, + 
as.vector(tapply(ict_survey$w_calib, ict_survey$speed_quali, sum)) + ) +}) + +test_that("estimated values do match reference values", { + variance_wrapper <- define_variance_wrapper( + variance_function = function(y) abs(colSums(y)), + reference_id = ict_survey$firm_id, + reference_weight = ict_survey$w_calib, + default_id = "firm_id" + ) + expect_equal(variance_wrapper(ict_survey, speed_quanti_NA)$est, 178409.7, tolerance = 1e-0) + expect_equal(variance_wrapper(ict_survey, speed_quanti_NA)$variance, 15817, tolerance = 1e-0) + expect_equal(variance_wrapper(ict_survey, speed_quali_NA)$est, c(154, 1748, 2163, 734, 640), tolerance = 1e0) + expect_equal(variance_wrapper(ict_survey, speed_quali_NA)$variance, c(10, 138, 170, 67, 59), tolerance = 1e0) + expect_equal(variance_wrapper(ict_survey, big_data_NA, by = speed_quali_NA)$est, c(0, 18.5, 0, 164.8, 146.1), tolerance = 1e0) + expect_equal(variance_wrapper(ict_survey, big_data_NA, by = speed_quali_NA)$variance, c(0, 1, 0, 15, 14), tolerance = 1e-0) +}) + diff --git a/tests/testthat/test_qvar.R b/tests/testthat/test_qvar.R new file mode 100644 index 0000000..1285e0a --- /dev/null +++ b/tests/testthat/test_qvar.R @@ -0,0 +1,653 @@ + +context("qvar") + +technical_data_ict <- list( + samp = list( + id = ict_sample$firm_id, + exclude = rep(FALSE, NROW(ict_sample)), + precalc = suppressWarnings(var_srs( + y = NULL, pik = 1 / ict_sample$w_sample, strata = ict_sample$strata + )) + ), + nrc = list( + id = ict_sample$firm_id[ict_sample$resp & ict_sample$nrc], + response_prob = ict_sample$response_prob_est[ict_sample$resp & ict_sample$nrc], + sampling_weight = ict_sample$w_sample[ict_sample$resp & ict_sample$nrc] + ), + calib = list( + id = ict_sample$firm_id[ict_sample$calib], + precalc = res_cal(y = NULL, + x = as.matrix(ict_sample[ + ict_sample$calib, + c(paste0("N_", 58:63), paste0("turnover_", 58:63)) + ]), + w = ict_sample$w_calib[ict_sample$calib], + id = ict_sample$firm_id + ) + ) +) + + +y <- 
matrix(ict_survey$speed_quanti, dimnames = list(ict_survey$firm_id)) + + +test_that("qvar_variance_function works", { + expect_error( + with(technical_data_ict, qvar_variance_function(y, samp = samp, nrc = nrc, calib = calib)), + regexp = NA + ) +}) + +test_that("a variance wrapper can be manually defined on top of qvar_variance_function", { + expect_error({ + variance_wrapper_ict <- define_variance_wrapper( + variance_function = qvar_variance_function, + reference_id = ict_survey$firm_id, + reference_weight = ict_survey$w_calib, + technical_data = technical_data_ict, + default_id = "firm_id" + ) + variance_wrapper_ict(ict_survey, speed_quanti) + }, + regexp = NA + ) +}) + +test_that("Step 1.1: Arguments consistency", { + expect_error( + qvar(), + regexp = "The following arguments are missing: data, id, dissemination_dummy, dissemination_weight, sampling_weight." + ) + expect_error( + qvar( + data = blabla, id = "blabla", dissemination_dummy = "blabla", dissemination_weight = "blabla", + sampling_weight = "blabla", + nrc_weight = "blabla" + ), regexp = "weights after non-response" + ) + expect_error( + qvar( + data = ict_sample, id = "blabla", dissemination_dummy = "blabla", dissemination_weight = "blabla", + sampling_weight = "blabla", + calibration_dummy = "blabla" + ), regexp = "a variable indicating the units taking part" + ) + expect_error( + qvar( + data = ict_sample, id = "blabla", dissemination_dummy = "blabla", dissemination_weight = "blabla", + sampling_weight = "blabla", + calibration_weight = "blabla" + ), regexp = "calibrated weights are provided" + ) + expect_error( + qvar( + data = ict_sample, id = "blabla", dissemination_dummy = "blabla", dissemination_weight = "blabla", + sampling_weight = "blabla", + nrc_dummy = "blabla" + ), regexp = "a variable indicating responding units and/or a variable" + ) +}) + +test_that("Step 1.2: Welcome message", { + skip("skip") + welcome <- "Variance wrapper definition using the dataset : blabla\n\nThe following 
features are taken into account:" + expect_message( + qvar( + data = blabla, + sampling_weight = "blabla" + ), regexp = paste0(welcome, "\n - simple random sampling WITHOUT stratification") + ) + expect_message( + qvar( + data = blabla, + sampling_weight = "blabla", strata = "blabla" + ), regexp = tmp <- paste0(welcome, "\n - stratified simple random sampling") + ) + expect_message( + qvar( + data = blabla, + sampling_weight = "blabla", strata = "blabla", + scope_dummy = "blabla" + ), regexp = tmp <- paste0(tmp, "\n - out-of-scope units") + ) + expect_message( + qvar( + data = blabla, + sampling_weight = "blabla", strata = "blabla", + scope_dummy = "blabla", + nrc_weight = "blabla", resp = "blabla" + ), regexp = tmp <- paste0(tmp, "\n - non-response correction through reweighting") + ) + expect_message( + qvar( + data = blabla, + sampling_weight = "blabla", strata = "blabla", + scope_dummy = "blabla", + nrc_weight = "blabla", resp = "blabla", + calibration_weight = "blabla", calibration_var = "blabla" + ), regexp = paste0(tmp, "\n - calibration on margins") + ) +}) + +test_that("Step 2: Control that arguments do exist and retrive their value", { + expect_error( + qvar( + data = blabla, + id = "blabla", dissemination_dummy = "blabla", dissemination_weight = "blabla", + sampling_weight = "blabla", + define = TRUE + ), + regexp = "obj" + ) + expect_error( + qvar( + data = matrix(1:10), + id = "blabla", dissemination_dummy = "blabla", dissemination_weight = "blabla", + sampling_weight = "blabla", + define = TRUE + ), + regexp = "data argument must refer to a data.frame" + ) + expect_error( + qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_sample", + sampling_weight = "w_sample", strata = "strata", + define = TRUE + ), + regexp = NA + ) + expect_error( + qvar( + data = ict_sample, + id = "blabla", dissemination_dummy = "dissemination", dissemination_weight = "w_calib", + sampling_weight = c("blabla", 
"bloblo"), + define = TRUE + ), + regexp = "The following arguments do not refer to a variable name" + ) + expect_error( + qvar( + data = ict_sample, + id = "blabla", dissemination_dummy = "dissemination", dissemination_weight = "w_calib", + sampling_weight = "blabla", + calibration_weight = "blabla", calibration_var = 2, + define = TRUE + ), + regexp = "The following arguments do not refer to a vector of variable names" + ) + expect_error( + qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_nrc", + sampling_weight = "w_sample", strata = "strata", + nrc_weight = "w_nrc", response_dummy = "blabla", + define = TRUE + ), + regexp = "Some variables do not exist in ict_sample: \n - response_dummy argument: blabla" + ) +}) + +test_that("Step 3: Control arguments value", { + + # id + ict_sample$firm_id[1] <- NA + expect_error({ + qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_sample", + sampling_weight = "w_sample", + define = TRUE + ) + }, regexp = "contain any missing \\(NA\\) values.") + rm(ict_sample) + ict_sample$firm_id[1] <- ict_sample$firm_id[2] + expect_error({ + qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_sample", + sampling_weight = "w_sample", + define = TRUE + ) + }, regexp = "contain any duplicated values.") + rm(ict_sample) + + # dissemination_dummy + expect_error({ + qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "division", dissemination_weight = "w_sample", + sampling_weight = "w_sample", + define = TRUE + ) + }, regexp = "should be of type logical or numeric.") + expect_error({ + suppressWarnings(qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_sample", + sampling_weight = "w_sample", + define = TRUE + )) + }, regexp = NA) + ict_sample$dissemination[1] <- NA + expect_error({ + qvar( + 
data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_sample", + sampling_weight = "w_sample", + define = TRUE + ) + }, regexp = "should not contain any missing \\(NA\\) values.") + rm(ict_sample) + + # dissemination_weight + ict_sample$w_sample <- as.character(ict_sample$w_calib) + expect_error({ + qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_sample", + sampling_weight = "w_sample", + define = TRUE + ) + }, regexp = "should be numeric.") + rm(ict_sample) + ict_sample$w_sample[1] <- NA + expect_error({ + qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_sample", + sampling_weight = "w_sample", + define = TRUE + ) + }, regexp = "contain any missing \\(NA\\) values.") + rm(ict_sample) + + + # sampling_weight + ict_sample$w_sample <- as.character(ict_sample$w_sample) + expect_error({ + qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_sample", + sampling_weight = "w_sample", + define = TRUE + ) + }, regexp = "should be numeric.") + rm(ict_sample) + ict_sample$w_sample[1] <- NA + expect_error({ + qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_sample", + sampling_weight = "w_sample", + define = TRUE + ) + }, regexp = "contain any missing \\(NA\\) values.") + rm(ict_sample) + + # strata + ict_sample$strata <- suppressWarnings(as.numeric(ict_sample$strata)) + expect_error({ + qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_sample", + sampling_weight = "w_sample", strata = "strata", + define = TRUE + ) + }, regexp = " should be of type factor or character.") + rm(ict_sample) + expect_error({ + suppressWarnings(qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = 
"w_sample", + sampling_weight = "w_sample", strata = "strata", + define = TRUE + )) + }, regexp = NA) + ict_sample$strata[1] <- NA + expect_error({ + suppressWarnings(qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_sample", + sampling_weight = "w_sample", strata = "strata", + define = TRUE + )) + }, regexp = "should not contain any missing \\(NA\\) values.") + rm(ict_sample) + + # scope_dummy + expect_error({ + qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_sample", + sampling_weight = "w_sample", strata = "strata", + scope_dummy = "division", + define = TRUE + ) + }, regexp = "should be of type logical or numeric.") + expect_error({ + suppressWarnings(qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_sample", + sampling_weight = "w_sample", strata = "strata", + scope_dummy = "scope", + define = TRUE + )) + }, regexp = NA) + ict_sample$scope[1] <- NA + expect_error({ + qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_sample", + sampling_weight = "w_sample", strata = "strata", + scope_dummy = "scope", + define = TRUE + ) + }, regexp = "should not contain any missing \\(NA\\) values.") + rm(ict_sample) + ict_sample$scope[match(TRUE, ict_sample$resp)] <- FALSE + expect_error({ + variance_wrapper <- qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_nrc", + sampling_weight = "w_sample", strata = "strata", + scope_dummy = "scope", + nrc_weight = "w_nrc", response_dummy = "resp", nrc_dummy = "nrc", + define = TRUE + ) + }, regexp = "The following units are out-of-scope") + rm(ict_sample) + + # nrc_dummy + expect_error({ + qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_nrc", + sampling_weight = "w_sample", 
strata = "strata", + scope_dummy = "scope", + nrc_weight = "w_nrc", response_dummy = "resp", nrc_dummy = "division", + define = TRUE + ) + }, regexp = "should be of type logical or numeric.") + expect_error({ + qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_nrc", + sampling_weight = "w_sample", strata = "strata", + scope_dummy = "scope", + nrc_weight = "w_nrc", response_dummy = "resp", nrc_dummy = "nrc", + define = TRUE + ) + }, regexp = NA) + ict_sample$nrc[1] <- NA + expect_error({ + qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_nrc", + sampling_weight = "w_sample", strata = "strata", + scope_dummy = "scope", + nrc_weight = "w_nrc", response_dummy = "resp", nrc_dummy = "nrc", + define = TRUE + ) + }, regexp = "should not contain any missing \\(NA\\) values.") + rm(ict_sample) + + # response_dummy + expect_error({ + qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_nrc", + sampling_weight = "w_sample", strata = "strata", + scope_dummy = "scope", + nrc_weight = "w_nrc", response_dummy = "division", nrc_dummy = "nrc", + define = TRUE + ) + }, regexp = "should be of type logical or numeric.") + expect_error({ + suppressWarnings(qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_nrc", + sampling_weight = "w_sample", strata = "strata", + scope_dummy = "scope", + nrc_weight = "w_nrc", response_dummy = "resp", nrc_dummy = "nrc", + define = TRUE + )) + }, regexp = NA) + ict_sample$resp[1] <- NA + expect_error({ + qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_nrc", + sampling_weight = "w_sample", strata = "strata", + scope_dummy = "scope", + nrc_weight = "w_nrc", response_dummy = "resp", nrc_dummy = "nrc", + define = TRUE + ) + }, regexp = "should not contain any 
missing \\(NA\\) values.") + rm(ict_sample) + + # nrc_weight + ict_sample$w_nrc <- as.character(ict_sample$w_nrc) + expect_error({ + qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_nrc", + sampling_weight = "w_sample", strata = "strata", + scope_dummy = "scope", + nrc_weight = "w_nrc", response_dummy = "resp", nrc_dummy = "nrc", + define = TRUE + ) + }, regexp = "should be numeric.") + rm(ict_sample) + ict_sample$w_nrc2 <- ict_sample$w_nrc + ict_sample$w_nrc2[match(TRUE, ict_sample$resp)] <- NA + expect_error({ + qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_nrc", + sampling_weight = "w_sample", strata = "strata", + scope_dummy = "scope", + nrc_weight = "w_nrc2", response_dummy = "resp", nrc_dummy = "nrc", + define = TRUE + ) + }, regexp = "should not contain any missing \\(NA\\) values for responding units.") + rm(ict_sample) + ict_sample$w_nrc[match(FALSE, ict_sample$resp)] <- NA + expect_error({ + suppressWarnings(qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_nrc", + sampling_weight = "w_sample", strata = "strata", + scope_dummy = "scope", + nrc_weight = "w_nrc", response_dummy = "resp", nrc_dummy = "nrc", + define = TRUE + )) + }, regexp = NA) + rm(ict_sample) + + # calibration_dummy + expect_error({ + qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_calib", + sampling_weight = "w_sample", strata = "strata", + scope_dummy = "scope", + nrc_weight = "w_nrc", response_dummy = "resp", nrc_dummy = "nrc", + calibration_weight = "w_calib", calibration_dummy = "division", calibration_var = c("N_58", "N_59"), + define = TRUE + ) + }, regexp = "should be of type logical or numeric.") + ict_sample$calib <- NA + expect_error({ + qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", 
dissemination_weight = "w_calib", + sampling_weight = "w_sample", strata = "strata", + scope_dummy = "scope", + nrc_weight = "w_nrc", response_dummy = "resp", nrc_dummy = "nrc", + calibration_weight = "w_calib", calibration_dummy = "calib", calibration_var = c("N_58", "N_59"), + define = TRUE + ) + }, regexp = "should not contain any missing \\(NA\\) values.") + rm(ict_sample) + + # calibration_weight + ict_sample$w_calib <- as.character(ict_sample$w_calib) + expect_error({ + qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_calib", + sampling_weight = "w_sample", strata = "strata", + scope_dummy = "scope", + nrc_weight = "w_nrc", response_dummy = "resp", nrc_dummy = "nrc", + calibration_weight = "w_calib", calibration_dummy = "calib", calibration_var = c("N_58", "N_59"), + define = TRUE + ) + }, regexp = "should be numeric.") + rm(ict_sample) + + # calibration_var + ict_sample$complex <- complex(real = 1:NROW(ict_sample), imaginary = 1:NROW(ict_sample)) + expect_error({ + qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_calib", + sampling_weight = "w_sample", strata = "strata", + scope_dummy = "scope", + nrc_weight = "w_nrc", response_dummy = "resp", nrc_dummy = "nrc", + calibration_weight = "w_calib", calibration_var = "complex", + define = TRUE + ) + }, regexp = "The following calibration variables are neither quantitative") + rm(ict_sample) + ict_sample[match(TRUE, ict_sample$calib), c("N_58", "N_59")] <- NA + expect_error({ + qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_calib", + sampling_weight = "w_sample", strata = "strata", + scope_dummy = "scope", + nrc_weight = "w_nrc", response_dummy = "resp", nrc_dummy = "nrc", + calibration_weight = "w_calib", calibration_var = c("N_58", "N_59"), + define = TRUE + ) + }, regexp = "contain missing \\(NA\\) values for units used in the 
calibration process:") + rm(ict_sample) + expect_error({ + variance_wrapper <- suppressWarnings(qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_calib", + sampling_weight = "w_sample", strata = "strata", + scope_dummy = "scope", + nrc_weight = "w_nrc", response_dummy = "resp", nrc_dummy = "nrc", + calibration_weight = "w_calib", calibration_var = "division", + define = TRUE + )) + variance_wrapper(ict_survey, turnover) + }, regexp = NA) + +}) + +test_that("Step 4: Define methodological quantities", { + + expect_error({ + variance_wrapper <- qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_nrc", + sampling_weight = "w_sample", strata = "strata", + scope_dummy = "scope", + nrc_weight = "w_nrc", response_dummy = "resp", nrc_dummy = "nrc", + calibration_weight = "w_calib", calibration_var = c("N_58", "N_59"), + define = TRUE + ) + + }, regexp = "The following units have a disseminated weight") + + ict_sample$dissemination[match(TRUE, ict_sample$resp)] <- FALSE + ict_sample$scope[match(TRUE, ict_sample$resp)] <- FALSE + expect_error({ + variance_wrapper <- qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_calib", + sampling_weight = "w_sample", strata = "strata", + scope_dummy = "scope", + nrc_weight = "w_nrc", response_dummy = "resp", nrc_dummy = "nrc", + define = TRUE + ) + + }, regexp = "the following units are classified both as out-of-scope units") + rm(ict_sample) + + ict_sample$strata[1:26] <- letters + expect_warning({ + variance_wrapper <- qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_calib", + sampling_weight = "w_sample", strata = "strata", + scope_dummy = "scope", + nrc_weight = "w_nrc", response_dummy = "resp", nrc_dummy = "nrc", + calibration_weight = "w_calib", calibration_var = c("N_58", "N_59"), + define = TRUE 
+ ) + variance_wrapper(ict_survey, speed_quanti) + }, regexp = "The following strata contain less than two sampled units.") + rm(ict_sample) + + ict_sample$w_sample[1] <- ict_sample$w_sample[1] / 2 + expect_warning({ + variance_wrapper <- qvar( + data = ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_calib", + sampling_weight = "w_sample", strata = "strata", + scope_dummy = "scope", + nrc_weight = "w_nrc", response_dummy = "resp", nrc_dummy = "nrc", + calibration_weight = "w_calib", calibration_var = c("N_58", "N_59"), + define = TRUE + ) + variance_wrapper(ict_survey, speed_quanti) + }, regexp = "The following strata contain units whose sampling weights") + rm(ict_sample) + +}) + +# TODO: Add more tests with out-of-scope units + + +test_that("qvar works", { + expect_error( + qvar( + ict_sample, mean(turnover), + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_calib", + sampling_weight = "w_sample", strata = "strata", + scope_dummy = "scope", + nrc_weight = "w_nrc", response_dummy = "resp", nrc_dummy = "nrc", + calibration_weight = "w_calib", calibration_var = c("division", "turnover_58", "turnover_59"), + define = TRUE + ), + regexp = NA + ) + expect_error({ + qvar_wrapper <- suppressWarnings(qvar( + ict_sample, + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_calib", + sampling_weight = "w_sample", strata = "strata", + scope_dummy = "scope", + nrc_weight = "w_nrc", response_dummy = "resp", nrc_dummy = "nrc", + calibration_weight = "w_calib", calibration_var = c("division", "turnover_58", "turnover_59"), + define = TRUE + )) + qvar_wrapper(ict_survey, mean(turnover)) + }, regexp = NA) + expect_identical( + suppressWarnings(qvar( + ict_sample, mean(turnover), + id = "firm_id", dissemination_dummy = "dissemination", dissemination_weight = "w_calib", + sampling_weight = "w_sample", strata = "strata", + scope_dummy = "scope", + nrc_weight = "w_nrc", 
response_dummy = "resp", nrc_dummy = "nrc", + calibration_weight = "w_calib", calibration_var = c("division", "turnover_58", "turnover_59") + )), + qvar_wrapper(ict_survey, mean(turnover)) + ) +}) + diff --git a/tests/testthat/test_linearization_wrapper_standard.R b/tests/testthat/test_standard_statistic_wrapper.R similarity index 86% rename from tests/testthat/test_linearization_wrapper_standard.R rename to tests/testthat/test_standard_statistic_wrapper.R index e1bc962..a3a1531 100644 --- a/tests/testthat/test_linearization_wrapper_standard.R +++ b/tests/testthat/test_standard_statistic_wrapper.R @@ -1,19 +1,20 @@ - -context("linearization_wrapper") - -variance_wrapper <- define_variance_wrapper( - variance_function = function(y) abs(colSums(y)), - reference_id = ict_survey$firm_id, - default = list(id = "firm_id", weight = "w_calib", stat = "mean") -) - -test_that("standard linearization wrappers work", { - expect_error(variance_wrapper(ict_survey, mean(speed_quanti)), regexp = NA) - expect_error(variance_wrapper(ict_survey, mean(speed_quanti, by = division)), regexp = NA) - expect_error(variance_wrapper(ict_survey, ratio(speed_quanti, turnover)), regexp = NA) - expect_error(variance_wrapper(ict_survey, ratio(speed_quanti, turnover, by = division)), regexp = NA) - expect_error(variance_wrapper(ict_survey, diff_of_ratio(speed_quanti, turnover, speed_quanti, employees)), regexp = NA) - expect_error(variance_wrapper(ict_survey, diff_of_ratio(speed_quanti, turnover, speed_quanti, employees, by = division)), regexp = NA) - expect_error(variance_wrapper(ict_survey, ratio_of_ratio(speed_quanti, turnover, speed_quanti, employees)), regexp = NA) - expect_error(variance_wrapper(ict_survey, ratio_of_ratio(speed_quanti, turnover, speed_quanti, employees, by = division)), regexp = NA) + +context("standard_statistic_wrapper") + +variance_wrapper <- define_variance_wrapper( + variance_function = function(y) abs(colSums(y)), + reference_id = ict_survey$firm_id, + 
reference_weight = ict_survey$w_calib, + default_id = "firm_id" +) + +test_that("standard linearization wrappers work", { + expect_error(variance_wrapper(ict_survey, mean(speed_quanti)), regexp = NA) + expect_error(variance_wrapper(ict_survey, mean(speed_quanti, by = division)), regexp = NA) + expect_error(variance_wrapper(ict_survey, ratio(speed_quanti, turnover)), regexp = NA) + expect_error(variance_wrapper(ict_survey, ratio(speed_quanti, turnover, by = division)), regexp = NA) + expect_error(variance_wrapper(ict_survey, diff_of_ratio(speed_quanti, turnover, speed_quanti, employees)), regexp = NA) + expect_error(variance_wrapper(ict_survey, diff_of_ratio(speed_quanti, turnover, speed_quanti, employees, by = division)), regexp = NA) + expect_error(variance_wrapper(ict_survey, ratio_of_ratio(speed_quanti, turnover, speed_quanti, employees)), regexp = NA) + expect_error(variance_wrapper(ict_survey, ratio_of_ratio(speed_quanti, turnover, speed_quanti, employees, by = division)), regexp = NA) }) \ No newline at end of file diff --git a/tests/testthat/test_utils.R b/tests/testthat/test_utils.R index 6fcf9d3..c3aab6a 100644 --- a/tests/testthat/test_utils.R +++ b/tests/testthat/test_utils.R @@ -1,106 +1,180 @@ - - -context("utils") - - -# sumby() - -set.seed(1) -n <- 10 -v <- as.numeric(1:n) -v[6] <- NA -V <- sparseVector(v, seq_along(v), n) -m <- matrix(c(v, v), ncol = 2) -colnames(m) <- c("variable1", "variable2") -M <- Matrix(m) -df <- as.data.frame(m) -by <- sample(letters[1:3], 10, replace = TRUE) -w <- rep(2, n) -by_NA <- by -by_NA[c(2, 8)] <- NA - -test_that("sumby() works", { - - # Standard behaviour - r <- sapply(split(v, by), sum, na.rm = TRUE) - expect_equal(sumby(v, by), r) - expect_equal(sumby(V, by), r) - expect_equal(sumby(m, by), matrix(c(r, r), ncol = 2, dimnames = list(names(r), colnames(m)))) - expect_equal(sumby(M, by), matrix(c(r, r), ncol = 2, dimnames = list(names(r), colnames(m)))) - expect_equal(sumby(M, by, keep_sparse = TRUE), Matrix(c(r, r), 
ncol = 2, dimnames = list(names(r), colnames(m)))) - expect_equal(sumby(df, by), data.frame(variable1 = r, variable2 = r, row.names = names(r))) - expect_equal(sumby(v, by, w), r * 2) - - # Standard behaviour without removing the NA values - r <- sapply(split(v, by), sum) - expect_equal(sumby(v, by, na_rm = FALSE), r) - expect_equal(sumby(V, by, na_rm = FALSE), r) - expect_equal(sumby(m, by, na_rm = FALSE), matrix(c(r, r), ncol = 2, dimnames = list(names(r), colnames(m)))) - expect_equal(sumby(M, by, na_rm = FALSE), matrix(c(r, r), ncol = 2, dimnames = list(names(r), colnames(m)))) - expect_equal(sumby(M, by, na_rm = FALSE, keep_sparse = TRUE), Matrix(c(r, r), ncol = 2, dimnames = list(names(r), colnames(m)))) - expect_equal(sumby(df, by, na_rm = FALSE), data.frame(variable1 = r, variable2 = r, row.names = names(r))) - expect_equal(sumby(v, by, w, na_rm = FALSE), r * 2) - - # Standard behaviour with NA value in the by variable - r <- sapply(split(v, by_NA), sum, na.rm = TRUE) - expect_equal(sumby(v, by_NA), r) - expect_equal(sumby(V, by_NA), r) - expect_equal(sumby(m, by_NA), matrix(c(r, r), ncol = 2, dimnames = list(names(r), colnames(m)))) - expect_equal(sumby(M, by_NA), matrix(c(r, r), ncol = 2, dimnames = list(names(r), colnames(m)))) - expect_equal(sumby(M, by_NA, keep_sparse = TRUE), Matrix(c(r, r), ncol = 2, dimnames = list(names(r), colnames(m)))) - expect_equal(sumby(df, by_NA), data.frame(variable1 = r, variable2 = r, row.names = names(r))) - expect_equal(sumby(v, by_NA, w), r * 2) - - # Error messages - expect_error(sumby(letters[seq_along(by)], by), regexp = "y is not") - -}) - - -# add0() - -set.seed(1) -n <- 10 -p <- 2 -m <- matrix(1:(n*p), ncol = p, dimnames = list(sample(letters, n))) -m[c(3, 8, 12)] <- NA -M <- as(m, "TsparseMatrix") -rownames(M) <- rownames(m) -df <- as.data.frame(m) - -test_that("add0() works", { - - # Standard behaviour - expect_error(add0(m, letters), regexp = NA) - expect_error(add0(M, letters), regexp = NA) - 
expect_error(add0(df, letters), regexp = NA) - expect_error(add0(m, as.factor(letters)), regexp = NA) - - # Error and warning messages - expect_error(add0(m[, 1], letters), regexp = "y must be") - expect_error(add0(unname(m), letters), regexp = "y must have") - expect_error(add0(matrix(letters[m], ncol = 2, dimnames = list(rownames(m))), letters), regexp = "y is not numeric") - tmp <- m - rownames(tmp)[1:3] <- toupper(rownames(tmp)[1:3]) - expect_warning(add0(tmp, letters), regexp = "The name of some rows") - -}) - - -# assign_all() - -a <- 1 -fun <- function(){} -clos <- (function(){ - b <- 2 - function(){} -})() -e <- new.env(parent = globalenv()) -assign_all(c("a", "fun", "clos"), to = e) - -test_that("assign_all() works", { - expect_true( - all(sapply(e, function(x) if(is.function(x)) identical(parent.env(environment(x)), e) else TRUE)) - ) - expect_equal(ls(environment(e$clos)), "b") -}) + + +context("utils") + + +# coerce_to_TsparseMatrix +test_that("coerce_to_TsparseMatrix works as expected", { + y <- setNames(1:10, letters[1:10]) + M <- Matrix::sparseMatrix( + x = y, i = seq_along(y), j = rep(1, length(y)), + giveCsparse = FALSE, dimnames = list(names(y), NULL) + ) + expect_identical(coerce_to_TsparseMatrix(y), M) + m <- matrix(1:10, ncol = 2, dimnames = list(letters[1:5])) + M <- as(m, "TsparseMatrix") + dimnames(M) <- dimnames(m) + expect_identical(coerce_to_TsparseMatrix(m), M) + expect_identical(coerce_to_TsparseMatrix(M), M) +}) + + + +# make_block() +test_that("make_block works as expected", { + pik <- setNames(1 / ict_sample$w_sample, ict_sample$firm_id) + strata <- ict_sample$division + expect_error(make_block(pik, strata), regexp = NA) + expect_equal(rownames(make_block(pik, strata)), names(pik)) + expect_identical( + make_block(pik, strata), + make_block(matrix(pik, ncol = 1, dimnames = list(names(pik), NULL)), strata) + ) + pik <- matrix(pik, ncol = 1) + colnames(pik) <- "pik" + expect_error(validObject(make_block(pik, strata)), regexp = NA) +}) 
+ + +# is_block_matrix +test_that("detect_block works as expected", { + pik <- 1 / ict_sample$w_sample + by <- ict_sample$division + expect_null(detect_block(pik, by)) + expect_null(detect_block(matrix(pik, ncol = 1), by)) + y <- make_block(pik, by) + expect_identical(detect_block(y, by), y) + o <- sample.int(NROW(y)) + y <- y[o, c(NCOL(y):1, 1)] + by <- by[o] + attr(y, "rowby") <- by + attr(y, "colby") <- c("63", "62", "61", "60", "59", "58", "58") + expect_identical(detect_block(y, by), y) +}) + + +# sum_by() + +set.seed(1) +n <- 10 +v <- as.numeric(1:n) +v[6] <- NA +V <- sparseVector(v, seq_along(v), n) +m <- matrix(c(v, v), ncol = 2) +colnames(m) <- c("variable1", "variable2") +M <- Matrix(m) +df <- as.data.frame(m) +by <- sample(letters[1:3], 10, replace = TRUE) +w <- rep(2, n) +by_NA <- by +by_NA[c(2, 8)] <- NA + +test_that("sum_by() works", { + + # Standard behaviour + r <- sapply(split(v, by), sum, na.rm = TRUE) + expect_equal(sum_by(v, by), r) + expect_equal(sum_by(V, by), r) + expect_equal(sum_by(m, by), matrix(c(r, r), ncol = 2, dimnames = list(names(r), colnames(m)))) + expect_equal(sum_by(M, by), matrix(c(r, r), ncol = 2, dimnames = list(names(r), colnames(m)))) + expect_equal(sum_by(M, by, keep_sparse = TRUE), Matrix(c(r, r), ncol = 2, dimnames = list(names(r), colnames(m)))) + expect_equal(sum_by(df, by), data.frame(variable1 = r, variable2 = r, row.names = names(r))) + expect_equal(sum_by(v, by, w), r * 2) + + # Standard behaviour without removing the NA values + r <- sapply(split(v, by), sum) + expect_equal(sum_by(v, by, na_rm = FALSE), r) + expect_equal(sum_by(V, by, na_rm = FALSE), r) + expect_equal(sum_by(m, by, na_rm = FALSE), matrix(c(r, r), ncol = 2, dimnames = list(names(r), colnames(m)))) + expect_equal(sum_by(M, by, na_rm = FALSE), matrix(c(r, r), ncol = 2, dimnames = list(names(r), colnames(m)))) + expect_equal(sum_by(M, by, na_rm = FALSE, keep_sparse = TRUE), Matrix(c(r, r), ncol = 2, dimnames = list(names(r), colnames(m)))) + 
expect_equal(sum_by(df, by, na_rm = FALSE), data.frame(variable1 = r, variable2 = r, row.names = names(r))) + expect_equal(sum_by(v, by, w, na_rm = FALSE), r * 2) + + # Standard behaviour with NA value in the by variable + r <- sapply(split(v, by_NA), sum, na.rm = TRUE) + expect_equal(sum_by(v, by_NA), r) + expect_equal(sum_by(V, by_NA), r) + expect_equal(sum_by(m, by_NA), matrix(c(r, r), ncol = 2, dimnames = list(names(r), colnames(m)))) + expect_equal(sum_by(M, by_NA), matrix(c(r, r), ncol = 2, dimnames = list(names(r), colnames(m)))) + expect_equal(sum_by(M, by_NA, keep_sparse = TRUE), Matrix(c(r, r), ncol = 2, dimnames = list(names(r), colnames(m)))) + expect_equal(sum_by(df, by_NA), data.frame(variable1 = r, variable2 = r, row.names = names(r))) + expect_equal(sum_by(v, by_NA, w), r * 2) + + # Error messages + expect_error(sum_by(letters[seq_along(by)], by), regexp = "y is not") + +}) + + +# add_zero() + +set.seed(1) +n <- 10 +p <- 2 +m <- matrix(1:(n*p), ncol = p, dimnames = list(sample(letters, n))) +m[c(3, 8, 12)] <- NA +M <- as(m, "TsparseMatrix") +rownames(M) <- rownames(m) +df <- as.data.frame(m) + +test_that("add_zero() works", { + + # Standard behaviour + expect_error(add_zero(m, letters), regexp = NA) + expect_error(add_zero(M, letters), regexp = NA) + expect_error(add_zero(df, letters), regexp = NA) + expect_error(add_zero(m, as.factor(letters)), regexp = NA) + + # Error and warning messages + expect_error(add_zero(m[, 1], letters), regexp = "y must be") + expect_error(add_zero(unname(m), letters), regexp = "y must have") + expect_error(add_zero(matrix(letters[m], ncol = 2, dimnames = list(rownames(m))), letters), regexp = "y is not numeric") + tmp <- m + rownames(tmp)[1:3] <- toupper(rownames(tmp)[1:3]) + expect_warning(add_zero(tmp, letters), regexp = "The name of some rows") + +}) + + +# assign_all() + +a <- 1 +fun <- function(){} +clos <- (function(){ + b <- 2 + function(){} +})() +e <- new.env(parent = globalenv()) +assign_all(c("a", "fun", 
"clos"), to = e) + +test_that("assign_all() works", { + expect_true( + all(sapply(e, function(x) if(is.function(x)) identical(parent.env(environment(x)), e) else TRUE)) + ) + expect_equal(ls(environment(e$clos)), "b") +}) + +# discretize_qualitative_var + +test_that("discretize_qualitative_var yield same results as stats::model.matrix()", { + expect_identical( + unname(as.matrix(discretize_qualitative_var(ict_sample$division))), + { + tmp <- stats::model.matrix(~ ict_sample$division - 1) + attributes(tmp) <- attributes(tmp)["dim"] + tmp + } + ) +}) +test_that("NA values are handled correctly", { + # NA in var => NA for all dummy variables + ict_sample$division[1] <- NA + expect_equal( + as.vector(discretize_qualitative_var(setNames(ict_sample$division, ict_sample$firm_id))[1, ]), + as.numeric(rep(NA, 6)) + ) + rm(ict_sample) +}) + + diff --git a/tests/testthat/test_variance_function.R b/tests/testthat/test_variance_function.R index 71765a6..0d0ac70 100644 --- a/tests/testthat/test_variance_function.R +++ b/tests/testthat/test_variance_function.R @@ -1,22 +1,96 @@ - -context("variance_function") - -N <- 1000 -n <- 10 -pikl <- matrix(rep(n*(n - 1) / (N * (N - 1)), n^2), ncol = n) -diag(pikl) <- rep(n/N, rep = n) -y <- rnorm(n) - -test_that("varDT works", { - precalcDT <- varDT(y = NULL, pik = diag(pikl)) - expect_error(varDT(y, pik = diag(pikl)), regexp = NA) - expect_error(varDT(y, precalc = precalcDT), regexp = NA) - expect_equal(varDT(y, pik = diag(pikl)), varDT(y, precalc = precalcDT)) -}) - -test_that("varSYG works", { - precalcSYG <- varSYG(y = NULL, pikl = pikl) - expect_error(varSYG(y, pikl = pikl), regexp = NA) - expect_error(varSYG(y = y, precalc = precalcSYG), regexp = NA) - expect_equal(varSYG(y, pikl = pikl), varSYG(y = y, precalc = precalcSYG)) -}) + +context("variance_function") + +# res_cal + + +test_that("res_cal works as expected", { + y <- ict_sample$employees + x <- ict_sample$turnover + expect_equal(res_cal(y, x), unname(lm(y ~ x - 1)$residuals)) 
+ x <- make_block(x, ict_sample$division) + expect_equal(res_cal(y, x), unname(lm(y ~ as.matrix(x) - 1)$residuals)) + x <- x[, c(1:NCOL(x), 1)] + expect_message(res_cal(y, x), regexp = "Some variables in x were discarded due to collinearity.") + expect_equal(suppressWarnings(res_cal(y, x)), unname(lm(y ~ as.matrix(x) - 1)$residuals)) +}) + + + + + +# varDT and varSYG + +N <- 1000 +n <- 10 +pikl <- matrix(rep(n*(n - 1) / (N * (N - 1)), n^2), ncol = n) +diag(pikl) <- rep(n/N, rep = n) +y <- rnorm(n) + +test_that("varDT works", { + precalcDT <- varDT(y = NULL, pik = diag(pikl)) + expect_error(varDT(y, pik = diag(pikl)), regexp = NA) + expect_error(varDT(y, precalc = precalcDT), regexp = NA) + expect_equal(varDT(y, pik = diag(pikl)), varDT(y, precalc = precalcDT)) +}) + +test_that("varSYG works", { + precalcSYG <- varSYG(y = NULL, pikl = pikl) + expect_error(varSYG(y, pikl = pikl), regexp = NA) + expect_error(varSYG(y = y, precalc = precalcSYG), regexp = NA) + expect_equal(varSYG(y, pikl = pikl), varSYG(y = y, precalc = precalcSYG)) +}) + +test_that("varSYG and varDT yield the same results in the SRS case", { + expect_equal(varDT(y, pik = diag(pikl)), varSYG(y, pikl = pikl)) +}) + + +# varDT() +test_that("colinearity detection works", { + pik <- 1 / ict_sample$w_sample + strata <- ict_sample$division + y <- ict_sample$turnover + expect_message( + varDT(y = NULL, pik = pik, x = matrix(rep(pik, 2), ncol = 2), strata = strata), + regexp = "Some variables in x were discarded due to collinearity." 
+ ) + expect_identical( + suppressWarnings(varDT(y = NULL, pik = pik, x = matrix(rep(pik, 2), ncol = 2), strata = strata)), + suppressWarnings(varDT(y = NULL, pik = pik, x = pik, strata = strata)) + ) + x_tmp <- make_block(matrix(rep(pik, 2), ncol = 2), strata)[, -c(2, 4)] + expect_identical( + suppressWarnings(varDT(y = NULL, pik = pik, x = x_tmp, strata = strata)), + suppressWarnings(varDT(y = NULL, pik = pik, x = pik, strata = strata)) + ) +}) +test_that("exhaustive units are handled correctly", { + pik <- 1 / ict_sample$w_sample + strata <- ict_sample$division + y <- ict_sample$turnover + pik[1:10] <- 1 + expect_message( + varDT(y = NULL, pik = pik, strata = strata), + regexp = "units are exhaustive \\(pik = 1\\). They are discarded from the variance estimation process." + ) + pik <- 1 / ict_sample$w_sample + pik[strata == "62"] <- 1 + expect_message( + varDT(y = NULL, pik = pik, strata = strata), + regexp = "units are exhaustive \\(pik = 1\\). They are discarded from the variance estimation process." + ) +}) +test_that("non-matching id raise an error", { + id <- ict_sample$firm_id + pik <- 1 / ict_sample$w_sample + strata <- ict_sample$division + precalc <- suppressWarnings(varDT(y = NULL, pik = pik, strata = strata, id = id)) + y <- setNames(ict_sample$turnover, id) + expect_error(varDT(y, precalc = precalc), regexp = NA) + expect_error( + varDT(y[rev(seq_along(y))], precalc = precalc), + regexp = "The names of the data matrix \\(y argument\\) do not match" + ) + +}) \ No newline at end of file