Skip to content

Commit

Permalink
version 1.4.0
Browse files Browse the repository at this point in the history
  • Loading branch information
Marcello D'"'"'Orazio authored and cran-robot committed Jul 16, 2020
1 parent d3767cf commit 16537bc
Show file tree
Hide file tree
Showing 23 changed files with 618 additions and 91 deletions.
10 changes: 5 additions & 5 deletions DESCRIPTION
@@ -1,14 +1,14 @@
Package: StatMatch
Version: 1.3.0
Version: 1.4.0
Title: Statistical Matching or Data Fusion
Author: Marcello D'Orazio
Maintainer: Marcello D'Orazio <mdo.statmatch@gmail.com>
Depends: R (>= 2.7.0), proxy, clue, survey, RANN, lpSolve
Suggests: MASS, Hmisc, mipfp, R.rsp
Depends: R (>= 2.7.0), proxy, survey, lpSolve, ggplot2
Suggests: Hmisc, MASS, mipfp, R.rsp, clue, RANN,
Description: Integration of two data sources referred to the same target population which share a number of variables. Some functions can also be used to impute missing values in data sets through hot deck imputation methods. Methods to perform statistical matching when dealing with data from complex sample surveys are available too.
License: GPL (>= 2)
VignetteBuilder: R.rsp
NeedsCompilation: no
Packaged: 2019-03-15 20:43:49 UTC; madorazi
Packaged: 2020-07-16 09:32:31 UTC; pc
Repository: CRAN
Date/Publication: 2019-03-15 21:13:55 UTC
Date/Publication: 2020-07-16 10:00:07 UTC
38 changes: 22 additions & 16 deletions MD5
@@ -1,46 +1,52 @@
8f1ff850aa68ba3201cb8b7502871c6c *DESCRIPTION
3b52596f6a8f3929b305b879271ed878 *NAMESPACE
df24ce059f969a49b91a51daa06d91db *NEWS
d5ec6782bb35dc30b4e41bc5733fc34f *DESCRIPTION
12d49887c20f3c8a3ec370b6775ea9c4 *NAMESPACE
e60a1e20f5e5722fafa79fbbad5b8f93 *NEWS
710dc79c118538c09a8e9089196c6d32 *R/Fbwidths.by.x.R
94321ae5a6c44c8334c5dffe232a11a2 *R/Frechet.bounds.cat.R
c2ef0952a666e5b028fdbf70635d79a7 *R/NND.hotdeck.R
94766e1e4858a3557662f9632203a4e4 *R/RANDwNND.hotdeck.R
bf4476495d1252fc72251035e7bc3aef *R/Frechet.bounds.cat.R
9e288f3cbf081439796a17c8a886cd0a *R/NND.hotdeck.R
15fca44df746a7a02f7cc111d312a309 *R/RANDwNND.hotdeck.R
7bfd1d76ab7e204a369e0ee8be055906 *R/SelMtc.by.unc.R
39b227871e1960df068a5e3539091e5b *R/comb.samples.R
1e26a60b90775c9598037cddb8bd7dbf *R/comp.prop.R
9396068abb28c862c4574879b36b1845 *R/create.fused.R
32386fc5fb7bb1da9c17d9bd836aee41 *R/fact2dummy.R
21104865f42be4b204ef013725b544e0 *R/fact2dummy.R
66b6ef00ad3a333e6c8b0eccec9155e8 *R/gower.dist.R
5256c8a2a230ab4efddb2704dd24a07d *R/harmonize.x.R
67588f85e4361c767744277e9a89c633 *R/mahalanobis.dist.R
01ad43b31e3f870348a8ec9e6be7a607 *R/maximum.dist.R
cccc63bd3a57274aa430a19de5beaa95 *R/mixed.mtc.R
e2b0c6bfdd406ba5433f57af0412fff4 *R/mixed.mtc.R
e67cbe52ae4305e1ea873fd6c7e0282a *R/pBayes.R
1f17e731e80cbf3481b570b3a84af12c *R/pw.assoc.R
674f43d2851c99d7811cbb6fc6cbc789 *R/rankNND_hotdeck.R
c9d0425b7d3df0b4a5b4231f7f46f964 *build/vignette.rds
c99014e20d38ec6cfafa47d248553eea *R/plotBounds.R
ccf53634dc11e591cabacac9038555ad *R/plotCont.R
136299227d21ada2ad5310e48fba705d *R/plotTab.R
c3396de6b0626042438d5d96d412604f *R/pw.assoc.R
8418569aa3cb1ace616fcfef64846a8c *R/rankNND_hotdeck.R
636a3920c4d7b49d2a49f2526093a138 *build/vignette.rds
e324fe9c04c91f84845bc71d9bddc05e *data/samp.A.rda
5e4f59e4955cab46754444b8c07ad227 *data/samp.B.rda
b643d3bc6da0098b5080972f85e65555 *data/samp.C.rda
ad2e4f5a66bba2e51da171f1dd6e96c9 *inst/doc/Statistical_Matching_with_StatMatch.pdf
8b7b1b76f7812dcbd2e868b330a7d9a3 *inst/doc/Statistical_Matching_with_StatMatch.pdf
b48ddaff48f1434f5dabf4b9f8e96c78 *inst/doc/Statistical_Matching_with_StatMatch.pdf.asis
e3f64b661ac940ae91b92f0463ed246f *man/Fbwidths.by.x.Rd
df8868eacac77e804dd838d209938fc6 *man/Frechet.bounds.cat.Rd
c22a82d8f82b0fc837007620ec77d42e *man/NND.hotdeck.Rd
5ad521f62ad6478fe440956bab3f7771 *man/RANDwNND.hotdeck.Rd
e5175e28f7a5a42b62d45abbc8ad36ab *man/NND.hotdeck.Rd
59ec8be2670209c0dd1787434e0499aa *man/RANDwNND.hotdeck.Rd
ce468a35db743537b0800f1c988e3faa *man/SelMtc.by.unc.Rd
81b5749ccbe727fb0b75ca88f5c1983e *man/StatMatch-package.Rd
93703b664976aae75ad5562542ebe41d *man/comb.samples.Rd
e391aca67e29a38c555acabd597e48bd *man/comp.prop.Rd
324c98b7039e764bb51acde2c2190bba *man/create.fused.Rd
71c6e7b22f95cc23057c93c4e0d5daa2 *man/fact2dummy.Rd
09bd9e4d843fe8ad92759c0c9598361f *man/fact2dummy.Rd
79d16558284af55c804765d2c3afca5b *man/gower.dist.Rd
8428ecc9760d98962966cd27051c6cbd *man/harmonize.x.Rd
95f96ddfa5cc52f234da8534e259dd60 *man/mahalanobis.dist.Rd
aaa0b0b8436088e3f0f6a8c82025159b *man/maximum.dist.Rd
00edaa49641fae6f12841cb6b178efb6 *man/mixed.mtc.Rd
61b191b7ade8f8a1a6c899f69f1c7d5a *man/pBayes.Rd
51a41af534674ee5063a03b2ab43bb0b *man/pw.assoc.Rd
af2c624ed26de2997bffa1008017a230 *man/plotBounds.Rd
58682098f267b3233ea194d84b71ab12 *man/plotCont.Rd
b4c7b37a008fc240718c47b9ba762727 *man/plotTab.Rd
e25d57f011c3191baae38bd2fdc4f884 *man/pw.assoc.Rd
be5de909fbeb3a0fbc9fcc523cc5b631 *man/rankNND.hotdeck.Rd
bbacd0409c0c449993ea8ea8845ba6be *man/samp.A.Rd
a9a225973c467c607083ecb5f5d4b36d *man/samp.B.Rd
Expand Down
17 changes: 10 additions & 7 deletions NAMESPACE
Expand Up @@ -4,16 +4,19 @@
exportPattern(".")

# Import all packages listed as Imports or Depends
#importFrom("lpSolve", lp.assign, lp.transport)

import(
proxy,
clue,
survey,
RANN,
lpSolve
ggplot2
)
importFrom("stats", "as.formula", "chisq.test", "coefficients", "cor",
"cov2cor", "lm", "mahalanobis", "model.frame",
"model.matrix", "qchisq", "residuals", "rnorm", "runif",
"cov2cor", "IQR","lm", "mahalanobis", "model.frame",
"model.matrix", "model.matrix.lm","quantile", "qchisq", "residuals", "rnorm", "runif",
"sd", "var", "weights", "xtabs", "ftable", "loglin")
importFrom("utils", "combn")
importFrom("utils", "combn", "globalVariables")
importFrom("graphics", "plot", "lines", "axis", "text", "par")
importFrom("lpSolve", lp.assign, lp.transport)
# importFrom("clue", "solve_LSAP")
# importFrom("RANN", "nn2")
# importFrom("Hmisc", "wtd.quantile")
26 changes: 25 additions & 1 deletion NEWS
@@ -1,3 +1,27 @@
1.4.0 Addedd functions for plotting results, changes to some code for better management of the NAs

NND.hotdeck and RANDwNND.hotdeck NO more trasform the categorical matching variables in dummies
when the chosen distance function is defined only for numerical variables; in practice, mixed-type matching variables
can only be used with the Gower's distance

fact2dummy: when a NA is observed for a categorical variable then the function puts NAs in all the dummy
variables generated from it

pw.assoc discards NAs before calculation of the associaione or PRE measures; removal follows the pairwise
deletion rule (units where one of both the values are missing are discarded)

plotTab is a NEW function for comparing the marginal distributions of the same categorica variable(s) but estimated
from two different data sources

plotCont is a NEW function for comparing the marginal distributions of the same numerical variable but
estimated from two different data sources

plotBounds is a NEW function providing a graphical summary of the width of the Frechet Bounds estimated with
the Frechet.bounds.cat function


#########################################################################################################################

1.3.0 changes in the functions related to uncertainty investigation when dealing with categorical variables

Frechet.bounds.cat now permits to align marginal distributions of X variables via IPF algorithm
Expand All @@ -10,7 +34,7 @@
uncertainty estimate, as in D'Orazio, Di Zio, Scanu 2017 paper (see ref in help pages)

Updates in pw.assoc() to allow computation of bias corrected Cramer's V, mutual information (also
normalized), AIC and BIC. Results can be organized in a data.frame. Changes in the dicumentation layout
normalized), AIC and BIC. Results can be organized in a data.frame. Changes in the documentation layout
to achieve coherence with documentation of other functions in the package

Please note that Vignette is frozen to StatMatch 1.2.5, therefore it will not provide new feauter related to investigation
Expand Down
10 changes: 4 additions & 6 deletions R/Frechet.bounds.cat.R
Expand Up @@ -22,10 +22,9 @@ Frechet.bounds.cat <- function (tab.x, tab.xy, tab.xz,
dimnames(low) <- dimnames(upper) <- dimnames(ind) <- list(lab.y,
lab.z)
class(low) <- class(upper) <- class(ind) <- "table"
H.y <- sum(-1 * p.y * log(p.y), na.rm = TRUE)
H.z <- sum(-1 * p.z * log(p.z), na.rm = TRUE)
res.0 <- list(low.u = low, up.u = upper, IA = ind, H = c(H.y,
H.z), uncertainty = mean(upper - low))
# H.y <- sum(-1 * p.y * log(p.y), na.rm = TRUE)
# H.z <- sum(-1 * p.z * log(p.z), na.rm = TRUE)
res.0 <- list(low.u = low, up.u = upper, IA = ind, uncertainty = mean(upper - low))
if (prn == "tables") {
out <- res.0
}
Expand All @@ -34,8 +33,7 @@ Frechet.bounds.cat <- function (tab.x, tab.xy, tab.xz,
colnames(df) <- c("Y", "Z", "low.u")
df$IA <- c(ind)
df$up.u <- c(upper)
out <- list(bounds = df, H = c(H.y, H.z), uncertainty = mean(upper -
low))
out <- list(bounds = df, uncertainty = mean(upper - low))
}
out
}
Expand Down
47 changes: 24 additions & 23 deletions R/NND.hotdeck.R
Expand Up @@ -2,6 +2,7 @@ NND.hotdeck <- function (data.rec, data.don, match.vars, don.class = NULL,
dist.fun = "Manhattan", constrained = FALSE, constr.alg = "Hungarian",
k=1, keep.t = FALSE, ...)
{
# initial checks on the input arguments
p <- length(match.vars)
if (!is.null(dim(data.rec))) {
nr <- nrow(data.rec)
Expand All @@ -19,6 +20,7 @@ NND.hotdeck <- function (data.rec, data.don, match.vars, don.class = NULL,
nd <- length(data.don)
d.lab <- names(data.don)
}
# labels of the observations
if (is.null(r.lab))
r.lab <- paste("rec", 1:nr, sep = "=")
else r.lab <- paste("rec", r.lab, sep = "=")
Expand All @@ -27,24 +29,27 @@ NND.hotdeck <- function (data.rec, data.don, match.vars, don.class = NULL,
d.lab <- paste("don", 1:nd, sep = "=")
else d.lab <- paste("don", d.lab, sep = "=")
row.names(data.don) <- d.lab
# check the coherence between the type of matching variables and the distance
if (!is.null(match.vars)) {
if (dist.fun == "Euclidean" || dist.fun == "euclidean" ||
dist.fun == "Manhattan" || dist.fun == "Mahalanobis" ||
dist.fun == "mahalanobis" || dist.fun == "manhattan" ||
dist.fun == "minimax" || dist.fun == "MiniMax" ||
dist.fun == "Minimax") {
cat("Warning: The ", dist.fun, " distance is being used",
fill = TRUE)
cat("All the categorical matching variables in rec and don \n data.frames, if present are recoded into dummies",
fill = TRUE)
}

if (
(tolower(dist.fun) %in% c("euclidean", "manhattan",
"mahalanobis", "minimax"))
& (!all(sapply(data.rec[,match.vars], is.numeric)))
)
stop("The chosen distance function requires numeric matching variables \n
with mixed-type matching variables please use the Gower's distance")
if (dist.fun == "exact" || dist.fun == "exact matching") {
cat("Warning: the exact matching distance is being used",
fill = TRUE)
cat("all the matching variables in rec and don are converted \n to character variables and are treated as categorical nominal",
fill = TRUE)
}
}
###### END of initial checks
#####################################
# 'ghost' function used at the core of NND
#
NND.hd <- function(rec, don, dfun = "Manhattan", constr = FALSE,
c.alg = NULL, ...) {
x.rec <- rec
Expand All @@ -63,27 +68,15 @@ NND.hotdeck <- function (data.rec, data.don, match.vars, don.class = NULL,
d.lab <- paste("don", 1:nr, sep = "=")
if (dfun == "Euclidean" || dfun == "euclidean" || dfun ==
"Manhattan" || dfun == "manhattan") {
if (is.data.frame(x.rec))
x.rec <- fact2dummy(x.rec, all = TRUE)
if (is.data.frame(x.don))
x.don <- fact2dummy(x.don, all = TRUE)
mdist <- proxy::dist(x = x.rec, y = x.don, method = dfun,
...)
}
else if (dfun == "Mahalanobis" || dfun == "mahalanobis") {
if (is.data.frame(x.rec))
x.rec <- fact2dummy(x.rec, all = TRUE)
if (is.data.frame(x.don))
x.don <- fact2dummy(x.don, all = TRUE)
mdist <- mahalanobis.dist(data.x = x.rec, data.y = x.don,
...)
}
else if (dfun == "minimax" || dfun == "MiniMax" || dfun ==
"Minimax") {
if (is.data.frame(x.rec))
x.rec <- fact2dummy(x.rec, all = TRUE)
if (is.data.frame(x.don))
x.don <- fact2dummy(x.don, all = TRUE)
mdist <- maximum.dist(data.x = x.rec, data.y = x.don,
...)
}
Expand All @@ -106,6 +99,12 @@ NND.hotdeck <- function (data.rec, data.don, match.vars, don.class = NULL,
mdist[is.nan(mdist)] <- 1
mdist[is.na(mdist)] <- 1
}
# else if (tolower(dfun) == "modgower") {
# mdist <- gower.dist.mod(data.x = x.rec, data.y = x.don,
# ...)
# mdist[is.nan(mdist)] <- 1
# mdist[is.na(mdist)] <- 1
# }
else {
mdist <- proxy::dist(x = x.rec, y = x.don, method = dfun,
...)
Expand Down Expand Up @@ -172,7 +171,7 @@ NND.hotdeck <- function (data.rec, data.don, match.vars, don.class = NULL,
if (constr && (c.alg == "Hungarian" || c.alg == "hungarian")) {
if (nr > nd)
stop("When using the Hungarian algorithm \n the no. of donors must be greater \n or equal than the no. of recipients")
sol <- solve_LSAP(x = mdist, maximum = FALSE)
sol <- clue::solve_LSAP(x = mdist, maximum = FALSE)
rec.lab <- r.lab
don.lab <- d.lab[as.integer(sol)]
dist.rd <- mdist[cbind(rec.lab, don.lab)]
Expand All @@ -185,6 +184,8 @@ NND.hotdeck <- function (data.rec, data.don, match.vars, don.class = NULL,
noad = nad, call = match.call())
fine
}
##### END of the 'ghost' function NND.hd()
##################################################
if (is.null(don.class)) {
out <- NND.hd(rec = data.rec[, match.vars, drop = FALSE],
don = data.don[, match.vars, drop = FALSE], dfun = dist.fun,
Expand Down
21 changes: 11 additions & 10 deletions R/RANDwNND.hotdeck.R
Expand Up @@ -27,10 +27,14 @@ function (data.rec, data.don, match.vars=NULL, don.class=NULL, dist.fun="Manhatt

p <- length(match.vars)
if(!is.null(match.vars)){
if(dist.fun=="Euclidean" || dist.fun=="euclidean" || dist.fun=="Manhattan" || dist.fun=="manhattan" || dist.fun=="Mahalanobis" || dist.fun=="mahalanobis" || dist.fun=="minimax" || dist.fun=="MiniMax" || dist.fun=="Minimax"){
cat("Warning: The ", dist.fun, " distance is being used", fill=TRUE)
cat("All the categorical matching variables in rec and don data.frames, \n if present, are recoded into dummies", fill=TRUE)
}
if (
(tolower(dist.fun) %in% c("euclidean", "manhattan",
"mahalanobis", "minimax"))
& (!all(sapply(data.rec[,match.vars], is.numeric)))
)
stop("The chosen distance function requires numeric matching variables, \n
with mixed-type matching variables please use Gower's distance")

if(dist.fun=="exact" || dist.fun=="exact matching"){
cat("Warning: the exact matching distance is being used", fill=TRUE)
cat("all the matching variables in rec and don are converted to \n character variables and are treated as categorical nominal", fill=TRUE)
Expand Down Expand Up @@ -78,20 +82,17 @@ RANDwNND.hd <- function (rec, don, dfun="Manhattan", cut.don="rot", k=NULL, w.do

if(dfun=="Euclidean" || dfun=="Manhattan"){
# require(proxy)
x.rec <- fact2dummy(x.rec, all=TRUE)
x.don <- fact2dummy(x.don, all=TRUE)

mdist <- proxy::dist(x=x.rec, y=x.don, method=dfun, ...)
dimnames(mdist) <- list(r.lab, d.lab)
}
else if(dfun=="Mahalanobis" || dfun=="mahalanobis"){
if(is.data.frame(x.rec)) x.rec <- fact2dummy(x.rec, all=TRUE)
if(is.data.frame(x.don)) x.don <- fact2dummy(x.don, all=TRUE)

mdist <- mahalanobis.dist(data.x=x.rec, data.y=x.don, ...)
dimnames(mdist) <- list(r.lab, d.lab)
}
else if(dfun=="minimax" || dfun=="MiniMax" || dfun=="Minimax"){
x.rec <- fact2dummy(x.rec, all=TRUE)
x.don <- fact2dummy(x.don, all=TRUE)

mdist <- maximum.dist(data.x=x.rec, data.y=x.don, ...)
dimnames(mdist) <- list(r.lab, d.lab)
}
Expand Down
8 changes: 6 additions & 2 deletions R/fact2dummy.R
@@ -1,12 +1,14 @@
`fact2dummy` <-
function (data, all=TRUE, lab="x")
{
#########################################
dum.fcn <- function(x, all=TRUE){
fine <- model.matrix(~x-1)
fine <- model.matrix.lm(~x-1, na.action = "na.pass")
colnames(fine) <- levels(x)
if(!all) fine <- fine[,-ncol(fine), drop=FALSE]
fine
}
}
###########################################

if(is.null(dim(data))){
if(class(data)[1]=="numeric" || class(data)[1]=="integer" || class(data)[1]=="logical") oo <- cbind(1*data)
Expand Down Expand Up @@ -40,10 +42,12 @@ function (data, all=TRUE, lab="x")
# oo <- unlist(out)
# oo <- matrix(oo, nrow=n)
# dimnames(oo) <- list(row.names(data), unlist(lapply(out, colnames)))
#cat(unlist(lapply(out, nrow)), fill=T)
oo <- do.call("cbind", out)
}
}
rownames(oo) <- rownames(data)
oo

}

4 changes: 2 additions & 2 deletions R/mixed.mtc.R
Expand Up @@ -218,7 +218,7 @@ function (data.rec, data.don, match.vars, y.rec, z.don, method="ML", rho.yz=NULL
r.rhs <- rep(1, nA)
c.sig <- rep("<=", nB)
c.rhs <- rep(1, nB)
appo <- lp.transport(cost.mat=madist, row.signs=r.sig, row.rhs=r.rhs, col.signs=c.sig, col.rhs=c.rhs)
appo <- lpSolve::lp.transport(cost.mat=madist, row.signs=r.sig, row.rhs=r.rhs, col.signs=c.sig, col.rhs=c.rhs)
}
sol <- appo$solution
ss <- c(t(sol))
Expand All @@ -231,7 +231,7 @@ function (data.rec, data.don, match.vars, y.rec, z.don, method="ML", rho.yz=NULL
else if(constr.alg=="hungarian" || constr.alg=="Hungarian"){
if(nA > nB) stop("It is required that the no. of donors \n
is equal or greater than the no. of recipients")
sol <- solve_LSAP(x=madist, maximum=FALSE)
sol <- clue::solve_LSAP(x=madist, maximum=FALSE)
don.lab <- B.lab[as.integer(sol)]
dist.rd <- madist[cbind(A.lab, don.lab)]
}
Expand Down

0 comments on commit 16537bc

Please sign in to comment.