version 1.4.0

cran · Jul 16, 2020 · 16537bc · 16537bc
1 parent d3767cf
commit 16537bc
Show file tree

Hide file tree

Showing 23 changed files with 618 additions and 91 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,14 +1,14 @@
 Package: StatMatch
-Version: 1.3.0
+Version: 1.4.0
 Title: Statistical Matching or Data Fusion
 Author: Marcello D'Orazio
 Maintainer: Marcello D'Orazio <mdo.statmatch@gmail.com>
-Depends: R (>= 2.7.0), proxy, clue, survey, RANN, lpSolve
-Suggests: MASS, Hmisc, mipfp, R.rsp
+Depends: R (>= 2.7.0), proxy, survey, lpSolve, ggplot2
+Suggests: Hmisc, MASS, mipfp, R.rsp, clue, RANN,
 Description: Integration of two data sources referred to the same target population which share a number of variables. Some functions can also be used to impute missing values in data sets through hot deck imputation methods. Methods to perform statistical matching when dealing  with data from complex sample surveys are available too.
 License: GPL (>= 2)
 VignetteBuilder: R.rsp
 NeedsCompilation: no
-Packaged: 2019-03-15 20:43:49 UTC; madorazi
+Packaged: 2020-07-16 09:32:31 UTC; pc
 Repository: CRAN
-Date/Publication: 2019-03-15 21:13:55 UTC
+Date/Publication: 2020-07-16 10:00:07 UTC
diff --git a/MD5 b/MD5
@@ -1,46 +1,52 @@
-8f1ff850aa68ba3201cb8b7502871c6c *DESCRIPTION
-3b52596f6a8f3929b305b879271ed878 *NAMESPACE
-df24ce059f969a49b91a51daa06d91db *NEWS
+d5ec6782bb35dc30b4e41bc5733fc34f *DESCRIPTION
+12d49887c20f3c8a3ec370b6775ea9c4 *NAMESPACE
+e60a1e20f5e5722fafa79fbbad5b8f93 *NEWS
 710dc79c118538c09a8e9089196c6d32 *R/Fbwidths.by.x.R
-94321ae5a6c44c8334c5dffe232a11a2 *R/Frechet.bounds.cat.R
-c2ef0952a666e5b028fdbf70635d79a7 *R/NND.hotdeck.R
-94766e1e4858a3557662f9632203a4e4 *R/RANDwNND.hotdeck.R
+bf4476495d1252fc72251035e7bc3aef *R/Frechet.bounds.cat.R
+9e288f3cbf081439796a17c8a886cd0a *R/NND.hotdeck.R
+15fca44df746a7a02f7cc111d312a309 *R/RANDwNND.hotdeck.R
 7bfd1d76ab7e204a369e0ee8be055906 *R/SelMtc.by.unc.R
 39b227871e1960df068a5e3539091e5b *R/comb.samples.R
 1e26a60b90775c9598037cddb8bd7dbf *R/comp.prop.R
 9396068abb28c862c4574879b36b1845 *R/create.fused.R
-32386fc5fb7bb1da9c17d9bd836aee41 *R/fact2dummy.R
+21104865f42be4b204ef013725b544e0 *R/fact2dummy.R
 66b6ef00ad3a333e6c8b0eccec9155e8 *R/gower.dist.R
 5256c8a2a230ab4efddb2704dd24a07d *R/harmonize.x.R
 67588f85e4361c767744277e9a89c633 *R/mahalanobis.dist.R
 01ad43b31e3f870348a8ec9e6be7a607 *R/maximum.dist.R
-cccc63bd3a57274aa430a19de5beaa95 *R/mixed.mtc.R
+e2b0c6bfdd406ba5433f57af0412fff4 *R/mixed.mtc.R
 e67cbe52ae4305e1ea873fd6c7e0282a *R/pBayes.R
-1f17e731e80cbf3481b570b3a84af12c *R/pw.assoc.R
-674f43d2851c99d7811cbb6fc6cbc789 *R/rankNND_hotdeck.R
-c9d0425b7d3df0b4a5b4231f7f46f964 *build/vignette.rds
+c99014e20d38ec6cfafa47d248553eea *R/plotBounds.R
+ccf53634dc11e591cabacac9038555ad *R/plotCont.R
+136299227d21ada2ad5310e48fba705d *R/plotTab.R
+c3396de6b0626042438d5d96d412604f *R/pw.assoc.R
+8418569aa3cb1ace616fcfef64846a8c *R/rankNND_hotdeck.R
+636a3920c4d7b49d2a49f2526093a138 *build/vignette.rds
 e324fe9c04c91f84845bc71d9bddc05e *data/samp.A.rda
 5e4f59e4955cab46754444b8c07ad227 *data/samp.B.rda
 b643d3bc6da0098b5080972f85e65555 *data/samp.C.rda
-ad2e4f5a66bba2e51da171f1dd6e96c9 *inst/doc/Statistical_Matching_with_StatMatch.pdf
+8b7b1b76f7812dcbd2e868b330a7d9a3 *inst/doc/Statistical_Matching_with_StatMatch.pdf
 b48ddaff48f1434f5dabf4b9f8e96c78 *inst/doc/Statistical_Matching_with_StatMatch.pdf.asis
 e3f64b661ac940ae91b92f0463ed246f *man/Fbwidths.by.x.Rd
 df8868eacac77e804dd838d209938fc6 *man/Frechet.bounds.cat.Rd
-c22a82d8f82b0fc837007620ec77d42e *man/NND.hotdeck.Rd
-5ad521f62ad6478fe440956bab3f7771 *man/RANDwNND.hotdeck.Rd
+e5175e28f7a5a42b62d45abbc8ad36ab *man/NND.hotdeck.Rd
+59ec8be2670209c0dd1787434e0499aa *man/RANDwNND.hotdeck.Rd
 ce468a35db743537b0800f1c988e3faa *man/SelMtc.by.unc.Rd
 81b5749ccbe727fb0b75ca88f5c1983e *man/StatMatch-package.Rd
 93703b664976aae75ad5562542ebe41d *man/comb.samples.Rd
 e391aca67e29a38c555acabd597e48bd *man/comp.prop.Rd
 324c98b7039e764bb51acde2c2190bba *man/create.fused.Rd
-71c6e7b22f95cc23057c93c4e0d5daa2 *man/fact2dummy.Rd
+09bd9e4d843fe8ad92759c0c9598361f *man/fact2dummy.Rd
 79d16558284af55c804765d2c3afca5b *man/gower.dist.Rd
 8428ecc9760d98962966cd27051c6cbd *man/harmonize.x.Rd
 95f96ddfa5cc52f234da8534e259dd60 *man/mahalanobis.dist.Rd
 aaa0b0b8436088e3f0f6a8c82025159b *man/maximum.dist.Rd
 00edaa49641fae6f12841cb6b178efb6 *man/mixed.mtc.Rd
 61b191b7ade8f8a1a6c899f69f1c7d5a *man/pBayes.Rd
-51a41af534674ee5063a03b2ab43bb0b *man/pw.assoc.Rd
+af2c624ed26de2997bffa1008017a230 *man/plotBounds.Rd
+58682098f267b3233ea194d84b71ab12 *man/plotCont.Rd
+b4c7b37a008fc240718c47b9ba762727 *man/plotTab.Rd
+e25d57f011c3191baae38bd2fdc4f884 *man/pw.assoc.Rd
 be5de909fbeb3a0fbc9fcc523cc5b631 *man/rankNND.hotdeck.Rd
 bbacd0409c0c449993ea8ea8845ba6be *man/samp.A.Rd
 a9a225973c467c607083ecb5f5d4b36d *man/samp.B.Rd

diff --git a/NAMESPACE b/NAMESPACE
@@ -4,16 +4,19 @@
 exportPattern(".")
 
 # Import all packages listed as Imports or Depends
-#importFrom("lpSolve", lp.assign, lp.transport)
+
 import(
   proxy,
-  clue,
   survey,
-  RANN,
-  lpSolve
+  ggplot2
 )
  importFrom("stats", "as.formula", "chisq.test", "coefficients", "cor",
-     "cov2cor", "lm", "mahalanobis", "model.frame",
-     "model.matrix", "qchisq", "residuals", "rnorm", "runif",
+     "cov2cor", "IQR","lm", "mahalanobis", "model.frame",
+     "model.matrix", "model.matrix.lm","quantile", "qchisq", "residuals", "rnorm", "runif",
      "sd", "var", "weights", "xtabs", "ftable", "loglin")
- importFrom("utils", "combn")
+ importFrom("utils", "combn", "globalVariables")
+ importFrom("graphics", "plot", "lines", "axis", "text", "par")
+ importFrom("lpSolve", lp.assign, lp.transport) 
+# importFrom("clue", "solve_LSAP")
+# importFrom("RANN", "nn2")
+# importFrom("Hmisc", "wtd.quantile")
diff --git a/NEWS b/NEWS
@@ -1,3 +1,27 @@
+1.4.0 Addedd functions for plotting results, changes to some code for better management of the NAs
+
+		NND.hotdeck and RANDwNND.hotdeck NO more trasform the categorical matching variables in dummies
+		when the chosen distance function is defined only for numerical variables; in practice, mixed-type matching variables
+		can only be used with the Gower's distance
+
+		fact2dummy: when a NA is observed for a categorical variable then the function puts NAs in all the dummy 
+                variables generated from it
+
+		pw.assoc discards NAs before calculation of the associaione or PRE measures; removal follows the pairwise 
+		deletion rule (units where one of both the values are missing are discarded)
+
+		plotTab is a NEW function for comparing the marginal distributions of the same categorica variable(s) but estimated 
+		from two different data sources
+
+		plotCont is a NEW function for comparing the marginal distributions of the same numerical variable but 
+		estimated from two different data sources
+
+		plotBounds is a NEW function providing a graphical summary of the width of the Frechet Bounds estimated with 
+		the Frechet.bounds.cat function
+
+
+#########################################################################################################################
+
 1.3.0 changes in the functions related to uncertainty investigation when dealing with categorical variables
 
 		Frechet.bounds.cat now permits to align marginal distributions of X variables via IPF algorithm
@@ -10,7 +34,7 @@
 		uncertainty estimate, as in D'Orazio, Di Zio, Scanu 2017 paper (see ref in help pages)
 
 		Updates in pw.assoc() to allow computation of bias corrected Cramer's V, mutual information (also 
-		normalized), AIC and BIC. Results can be organized in a data.frame. Changes in the dicumentation layout
+		normalized), AIC and BIC. Results can be organized in a data.frame. Changes in the documentation layout
 		to achieve coherence with documentation of other functions in the package
 
 		Please note that Vignette is frozen to StatMatch 1.2.5, therefore it will not provide new feauter related to investigation

diff --git a/R/Frechet.bounds.cat.R b/R/Frechet.bounds.cat.R
@@ -22,10 +22,9 @@ Frechet.bounds.cat <- function (tab.x, tab.xy, tab.xz,
         dimnames(low) <- dimnames(upper) <- dimnames(ind) <- list(lab.y, 
                                                                   lab.z)
         class(low) <- class(upper) <- class(ind) <- "table"
-        H.y <- sum(-1 * p.y * log(p.y), na.rm = TRUE)
-        H.z <- sum(-1 * p.z * log(p.z), na.rm = TRUE)
-        res.0 <- list(low.u = low, up.u = upper, IA = ind, H = c(H.y, 
-                                                                 H.z), uncertainty = mean(upper - low))
+        # H.y <- sum(-1 * p.y * log(p.y), na.rm = TRUE)
+        # H.z <- sum(-1 * p.z * log(p.z), na.rm = TRUE)
+        res.0 <- list(low.u = low, up.u = upper, IA = ind, uncertainty = mean(upper - low))
         if (prn == "tables") {
             out <- res.0
         }
@@ -34,8 +33,7 @@ Frechet.bounds.cat <- function (tab.x, tab.xy, tab.xz,
             colnames(df) <- c("Y", "Z", "low.u")
             df$IA <- c(ind)
             df$up.u <- c(upper)
-            out <- list(bounds = df, H = c(H.y, H.z), uncertainty = mean(upper - 
-                                                                             low))
+            out <- list(bounds = df, uncertainty = mean(upper - low))
         }
         out
     }

diff --git a/R/NND.hotdeck.R b/R/NND.hotdeck.R
@@ -2,6 +2,7 @@ NND.hotdeck <- function (data.rec, data.don, match.vars, don.class = NULL,
           dist.fun = "Manhattan", constrained = FALSE, constr.alg = "Hungarian", 
           k=1, keep.t = FALSE, ...) 
 {
+# initial checks on the input arguments    
     p <- length(match.vars)
     if (!is.null(dim(data.rec))) {
         nr <- nrow(data.rec)
@@ -19,6 +20,7 @@ NND.hotdeck <- function (data.rec, data.don, match.vars, don.class = NULL,
         nd <- length(data.don)
         d.lab <- names(data.don)
     }
+# labels of the observations    
     if (is.null(r.lab)) 
         r.lab <- paste("rec", 1:nr, sep = "=")
     else r.lab <- paste("rec", r.lab, sep = "=")
@@ -27,24 +29,27 @@ NND.hotdeck <- function (data.rec, data.don, match.vars, don.class = NULL,
         d.lab <- paste("don", 1:nd, sep = "=")
     else d.lab <- paste("don", d.lab, sep = "=")
     row.names(data.don) <- d.lab
+# check the coherence between the type of matching variables and the distance
     if (!is.null(match.vars)) {
-        if (dist.fun == "Euclidean" || dist.fun == "euclidean" || 
-            dist.fun == "Manhattan" || dist.fun == "Mahalanobis" || 
-            dist.fun == "mahalanobis" || dist.fun == "manhattan" || 
-            dist.fun == "minimax" || dist.fun == "MiniMax" || 
-            dist.fun == "Minimax") {
-            cat("Warning: The ", dist.fun, " distance is being used", 
-                fill = TRUE)
-            cat("All the categorical matching variables in rec and don \n data.frames, if present are recoded into dummies", 
-                fill = TRUE)
-        }
+
+        if (
+            (tolower(dist.fun) %in% c("euclidean", "manhattan", 
+                                      "mahalanobis", "minimax"))
+            & (!all(sapply(data.rec[,match.vars], is.numeric)))
+        )
+        stop("The chosen distance function requires numeric matching variables \n
+             with mixed-type matching variables please use the Gower's distance")
         if (dist.fun == "exact" || dist.fun == "exact matching") {
             cat("Warning: the exact matching distance is being used", 
                 fill = TRUE)
             cat("all the matching variables in rec and don are converted \n to character variables and are treated as categorical nominal", 
                 fill = TRUE)
         }
     }
+###### END of initial checks
+#####################################
+# 'ghost' function used at the core of NND
+#    
     NND.hd <- function(rec, don, dfun = "Manhattan", constr = FALSE, 
                        c.alg = NULL, ...) {
         x.rec <- rec
@@ -63,27 +68,15 @@ NND.hotdeck <- function (data.rec, data.don, match.vars, don.class = NULL,
             d.lab <- paste("don", 1:nr, sep = "=")
         if (dfun == "Euclidean" || dfun == "euclidean" || dfun == 
             "Manhattan" || dfun == "manhattan") {
-            if (is.data.frame(x.rec)) 
-                x.rec <- fact2dummy(x.rec, all = TRUE)
-            if (is.data.frame(x.don)) 
-                x.don <- fact2dummy(x.don, all = TRUE)
             mdist <- proxy::dist(x = x.rec, y = x.don, method = dfun, 
                           ...)
         }
         else if (dfun == "Mahalanobis" || dfun == "mahalanobis") {
-            if (is.data.frame(x.rec)) 
-                x.rec <- fact2dummy(x.rec, all = TRUE)
-            if (is.data.frame(x.don)) 
-                x.don <- fact2dummy(x.don, all = TRUE)
             mdist <- mahalanobis.dist(data.x = x.rec, data.y = x.don, 
                                       ...)
         }
         else if (dfun == "minimax" || dfun == "MiniMax" || dfun == 
                  "Minimax") {
-            if (is.data.frame(x.rec)) 
-                x.rec <- fact2dummy(x.rec, all = TRUE)
-            if (is.data.frame(x.don)) 
-                x.don <- fact2dummy(x.don, all = TRUE)
             mdist <- maximum.dist(data.x = x.rec, data.y = x.don, 
                                   ...)
         }
@@ -106,6 +99,12 @@ NND.hotdeck <- function (data.rec, data.don, match.vars, don.class = NULL,
             mdist[is.nan(mdist)] <- 1
             mdist[is.na(mdist)] <- 1
         }
+        # else if (tolower(dfun) == "modgower") {
+        #     mdist <- gower.dist.mod(data.x = x.rec, data.y = x.don, 
+        #                         ...)
+        #     mdist[is.nan(mdist)] <- 1
+        #     mdist[is.na(mdist)] <- 1
+        # }
         else {
             mdist <- proxy::dist(x = x.rec, y = x.don, method = dfun, 
                           ...)
@@ -172,7 +171,7 @@ NND.hotdeck <- function (data.rec, data.don, match.vars, don.class = NULL,
         if (constr && (c.alg == "Hungarian" || c.alg == "hungarian")) {
             if (nr > nd) 
                 stop("When using the Hungarian algorithm \n the no. of donors must be greater \n or equal than the no. of recipients")
-            sol <- solve_LSAP(x = mdist, maximum = FALSE)
+            sol <- clue::solve_LSAP(x = mdist, maximum = FALSE)
             rec.lab <- r.lab
             don.lab <- d.lab[as.integer(sol)]
             dist.rd <- mdist[cbind(rec.lab, don.lab)]
@@ -185,6 +184,8 @@ NND.hotdeck <- function (data.rec, data.don, match.vars, don.class = NULL,
                           noad = nad, call = match.call())
         fine
     }
+##### END of the 'ghost' function NND.hd()
+##################################################
     if (is.null(don.class)) {
         out <- NND.hd(rec = data.rec[, match.vars, drop = FALSE], 
                       don = data.don[, match.vars, drop = FALSE], dfun = dist.fun, 

diff --git a/R/RANDwNND.hotdeck.R b/R/RANDwNND.hotdeck.R
@@ -27,10 +27,14 @@ function (data.rec, data.don, match.vars=NULL, don.class=NULL, dist.fun="Manhatt
 
     p <- length(match.vars)
     if(!is.null(match.vars)){
-        if(dist.fun=="Euclidean" || dist.fun=="euclidean" || dist.fun=="Manhattan" || dist.fun=="manhattan" || dist.fun=="Mahalanobis" || dist.fun=="mahalanobis" || dist.fun=="minimax" || dist.fun=="MiniMax" || dist.fun=="Minimax"){
-            cat("Warning: The ", dist.fun, " distance is being used", fill=TRUE)
-            cat("All the categorical matching variables in rec and don data.frames, \n if present, are recoded into dummies", fill=TRUE)
-        }
+        if (
+            (tolower(dist.fun) %in% c("euclidean", "manhattan", 
+                                      "mahalanobis", "minimax"))
+            & (!all(sapply(data.rec[,match.vars], is.numeric)))
+        )
+            stop("The chosen distance function requires numeric matching variables, \n
+                 with mixed-type matching variables please use Gower's distance")
+
         if(dist.fun=="exact" || dist.fun=="exact matching"){
             cat("Warning: the exact matching distance is being used", fill=TRUE)
             cat("all the matching variables in rec and don are converted to \n character variables and are treated as categorical nominal", fill=TRUE)
@@ -78,20 +82,17 @@ RANDwNND.hd <- function (rec, don, dfun="Manhattan", cut.don="rot", k=NULL, w.do
 
     if(dfun=="Euclidean" || dfun=="Manhattan"){
    #     require(proxy)
-        x.rec <- fact2dummy(x.rec, all=TRUE)
-        x.don <- fact2dummy(x.don, all=TRUE)
+
         mdist <- proxy::dist(x=x.rec, y=x.don, method=dfun, ...)
         dimnames(mdist) <- list(r.lab, d.lab)
     }
 	else if(dfun=="Mahalanobis" || dfun=="mahalanobis"){
-        if(is.data.frame(x.rec)) x.rec <- fact2dummy(x.rec, all=TRUE)
-        if(is.data.frame(x.don)) x.don <- fact2dummy(x.don, all=TRUE)
+
         mdist <- mahalanobis.dist(data.x=x.rec, data.y=x.don, ...)
         dimnames(mdist) <- list(r.lab, d.lab)
 	}
 	else if(dfun=="minimax" || dfun=="MiniMax" || dfun=="Minimax"){
-        x.rec <- fact2dummy(x.rec, all=TRUE)
-        x.don <- fact2dummy(x.don, all=TRUE)
+
         mdist <- maximum.dist(data.x=x.rec, data.y=x.don, ...)
         dimnames(mdist) <- list(r.lab, d.lab)
 	}

diff --git a/R/fact2dummy.R b/R/fact2dummy.R
@@ -1,12 +1,14 @@
 `fact2dummy` <-
 function (data, all=TRUE, lab="x") 
 {
+#########################################
 	dum.fcn <- function(x, all=TRUE){
-        fine <- model.matrix(~x-1)
+        fine <- model.matrix.lm(~x-1, na.action = "na.pass")
 		colnames(fine) <- levels(x)
 		if(!all) fine <- fine[,-ncol(fine), drop=FALSE]
 		fine
-    }
+	}
+###########################################
 
 	if(is.null(dim(data))){
 		if(class(data)[1]=="numeric" || class(data)[1]=="integer" || class(data)[1]=="logical") oo <- cbind(1*data)
@@ -40,10 +42,12 @@ function (data, all=TRUE, lab="x")
             # oo <- unlist(out)
             # oo <- matrix(oo, nrow=n)
             # dimnames(oo) <- list(row.names(data), unlist(lapply(out, colnames)))
+		    #cat(unlist(lapply(out, nrow)), fill=T)
 		    oo <- do.call("cbind", out)
 		}
 	}	
 	rownames(oo) <- rownames(data)
 	oo
+
 }
 
diff --git a/R/mixed.mtc.R b/R/mixed.mtc.R
@@ -218,7 +218,7 @@ function (data.rec, data.don, match.vars, y.rec, z.don, method="ML", rho.yz=NULL
 		        r.rhs <- rep(1, nA)
 		        c.sig <- rep("<=", nB)
 		        c.rhs <- rep(1, nB)
-		        appo <- lp.transport(cost.mat=madist, row.signs=r.sig, row.rhs=r.rhs, col.signs=c.sig, col.rhs=c.rhs)
+		        appo <- lpSolve::lp.transport(cost.mat=madist, row.signs=r.sig, row.rhs=r.rhs, col.signs=c.sig, col.rhs=c.rhs)
 		    }   
 		    sol <- appo$solution
 		    ss <- c(t(sol))
@@ -231,7 +231,7 @@ function (data.rec, data.don, match.vars, y.rec, z.don, method="ML", rho.yz=NULL
 		else if(constr.alg=="hungarian" || constr.alg=="Hungarian"){
 		    if(nA > nB) stop("It is required  that the no. of donors \n 
 		                      is equal or greater than the no. of recipients")
-		    sol <- solve_LSAP(x=madist, maximum=FALSE)
+		    sol <- clue::solve_LSAP(x=madist, maximum=FALSE)
 		    don.lab <- B.lab[as.integer(sol)]
 		    dist.rd <- madist[cbind(A.lab, don.lab)]
         }