
Commit

version 1.0.2
marcellodo authored and cran-robot committed Jul 4, 2011
1 parent f59e93f commit 05b054b
Showing 20 changed files with 234 additions and 133 deletions.
14 changes: 7 additions & 7 deletions DESCRIPTION
@@ -1,8 +1,8 @@
Package: StatMatch
Type: Package
Title: Statistical Matching
Version: 1.0.1
Date: 2011-02-28
Version: 1.0.2
Date: 2011-06-22
Author: Marcello D'Orazio
Maintainer: Marcello D'Orazio <madorazi@istat.it>
Depends: R (>= 2.7.0), proxy, lpSolve, survey
@@ -11,9 +11,9 @@ Description: This package provides some R functions to perform
statistical matching between two data sources sharing a number
of common variables. Some functions can also be used to impute
missing values in data sets through hot deck imputation
methods. Methods to deal with data from complex sample surveys
are available too.
License: GPL (>= 2)
LazyLoad: no
methods. Methods to perform statistical matching when dealing
with data from complex sample surveys are available too.
License: EUPL
Packaged: 2011-07-04 06:33:30 UTC; madorazi
Repository: CRAN
Date/Publication: 2011-03-12 18:08:38
Date/Publication: 2011-07-04 08:23:03
8 changes: 8 additions & 0 deletions NEWS
@@ -1,3 +1,11 @@
1.0.2 new function mahalanobis.dist() to compute the Mahalanobis distance

fixed a bug in mixed.mtc() when computing the range of admissible values
for rho_yz

fixed a bug in NND.hotdeck() and RANDwNND.hotdeck() when
managing the row.names

1.0.1 new functions harmonize.x() and comb.samples() to perform statistical
matching when dealing with complex sample survey data via
weight calibration.
40 changes: 20 additions & 20 deletions R/NND.hotdeck.R
@@ -1,14 +1,12 @@
`NND.hotdeck` <-
function (data.rec, data.don, match.vars, don.class=NULL, dist.fun="Manhattan", constrained=FALSE, constr.alg=NULL, ...)
{

if(constrained && constr.alg=="relax"){
require(optmatch)
}
if(constrained && (constr.alg=="lpSolve" || constr.alg=="lpsolve")){
require(lpSolve)
}

p <- length(match.vars)
if(!is.null(dim(data.rec))){
nr <- nrow(data.rec)
@@ -35,7 +33,7 @@ function (data.rec, data.don, match.vars, don.class=NULL, dist.fun="Manhattan",
else d.lab <- paste("don", d.lab, sep="=")
row.names(data.don) <- d.lab
if(!is.null(match.vars)){
if(dist.fun=="Euclidean" || dist.fun=="euclidean" ||dist.fun=="Manhattan" ||dist.fun=="manhattan" || dist.fun=="minimax" || dist.fun=="MiniMax" || dist.fun=="Minimax"){
if(dist.fun=="Euclidean" || dist.fun=="euclidean" ||dist.fun=="Manhattan" || dist.fun=="Mahalanobis" || dist.fun=="mahalanobis" || dist.fun=="manhattan" || dist.fun=="minimax" || dist.fun=="MiniMax" || dist.fun=="Minimax"){
cat("Warning: The ", dist.fun, " distance is being used", fill=TRUE)
cat("All the categorical matching variables in rec and don data.frames, if present are recoded into dummies", fill=TRUE)
}
@@ -48,32 +46,34 @@ function (data.rec, data.don, match.vars, don.class=NULL, dist.fun="Manhattan",
########################
NND.hd <- function (rec, don, dfun="Manhattan", constr=FALSE, c.alg=NULL, ...)
{
if(is.null(dim(rec))) x.rec <- data.frame(rec)
else x.rec <- rec
if(is.null(dim(don))) x.don <- data.frame(don)
else x.don <- don
x.rec <- rec
x.don <- don
p <- ncol(rec)
nr <- nrow(x.rec)
nd <- nrow(x.don)
if(nr>nd) cat("Warning: the number of donors is less than the number of recipients", fill=TRUE)

r.lab <- rownames(x.rec)
if(is.null(r.lab)) r.lab <- 1:nr
if(is.null(r.lab)) r.lab <- paste("rec", 1:nr, sep="=")
d.lab <- rownames(x.don)
if(is.null(d.lab)) d.lab <- 1:nd
if(is.null(d.lab)) d.lab <- paste("don", 1:nd, sep="=")

# compute matrix of distances between obs. in x.don and obs. in x.rec
# function dist() in package "proxy" is used!

if(dfun=="Euclidean" || dfun=="euclidean" || dfun=="Manhattan" || dfun=="manhattan"){
require(proxy)
x.rec <- fact2dummy(x.rec, all=FALSE)
x.don <- fact2dummy(x.don, all=FALSE)
if(is.data.frame(x.rec)) x.rec <- fact2dummy(x.rec, all=FALSE)
if(is.data.frame(x.don)) x.don <- fact2dummy(x.don, all=FALSE)
mdist <- dist(x=x.rec, y=x.don, method=dfun, ...)
}
else if(dfun=="Mahalanobis" || dfun=="mahalanobis"){
if(is.data.frame(x.rec)) x.rec <- fact2dummy(x.rec, all=FALSE)
if(is.data.frame(x.don)) x.don <- fact2dummy(x.don, all=FALSE)
mdist <- mahalanobis.dist(data.x=x.rec, data.y=x.don, ...)
}
else if(dfun=="minimax" || dfun=="MiniMax" || dfun=="Minimax"){
x.rec <- fact2dummy(x.rec, all=FALSE)
x.don <- fact2dummy(x.don, all=FALSE)
if(is.data.frame(x.rec)) x.rec <- fact2dummy(x.rec, all=FALSE)
if(is.data.frame(x.don)) x.don <- fact2dummy(x.don, all=FALSE)
mdist <- maximum.dist(data.x=x.rec, data.y=x.don, ...)
}
else if(dfun=="exact" || dfun=="exact matching"){
@@ -97,7 +97,7 @@ NND.hd <- function (rec, don, dfun="Manhattan", constr=FALSE, c.alg=NULL, ...)
require(proxy)
mdist <- dist(x=x.rec, y=x.don, method=dfun, ...)
}
dimnames(mdist) <- list(r.lab, d.lab)
dimnames(mdist) <- list(r.lab, d.lab)

# UNCONSTRAINED nearest neighbour matching

@@ -174,7 +174,7 @@ NND.hd <- function (rec, don, dfun="Manhattan", constr=FALSE, c.alg=NULL, ...)
################ NND.hd ends here #############################

if(is.null(don.class)){
out <- NND.hd(rec=data.rec[,match.vars], don=data.don[,match.vars], dfun=dist.fun, constr=constrained, c.alg=constr.alg )
out <- NND.hd(rec=data.rec[,match.vars, drop=FALSE], don=data.don[,match.vars, drop=FALSE], dfun=dist.fun, constr=constrained, c.alg=constr.alg )
mmm <- out$mtc.ids
mmm <- substring(mmm, 5)
if(is.null(rownames(data.rec)) && is.null(rownames(data.don))) mtc.ids <- matrix(as.numeric(mmm), ncol=2, byrow=TRUE)
@@ -185,12 +185,12 @@ NND.hd <- function (rec, don, dfun="Manhattan", constr=FALSE, c.alg=NULL, ...)
}
else{
if(length(don.class)==1){
l.rec <- split(data.rec[ ,match.vars], f=data.rec[ ,don.class])
l.don <- split(data.don[ ,match.vars], f=data.don[ ,don.class])
l.rec <- split(data.rec[ ,match.vars, drop=FALSE], f=data.rec[ ,don.class])
l.don <- split(data.don[ ,match.vars, drop=FALSE], f=data.don[ ,don.class])
}
else{
l.rec <- split(data.rec[ ,match.vars], f=as.list(data.rec[ ,don.class]))
l.don <- split(data.don[ ,match.vars], f=as.list(data.don[ ,don.class]))
l.rec <- split(data.rec[ ,match.vars, drop=FALSE], f=as.list(data.rec[ ,don.class]))
l.don <- split(data.don[ ,match.vars, drop=FALSE], f=as.list(data.don[ ,don.class]))
}
if(length(l.rec)!=length(l.don)){
cat("The no. of donation classes in recipient data is not equal to the no. of donation classes in donor data", fill=TRUE)
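The hunks above add "Mahalanobis"/"mahalanobis" as an accepted value of dist.fun, routed to the new mahalanobis.dist() after categorical matching variables are recoded into dummies. A minimal usage sketch with toy data frames (not part of the commit; assumes StatMatch >= 1.0.2 is installed):

    library(StatMatch)
    set.seed(123)
    rec <- data.frame(x1 = rnorm(6),  x2 = rnorm(6))    # recipient records
    don <- data.frame(x1 = rnorm(10), x2 = rnorm(10))   # donor records
    out <- NND.hotdeck(data.rec = rec, data.don = don,
                       match.vars = c("x1", "x2"),
                       dist.fun = "Mahalanobis")
    out$mtc.ids    # matched recipient/donor identifiers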
23 changes: 14 additions & 9 deletions R/RANDwNND.hotdeck.R
@@ -25,7 +25,7 @@ function (data.rec, data.don, match.vars=NULL, don.class=NULL, dist.fun="Manhatt
row.names(data.don) <- d.lab
p <- length(match.vars)
if(!is.null(match.vars)){
if(dist.fun=="Euclidean" || dist.fun=="euclidean" ||dist.fun=="Manhattan" ||dist.fun=="manhattan" || dist.fun=="minimax" || dist.fun=="MiniMax" || dist.fun=="Minimax"){
if(dist.fun=="Euclidean" || dist.fun=="euclidean" || dist.fun=="Manhattan" || dist.fun=="manhattan" || dist.fun=="Mahalanobis" || dist.fun=="mahalanobis" || dist.fun=="minimax" || dist.fun=="MiniMax" || dist.fun=="Minimax"){
cat("Warning: The ", dist.fun, " distance is being used", fill=TRUE)
cat("All the categorical matching variables in rec and don data.frames, if present, are recoded into dummies", fill=TRUE)
}
@@ -37,18 +37,18 @@ function (data.rec, data.don, match.vars=NULL, don.class=NULL, dist.fun="Manhatt
################
RANDwNND.hd <- function (rec, don, dfun="Manhattan", cut.don="rot", k=NULL, w.don=NULL, ...)
{
if(is.null(dim(rec))) x.rec <- data.frame(rec)
else x.rec <- rec
if(is.null(dim(don))) x.rec <- data.frame(don)
else x.don <- don
x.rec <- rec
x.don <- don
p <- ncol(rec)
nr <- nrow(rec)
nd <- nrow(don)
if(nr>nd) cat("Warning: the number of donors is less than the number of recipients", fill=TRUE)
r.lab <- rownames(rec)
if(is.null(r.lab)) r.lab <- 1:nr
d.lab <- rownames(don)
if(is.null(d.lab)) d.lab <- 1:nd

r.lab <- rownames(x.rec)
if(is.null(r.lab)) r.lab <- paste("rec", 1:nr, sep="=")
d.lab <- rownames(x.don)
if(is.null(d.lab)) d.lab <- paste("don", 1:nd, sep="=")

if(is.null(w.don)) ww <- rep(1,nd)
else ww <- w.don
# compute matrix of distances between obs. in x.don and obs. in x.rec
@@ -59,6 +59,11 @@ RANDwNND.hd <- function (rec, don, dfun="Manhattan", cut.don="rot", k=NULL, w.do
x.don <- fact2dummy(x.don, all=FALSE)
mdist <- dist(x=x.rec, y=x.don, method=dfun, ...)
}
else if(dfun=="Mahalanobis" || dfun=="mahalanobis"){
if(is.data.frame(x.rec)) x.rec <- fact2dummy(x.rec, all=FALSE)
if(is.data.frame(x.don)) x.don <- fact2dummy(x.don, all=FALSE)
mdist <- mahalanobis.dist(data.x=x.rec, data.y=x.don, ...)
}
else if(dfun=="minimax" || dfun=="MiniMax" || dfun=="Minimax"){
x.rec <- fact2dummy(x.rec, all=FALSE)
x.don <- fact2dummy(x.don, all=FALSE)
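The same distance option is wired into the random weighted hot deck. A sketch analogous to the one above, relying on the default donor-selection rule (cut.don="rot" in the inner helper's signature):

    library(StatMatch)
    set.seed(123)
    rec <- data.frame(x1 = rnorm(6),  x2 = rnorm(6))
    don <- data.frame(x1 = rnorm(10), x2 = rnorm(10))
    out <- RANDwNND.hotdeck(data.rec = rec, data.don = don,
                            match.vars = c("x1", "x2"),
                            dist.fun = "Mahalanobis")   # donor drawn at random among the closest
    out$mtc.ids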
20 changes: 20 additions & 0 deletions R/mahalanobis.dist.R
@@ -0,0 +1,20 @@
mahalanobis.dist <- function(data.x, data.y=NULL, vc=NULL){

xx <- as.matrix(data.x)
if(is.null(data.y)) yy <- as.matrix(data.x)   # no second data set: distances within data.x
else yy <- as.matrix(data.y)

# when no covariance matrix is supplied, estimate it from the (pooled) data
if(is.null(vc)){
if(is.null(data.y)) vc <- var(xx)
else vc <- var(rbind(xx,yy))
}

ny <- nrow(yy)
md <- matrix(0,nrow(xx), ny)
for(i in 1:ny){
md[,i] <- mahalanobis(xx, yy[i,], cov=vc)   # squared distance of each row of xx from yy[i,]
}
if(is.null(data.y)) dimnames(md) <- list(rownames(data.x), rownames(data.x))
else dimnames(md) <- list(rownames(data.x), rownames(data.y))
sqrt(md)   # mahalanobis() returns squared distances; return proper distances
}
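A quick sketch of calling the new function directly (toy matrices; when vc is NULL the covariance is estimated from the pooled rows, as in the body above):

    set.seed(1)
    xx <- matrix(rnorm(10), ncol = 2)   # 5 units in the "recipient" role
    yy <- matrix(rnorm(8),  ncol = 2)   # 4 units in the "donor" role
    d  <- mahalanobis.dist(data.x = xx, data.y = yy)
    dim(d)    # 5 x 4 matrix of pairwise Mahalanobis distances

Estimating vc on rbind(xx, yy) keeps the metric identical for both data sets, which is what matching requires; an external vc (e.g. a robust estimate) can be passed instead.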
File renamed without changes.
24 changes: 14 additions & 10 deletions R/mixed.mtc.R
@@ -107,21 +107,25 @@ function (data.rec, data.don, match.vars, y.rec, z.don, method="ML", rho.yz=0, m

# estimation of S.yz
# step.1 checks if the input value for Cor(Y,Z), rho.yz, is admissible
c.xy <- c(cor(x.A, y.A))
c.xz <- c(cor(x.B, z.B))
if(p.x==1){
c.xy <- c(cor(x.A, y.A))
c.xz <- c(cor(x.B, z.B))
low.c <- c.xy*c.xz - sqrt( (1-c.xy^2)*(1-c.xz^2) )
up.c <- c.xy*c.xz + sqrt( (1-c.xy^2)*(1-c.xz^2) )
}
else{
ic.x <- solve(cov2cor(S.x))
mc1 <- matrix( rep(c.xy, p.x), ncol=p.x )
mc2 <- matrix( rep(c.xz, p.x), ncol=p.x, byrow=TRUE)
cc <- mc1 * ic.x * mc2
dd1 <- 1 - sum( mc1 * ic.x * t(mc1) )
dd2 <- 1 - sum( t(mc2) * ic.x * mc2 )
low.c <- sum(cc) - sqrt(dd1*dd2)
up.c <- sum(cc) + sqrt(dd1*dd2)
eps <- 0.0001
cc <- cov2cor(vc)
rr <- seq(-1, 1, eps)
k <- length(rr)
vdet <- rep(0,k)
for(i in 1:k){
cc[pos.z, pos.y] <- cc[pos.y, pos.z] <- rr[i]
vdet[i] <- det(cc)
}
cc.yz <- rr[vdet>=0]
low.c <- min(cc.yz)
up.c <- max(cc.yz)
}
# step.2 checks whether the input value rho.yz for Cor(Y,Z) is admissible; otherwise the closest admissible value is taken.
cat("input value for rho.yz is", rho.yz, fill=TRUE)
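The rewritten multivariate branch replaces the closed-form bounds with a grid search: a value of rho.yz is admissible exactly when the full correlation matrix of (X, Y, Z) remains positive semidefinite, i.e. det >= 0 along the grid. A standalone sketch of the idea, assuming a hypothetical correlation matrix in the order (Y, X, Z) with Cor(Y,Z) unknown:

    cc <- matrix(c(1.0, 0.5,  NA,
                   0.5, 1.0, 0.7,
                    NA, 0.7, 1.0), nrow = 3)   # NA marks the unknown Cor(Y,Z)
    rr <- seq(-1, 1, by = 0.0001)
    vdet <- sapply(rr, function(r){ cc[1,3] <- cc[3,1] <- r; det(cc) })
    range(rr[vdet >= 0])   # admissible interval, here about [-0.2685, 0.9685]

With a single X variable this reproduces the closed form of the p.x==1 branch, c.xy*c.xz +/- sqrt((1-c.xy^2)*(1-c.xz^2)): here 0.5*0.7 +/- sqrt(0.75*0.51).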
6 changes: 3 additions & 3 deletions man/Fbwidths.by.x.Rd
@@ -4,7 +4,7 @@
\title{Computes the Frechet bounds of cells in a contingency table by considering all the possible subsets of the common variables.}

\description{
This function permits to compute the bounds for cell probabilities in the contingency table Y vs. Z starting from the marginal tables (\bold{X} vs. Y), (\bold{X} vs. Z) and the joint distribution of the \bold{X} variables, by considering all the possible subsets of the \bold{X} variables. In this manner it is possible to identify which subset of the \bold{X} variables produces the major reduction of the uncertainty measured in terms of the bounds width.
This function permits computing the bounds for cell probabilities in the contingency table Y vs. Z starting from the marginal tables (\bold{X} vs. Y), (\bold{X} vs. Z) and the joint distribution of the \bold{X} variables, by considering all the possible subsets of the \bold{X} variables. In this manner it is possible to identify which subset of the \bold{X} variables produces the largest reduction of the uncertainty, measured in terms of the width of the bounds.
}

\usage{
Expand Down Expand Up @@ -36,7 +36,7 @@ A single categorical Z variable is allowed. One or more categorical variables c

\details{
This function permits computing the Frechet bounds for the frequencies in the contingency table of Y vs. Z, starting from the conditional distributions P(Y|\bold{X}) and P(Z|\bold{X}) (for details see \cr
\code{\link[StatMatch]{Frechet.bounds.cat}}), by considering all the possible subsets of the \bold{X}. In this manner it is possible to identify the subset of the \bold{X} variables, with highest association with both Y and Z, that permits to reduce the uncertainty concerning the distribution of Y vs. Z. The reduction of the uncertainty is measured in terms of the average of the widths of the bounds estimated for the cells in the table of Y vs. Z:
\code{\link[StatMatch]{Frechet.bounds.cat}}), by considering all the possible subsets of the \bold{X} variables. In this manner it is possible to identify the subset of the \bold{X} variables, with the highest association with both Y and Z, that permits reducing the uncertainty concerning the distribution of Y vs. Z. The reduction of the uncertainty is measured in terms of the average of the widths of the bounds for the cells in the table of Y vs. Z:

\deqn{ \bar{d} = \frac{1}{J \times K} \sum_{j,k} ( p^{up}_{j,k} - p^{low}_{j,k} )}{d=(1/(J*K))*sum_(j,k)(up(p(y=j,z=k))-low(p(y=j,z=k)))}
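The measure is simply the mean width over the J x K cells; a one-line sketch, assuming hypothetical matrices low and up of estimated bounds:

    low <- matrix(c(0.00, 0.10, 0.05, 0.20), nrow = 2)   # hypothetical lower bounds
    up  <- matrix(c(0.30, 0.45, 0.25, 0.60), nrow = 2)   # hypothetical upper bounds
    mean(up - low)    # the average width d-bar above; 0.3125 here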

@@ -65,7 +65,7 @@ D'Orazio, M., Di Zio, M. and Scanu, M. (2006). \emph{Statistical Matching: Theor
}
\seealso{
\code{\link[StatMatch]{Frechet.bounds.cat}}
\code{\link[StatMatch]{Frechet.bounds.cat}}, \code{\link[StatMatch]{harmonize.x}}
}
\examples{
12 changes: 6 additions & 6 deletions man/Frechet.bounds.cat.Rd
@@ -22,7 +22,7 @@ A \R table crossing the \bold{X} variables. This table must be obtained by usin
A \R table of \bold{X} vs. Y variable. This table must be obtained by using the function \code{\link[stats]{xtabs}} or \code{\link[base]{table}}, e.g. \cr
\code{table.xy <- xtabs(~x1+x2+x3+y, data=data.A)}.

A single categorical Y variables is allowed. One or more categorical variables can be considered as \bold{X} variables (common variables). Obviously, the same \bold{X} variables in \code{tab.x} must be available in \code{tab.xy}. Moreover, it is assumed that the joint distribution of the \bold{X} variables computed from \code{tab.xy} is equal to \code{tab.x}; a warning appears if this is not true.
A single categorical Y variable is allowed. One or more categorical variables can be considered as \bold{X} variables (common variables). Obviously, the same \bold{X} variables in \code{tab.x} must be available in \code{tab.xy}. Moreover, it is assumed that the joint distribution of the \bold{X} variables computed from \code{tab.xy} is equal to \code{tab.x}; a warning appears if this is not true.
}

\item{tab.xz}{
@@ -39,7 +39,7 @@ A string specifying the data structure of the output. When \cr \code{print.f="ta
}

\details{
This function permits to compute the Frechet bounds for the relative frequencies in the contingency table of Y vs.Z, starting from the conditional distributions P(Y|X) and P(Z|X). The bounds for the relative frequencies \eqn{p_{j,k}}{p(y=j,z=k)} in the table Y vs. Z are:
This function permits computing the Frechet bounds for the relative frequencies in the contingency table of Y vs. Z, starting from the distributions P(Y|X), P(Z|X) and P(X). The bounds for the relative frequencies \eqn{p_{j,k}}{p(y=j,z=k)} in the table Y vs. Z are:

\deqn{ p^{low}_{YZ}(j,k) = \sum_{i} p_X(i)\max (0; p_{Y|X}(j|i) + p_{Z|X}(k|i)-1 ) }{p(y=j,z=k) >= sum_i(p(x=i) * max(0; p(y=j|x=i) + p(z=k|x=i) - 1) )}
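A small numeric sketch of this lower bound, assuming hypothetical distributions with two categories for each of X, Y and Z:

    p.x   <- c(0.5, 0.5)                         # P(X=i)
    p.y.x <- rbind(c(0.2, 0.8), c(0.6, 0.4))     # row i holds P(Y=j | X=i)
    p.z.x <- rbind(c(0.7, 0.3), c(0.1, 0.9))     # row i holds P(Z=k | X=i)
    low.cx <- matrix(0, 2, 2)
    for(j in 1:2) for(k in 1:2)
      low.cx[j, k] <- sum(p.x * pmax(0, p.y.x[, j] + p.z.x[, k] - 1))
    low.cx    # p^low_YZ(j,k); the upper bound is computed analogously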

@@ -50,9 +50,9 @@ The relative frequencies \eqn{p_X(i)=n_i/n}{p(x=i)=n_i/n \bullet} are computed f
the relative frequencies \eqn{p_{Y|X}(j|i)=n_{ij}/n_{i \bullet}}{p(y=j|x=i)=n_ij/n_i.} are computed from the \code{tab.xy}, \cr
finally, \eqn{p_{Z|X}(k|i)=n_{ik}/n_{i \bullet}}{p(z=k|x=i)=n_ik/n_i.} are derived from \code{tab.xz}.

The underlying assumption is that the marginal distribution of the \bold{X} variables to be the same in all the input tables: \code{tab.x}, \code{tab.xy} and \code{tab.xz}. If this is not true a warning message will appear.
It is assumed that the marginal distribution of the \bold{X} variables is the same in all the input tables: \code{tab.x}, \code{tab.xy} and \code{tab.xz}. If this is not true a warning message will appear.

Note that the cells bounds for the the relative frequencies in the contingency table of Y vs.Z are also computed without considering the \bold{X} variables, in the following manner:
Note that the bounds for the relative frequencies in the contingency table of Y vs. Z are also computed without considering the \bold{X} variables:

\deqn{ \max\{0; p_{Y}(j) + p_{Z}(k)-1\} \leq p_{YZ}(j,k) \leq \min \{ p_{Y}(j); p_{Z}(k)\}}{max(0;p(y=j)+p(z=k)-1) <= p(y=j,z=k) <= min(p(y=j);p(z=k))}
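These unconditional bounds need only the two marginal distributions; a minimal sketch with toy values:

    p.y <- c(0.3, 0.7)    # marginal distribution of Y
    p.z <- c(0.4, 0.6)    # marginal distribution of Z
    low <- outer(p.y, p.z, function(a, b) pmax(0, a + b - 1))
    up  <- outer(p.y, p.z, pmin)
    low; up    # J x K matrices bounding p(y=j, z=k)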

@@ -63,7 +63,7 @@ Finally, the contingency table of Y vs. Z estimated under the Conditional Indepe
}

\value{
When \code{print.f="tables"} (default) a list with the following estimated tables of Y vs. Z:
When \code{print.f="tables"} (default) a list with the following tables:

\item{low.u}{
The estimated lower bounds for the relative frequencies in the table Y vs. Z without conditioning on the \bold{X} variables.
@@ -103,7 +103,7 @@ D'Orazio, M., Di Zio, M. and Scanu, M. (2006). \emph{Statistical Matching: Theor
}
\seealso{
\code{\link[StatMatch]{Fbwidths.by.x}}
\code{\link[StatMatch]{Fbwidths.by.x}}, \code{\link[StatMatch]{harmonize.x}}
}
\examples{
