
Commit

version 1.0.2
marcellodo authored and cran-robot committed Jul 4, 2011
1 parent f59e93f commit 05b054b
Showing 20 changed files with 234 additions and 133 deletions.
14 changes: 7 additions & 7 deletions DESCRIPTION
@@ -1,8 +1,8 @@
Package: StatMatch
Type: Package
Title: Statistical Matching
Version: 1.0.1
Date: 2011-02-28
Version: 1.0.2
Date: 2011-06-22
Author: Marcello D'Orazio
Maintainer: Marcello D'Orazio <madorazi@istat.it>
Depends: R (>= 2.7.0), proxy, lpSolve, survey
@@ -11,9 +11,9 @@ Description: This package provides some R functions to perform
statistical matching between two data sources sharing a number
of common variables. Some functions can also be used to impute
missing values in data sets through hot deck imputation
methods. Methods to deal with data from complex sample surveys
are available too.
License: GPL (>= 2)
LazyLoad: no
methods. Methods to perform statistical matching when dealing
with data from complex sample surveys are available too.
License: EUPL
Packaged: 2011-07-04 06:33:30 UTC; madorazi
Repository: CRAN
Date/Publication: 2011-03-12 18:08:38
Date/Publication: 2011-07-04 08:23:03
8 changes: 8 additions & 0 deletions NEWS
@@ -1,3 +1,11 @@
1.0.2 new function mahalanobis.dist() to compute the Mahalanobis distance

fixed a bug in mixed.mtc() when computing the range of admissible values
for rho_yz

fixed a bug in NND.hotdeck() and RANDwNND.hotdeck() when
managing the row.names

1.0.1 new functions harmonize.x() and comb.samples() to perform statistical
matching when dealing with complex sample survey data via
weight calibration.
40 changes: 20 additions & 20 deletions R/NND.hotdeck.R
@@ -1,14 +1,12 @@
`NND.hotdeck` <-
function (data.rec, data.don, match.vars, don.class=NULL, dist.fun="Manhattan", constrained=FALSE, constr.alg=NULL, ...)
{

if(constrained && constr.alg=="relax"){
require(optmatch)
}
if(constrained && (constr.alg=="lpSolve" || constr.alg=="lpsolve")){
require(lpSolve)
}

p <- length(match.vars)
if(!is.null(dim(data.rec))){
nr <- nrow(data.rec)
@@ -35,7 +33,7 @@ function (data.rec, data.don, match.vars, don.class=NULL, dist.fun="Manhattan",
else d.lab <- paste("don", d.lab, sep="=")
row.names(data.don) <- d.lab
if(!is.null(match.vars)){
if(dist.fun=="Euclidean" || dist.fun=="euclidean" ||dist.fun=="Manhattan" ||dist.fun=="manhattan" || dist.fun=="minimax" || dist.fun=="MiniMax" || dist.fun=="Minimax"){
if(dist.fun=="Euclidean" || dist.fun=="euclidean" ||dist.fun=="Manhattan" || dist.fun=="Mahalanobis" || dist.fun=="mahalanobis" || dist.fun=="manhattan" || dist.fun=="minimax" || dist.fun=="MiniMax" || dist.fun=="Minimax"){
cat("Warning: The ", dist.fun, " distance is being used", fill=TRUE)
cat("All the categorical matching variables in rec and don data.frames, if present are recoded into dummies", fill=TRUE)
}
@@ -48,32 +46,34 @@ function (data.rec, data.don, match.vars, don.class=NULL, dist.fun="Manhattan",
########################
NND.hd <- function (rec, don, dfun="Manhattan", constr=FALSE, c.alg=NULL, ...)
{
if(is.null(dim(rec))) x.rec <- data.frame(rec)
else x.rec <- rec
if(is.null(dim(don))) x.don <- data.frame(don)
else x.don <- don
x.rec <- rec
x.don <- don
p <- ncol(rec)
nr <- nrow(x.rec)
nd <- nrow(x.don)
if(nr>nd) cat("Warning: the number of donors is less than the number of recipients", fill=TRUE)

r.lab <- rownames(x.rec)
if(is.null(r.lab)) r.lab <- 1:nr
if(is.null(r.lab)) r.lab <- paste("rec", 1:nr, sep="=")
d.lab <- rownames(x.don)
if(is.null(d.lab)) d.lab <- 1:nd
if(is.null(d.lab)) d.lab <- paste("don", 1:nd, sep="=")

# compute matrix of distances between obs. in x.don and obs. in x.rec
# function dist() in package "proxy" is used!

if(dfun=="Euclidean" || dfun=="euclidean" || dfun=="Manhattan" || dfun=="manhattan"){
require(proxy)
x.rec <- fact2dummy(x.rec, all=FALSE)
x.don <- fact2dummy(x.don, all=FALSE)
if(is.data.frame(x.rec)) x.rec <- fact2dummy(x.rec, all=FALSE)
if(is.data.frame(x.don)) x.don <- fact2dummy(x.don, all=FALSE)
mdist <- dist(x=x.rec, y=x.don, method=dfun, ...)
}
else if(dfun=="Mahalanobis" || dfun=="mahalanobis"){
if(is.data.frame(x.rec)) x.rec <- fact2dummy(x.rec, all=FALSE)
if(is.data.frame(x.don)) x.don <- fact2dummy(x.don, all=FALSE)
mdist <- mahalanobis.dist(data.x=x.rec, data.y=x.don, ...)
}
else if(dfun=="minimax" || dfun=="MiniMax" || dfun=="Minimax"){
x.rec <- fact2dummy(x.rec, all=FALSE)
x.don <- fact2dummy(x.don, all=FALSE)
if(is.data.frame(x.rec)) x.rec <- fact2dummy(x.rec, all=FALSE)
if(is.data.frame(x.don)) x.don <- fact2dummy(x.don, all=FALSE)
mdist <- maximum.dist(data.x=x.rec, data.y=x.don, ...)
}
else if(dfun=="exact" || dfun=="exact matching"){
@@ -97,7 +97,7 @@ NND.hd <- function (rec, don, dfun="Manhattan", constr=FALSE, c.alg=NULL, ...)
require(proxy)
mdist <- dist(x=x.rec, y=x.don, method=dfun, ...)
}
dimnames(mdist) <- list(r.lab, d.lab)
dimnames(mdist) <- list(r.lab, d.lab)

# UNCONSTRAINED nearest neighbour matching

@@ -174,7 +174,7 @@ NND.hd <- function (rec, don, dfun="Manhattan", constr=FALSE, c.alg=NULL, ...)
################ NND.hd ends here #############################

if(is.null(don.class)){
out <- NND.hd(rec=data.rec[,match.vars], don=data.don[,match.vars], dfun=dist.fun, constr=constrained, c.alg=constr.alg )
out <- NND.hd(rec=data.rec[,match.vars, drop=FALSE], don=data.don[,match.vars, drop=FALSE], dfun=dist.fun, constr=constrained, c.alg=constr.alg )
mmm <- out$mtc.ids
mmm <- substring(mmm, 5)
if(is.null(rownames(data.rec)) && is.null(rownames(data.don))) mtc.ids <- matrix(as.numeric(mmm), ncol=2, byrow=TRUE)
@@ -185,12 +185,12 @@ NND.hd <- function (rec, don, dfun="Manhattan", constr=FALSE, c.alg=NULL, ...)
}
else{
if(length(don.class)==1){
l.rec <- split(data.rec[ ,match.vars], f=data.rec[ ,don.class])
l.don <- split(data.don[ ,match.vars], f=data.don[ ,don.class])
l.rec <- split(data.rec[ ,match.vars, drop=FALSE], f=data.rec[ ,don.class])
l.don <- split(data.don[ ,match.vars, drop=FALSE], f=data.don[ ,don.class])
}
else{
l.rec <- split(data.rec[ ,match.vars], f=as.list(data.rec[ ,don.class]))
l.don <- split(data.don[ ,match.vars], f=as.list(data.don[ ,don.class]))
l.rec <- split(data.rec[ ,match.vars, drop=FALSE], f=as.list(data.rec[ ,don.class]))
l.don <- split(data.don[ ,match.vars, drop=FALSE], f=as.list(data.don[ ,don.class]))
}
if(length(l.rec)!=length(l.don)){
cat("The no. of donation classes in recipient data is not equal to the no. of donation classes in donor data", fill=TRUE)
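The hunks above add "Mahalanobis"/"mahalanobis" as an accepted value of dist.fun, routed to the new mahalanobis.dist() after categorical matching variables are recoded into dummies. A minimal usage sketch with toy data frames (not part of the commit; assumes StatMatch >= 1.0.2 is installed):

    library(StatMatch)
    set.seed(123)
    rec <- data.frame(x1 = rnorm(6),  x2 = rnorm(6))    # recipient records
    don <- data.frame(x1 = rnorm(10), x2 = rnorm(10))   # donor records
    out <- NND.hotdeck(data.rec = rec, data.don = don,
                       match.vars = c("x1", "x2"),
                       dist.fun = "Mahalanobis")
    out$mtc.ids    # matched recipient/donor identifiers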
23 changes: 14 additions & 9 deletions R/RANDwNND.hotdeck.R
@@ -25,7 +25,7 @@ function (data.rec, data.don, match.vars=NULL, don.class=NULL, dist.fun="Manhatt
row.names(data.don) <- d.lab
p <- length(match.vars)
if(!is.null(match.vars)){
if(dist.fun=="Euclidean" || dist.fun=="euclidean" ||dist.fun=="Manhattan" ||dist.fun=="manhattan" || dist.fun=="minimax" || dist.fun=="MiniMax" || dist.fun=="Minimax"){
if(dist.fun=="Euclidean" || dist.fun=="euclidean" || dist.fun=="Manhattan" || dist.fun=="manhattan" || dist.fun=="Mahalanobis" || dist.fun=="mahalanobis" || dist.fun=="minimax" || dist.fun=="MiniMax" || dist.fun=="Minimax"){
cat("Warning: The ", dist.fun, " distance is being used", fill=TRUE)
cat("All the categorical matching variables in rec and don data.frames, if present, are recoded into dummies", fill=TRUE)
}
@@ -37,18 +37,18 @@ function (data.rec, data.don, match.vars=NULL, don.class=NULL, dist.fun="Manhatt
################
RANDwNND.hd <- function (rec, don, dfun="Manhattan", cut.don="rot", k=NULL, w.don=NULL, ...)
{
if(is.null(dim(rec))) x.rec <- data.frame(rec)
else x.rec <- rec
if(is.null(dim(don))) x.rec <- data.frame(don)
else x.don <- don
x.rec <- rec
x.don <- don
p <- ncol(rec)
nr <- nrow(rec)
nd <- nrow(don)
if(nr>nd) cat("Warning: the number of donors is less than the number of recipients", fill=TRUE)
r.lab <- rownames(rec)
if(is.null(r.lab)) r.lab <- 1:nr
d.lab <- rownames(don)
if(is.null(d.lab)) d.lab <- 1:nd

r.lab <- rownames(x.rec)
if(is.null(r.lab)) r.lab <- paste("rec", 1:nr, sep="=")
d.lab <- rownames(x.don)
if(is.null(d.lab)) d.lab <- paste("don", 1:nd, sep="=")

if(is.null(w.don)) ww <- rep(1,nd)
else ww <- w.don
# compute matrix of distances between obs. in x.don and obs. in x.rec
@@ -59,6 +59,11 @@ RANDwNND.hd <- function (rec, don, dfun="Manhattan", cut.don="rot", k=NULL, w.do
x.don <- fact2dummy(x.don, all=FALSE)
mdist <- dist(x=x.rec, y=x.don, method=dfun, ...)
}
else if(dfun=="Mahalanobis" || dfun=="mahalanobis"){
if(is.data.frame(x.rec)) x.rec <- fact2dummy(x.rec, all=FALSE)
if(is.data.frame(x.don)) x.don <- fact2dummy(x.don, all=FALSE)
mdist <- mahalanobis.dist(data.x=x.rec, data.y=x.don, ...)
}
else if(dfun=="minimax" || dfun=="MiniMax" || dfun=="Minimax"){
x.rec <- fact2dummy(x.rec, all=FALSE)
x.don <- fact2dummy(x.don, all=FALSE)
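The same distance option is wired into the random weighted hot deck. A sketch analogous to the one above, relying on the default donor-selection rule (cut.don="rot" in the inner helper's signature):

    library(StatMatch)
    set.seed(123)
    rec <- data.frame(x1 = rnorm(6),  x2 = rnorm(6))
    don <- data.frame(x1 = rnorm(10), x2 = rnorm(10))
    out <- RANDwNND.hotdeck(data.rec = rec, data.don = don,
                            match.vars = c("x1", "x2"),
                            dist.fun = "Mahalanobis")   # donor drawn at random among the closest
    out$mtc.ids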
20 changes: 20 additions & 0 deletions R/mahalanobis.dist.R
@@ -0,0 +1,20 @@
mahalanobis.dist <- function(data.x, data.y=NULL, vc=NULL){

xx <- as.matrix(data.x)
if(is.null(data.y)) yy <- as.matrix(data.x)   # no second data set: distances within data.x
else yy <- as.matrix(data.y)

# when no covariance matrix is supplied, estimate it from the (pooled) data
if(is.null(vc)){
if(is.null(data.y)) vc <- var(xx)
else vc <- var(rbind(xx,yy))
}

ny <- nrow(yy)
md <- matrix(0,nrow(xx), ny)
for(i in 1:ny){
md[,i] <- mahalanobis(xx, yy[i,], cov=vc)   # squared distance of each row of xx from yy[i,]
}
if(is.null(data.y)) dimnames(md) <- list(rownames(data.x), rownames(data.x))
else dimnames(md) <- list(rownames(data.x), rownames(data.y))
sqrt(md)   # mahalanobis() returns squared distances; return proper distances
}
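A quick sketch of calling the new function directly (toy matrices; when vc is NULL the covariance is estimated from the pooled rows, as in the body above):

    set.seed(1)
    xx <- matrix(rnorm(10), ncol = 2)   # 5 units in the "recipient" role
    yy <- matrix(rnorm(8),  ncol = 2)   # 4 units in the "donor" role
    d  <- mahalanobis.dist(data.x = xx, data.y = yy)
    dim(d)    # 5 x 4 matrix of pairwise Mahalanobis distances

Estimating vc on rbind(xx, yy) keeps the metric identical for both data sets, which is what matching requires; an external vc (e.g. a robust estimate) can be passed instead.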
File renamed without changes.
24 changes: 14 additions & 10 deletions R/mixed.mtc.R
@@ -107,21 +107,25 @@ function (data.rec, data.don, match.vars, y.rec, z.don, method="ML", rho.yz=0, m

# estimation of S.yz
# step.1 checks if the input value for Cor(Y,Z), rho.yz, is admissible
c.xy <- c(cor(x.A, y.A))
c.xz <- c(cor(x.B, z.B))
if(p.x==1){
c.xy <- c(cor(x.A, y.A))
c.xz <- c(cor(x.B, z.B))
low.c <- c.xy*c.xz - sqrt( (1-c.xy^2)*(1-c.xz^2) )
up.c <- c.xy*c.xz + sqrt( (1-c.xy^2)*(1-c.xz^2) )
}
else{
ic.x <- solve(cov2cor(S.x))
mc1 <- matrix( rep(c.xy, p.x), ncol=p.x )
mc2 <- matrix( rep(c.xz, p.x), ncol=p.x, byrow=TRUE)
cc <- mc1 * ic.x * mc2
dd1 <- 1 - sum( mc1 * ic.x * t(mc1) )
dd2 <- 1 - sum( t(mc2) * ic.x * mc2 )
low.c <- sum(cc) - sqrt(dd1*dd2)
up.c <- sum(cc) + sqrt(dd1*dd2)
eps <- 0.0001
cc <- cov2cor(vc)
rr <- seq(-1, 1, eps)
k <- length(rr)
vdet <- rep(0,k)
for(i in 1:k){
cc[pos.z, pos.y] <- cc[pos.y, pos.z] <- rr[i]
vdet[i] <- det(cc)
}
cc.yz <- rr[vdet>=0]
low.c <- min(cc.yz)
up.c <- max(cc.yz)
}
# step.2 checks whether the input value rho.yz for Cor(Y,Z) is admissible; otherwise the closest admissible value is taken.
cat("input value for rho.yz is", rho.yz, fill=TRUE)
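The rewritten multivariate branch replaces the closed-form bounds with a grid search: a value of rho.yz is admissible exactly when the full correlation matrix of (X, Y, Z) remains positive semidefinite, i.e. det >= 0 along the grid. A standalone sketch of the idea, assuming a hypothetical correlation matrix in the order (Y, X, Z) with Cor(Y,Z) unknown:

    cc <- matrix(c(1.0, 0.5,  NA,
                   0.5, 1.0, 0.7,
                    NA, 0.7, 1.0), nrow = 3)   # NA marks the unknown Cor(Y,Z)
    rr <- seq(-1, 1, by = 0.0001)
    vdet <- sapply(rr, function(r){ cc[1,3] <- cc[3,1] <- r; det(cc) })
    range(rr[vdet >= 0])   # admissible interval, here about [-0.2685, 0.9685]

With a single X variable this reproduces the closed form of the p.x==1 branch, c.xy*c.xz +/- sqrt((1-c.xy^2)*(1-c.xz^2)): here 0.5*0.7 +/- sqrt(0.75*0.51).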
6 changes: 3 additions & 3 deletions man/Fbwidths.by.x.Rd
@@ -4,7 +4,7 @@
\title{Computes the Frechet bounds of cells in a contingency table by considering all the possible subsets of the common variables.}

\description{
This function permits to compute the bounds for cell probabilities in the contingency table Y vs. Z starting from the marginal tables (\bold{X} vs. Y), (\bold{X} vs. Z) and the joint distribution of the \bold{X} variables, by considering all the possible subsets of the \bold{X} variables. In this manner it is possible to identify which subset of the \bold{X} variables produces the major reduction of the uncertainty measured in terms of the bounds width.
This function permits computing the bounds for cell probabilities in the contingency table Y vs. Z starting from the marginal tables (\bold{X} vs. Y), (\bold{X} vs. Z) and the joint distribution of the \bold{X} variables, by considering all the possible subsets of the \bold{X} variables. In this manner it is possible to identify which subset of the \bold{X} variables produces the largest reduction of the uncertainty, measured in terms of the width of the bounds.
}

\usage{
Expand Down Expand Up @@ -36,7 +36,7 @@ A single categorical Z variable is allowed. One or more categorical variables c

\details{
This function permits computing the Frechet bounds for the frequencies in the contingency table of Y vs. Z, starting from the conditional distributions P(Y|\bold{X}) and P(Z|\bold{X}) (for details see \cr
\code{\link[StatMatch]{Frechet.bounds.cat}}), by considering all the possible subsets of the \bold{X}. In this manner it is possible to identify the subset of the \bold{X} variables, with highest association with both Y and Z, that permits to reduce the uncertainty concerning the distribution of Y vs. Z. The reduction of the uncertainty is measured in terms of the average of the widths of the bounds estimated for the cells in the table of Y vs. Z:
\code{\link[StatMatch]{Frechet.bounds.cat}}), by considering all the possible subsets of the \bold{X} variables. In this manner it is possible to identify the subset of the \bold{X} variables, with the highest association with both Y and Z, that permits reducing the uncertainty concerning the distribution of Y vs. Z. The reduction of the uncertainty is measured in terms of the average of the widths of the bounds for the cells in the table of Y vs. Z:

\deqn{ \bar{d} = \frac{1}{J \times K} \sum_{j,k} ( p^{up}_{j,k} - p^{low}_{j,k} )}{d=(1/(J*K))*sum_(j,k)(up(p(y=j,z=k))-low(p(y=j,z=k)))}
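The measure is simply the mean width over the J x K cells; a one-line sketch, assuming hypothetical matrices low and up of estimated bounds:

    low <- matrix(c(0.00, 0.10, 0.05, 0.20), nrow = 2)   # hypothetical lower bounds
    up  <- matrix(c(0.30, 0.45, 0.25, 0.60), nrow = 2)   # hypothetical upper bounds
    mean(up - low)    # the average width d-bar above; 0.3125 here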

@@ -65,7 +65,7 @@ D'Orazio, M., Di Zio, M. and Scanu, M. (2006). \emph{Statistical Matching: Theor
}
\seealso{
\code{\link[StatMatch]{Frechet.bounds.cat}}
\code{\link[StatMatch]{Frechet.bounds.cat}}, \code{\link[StatMatch]{harmonize.x}}
}
\examples{
12 changes: 6 additions & 6 deletions man/Frechet.bounds.cat.Rd
@@ -22,7 +22,7 @@ A \R table crossing the \bold{X} variables. This table must be obtained by usin
A \R table of \bold{X} vs. Y variable. This table must be obtained by using the function \code{\link[stats]{xtabs}} or \code{\link[base]{table}}, e.g. \cr
\code{table.xy <- xtabs(~x1+x2+x3+y, data=data.A)}.

A single categorical Y variables is allowed. One or more categorical variables can be considered as \bold{X} variables (common variables). Obviously, the same \bold{X} variables in \code{tab.x} must be available in \code{tab.xy}. Moreover, it is assumed that the joint distribution of the \bold{X} variables computed from \code{tab.xy} is equal to \code{tab.x}; a warning appears if this is not true.
A single categorical Y variable is allowed. One or more categorical variables can be considered as \bold{X} variables (common variables). Obviously, the same \bold{X} variables in \code{tab.x} must be available in \code{tab.xy}. Moreover, it is assumed that the joint distribution of the \bold{X} variables computed from \code{tab.xy} is equal to \code{tab.x}; a warning appears if this is not true.
}

\item{tab.xz}{
@@ -39,7 +39,7 @@ A string specifying the data structure of the output. When \cr \code{print.f="ta
}

\details{
This function permits to compute the Frechet bounds for the relative frequencies in the contingency table of Y vs.Z, starting from the conditional distributions P(Y|X) and P(Z|X). The bounds for the relative frequencies \eqn{p_{j,k}}{p(y=j,z=k)} in the table Y vs. Z are:
This function permits computing the Frechet bounds for the relative frequencies in the contingency table of Y vs. Z, starting from the distributions P(Y|X), P(Z|X) and P(X). The bounds for the relative frequencies \eqn{p_{j,k}}{p(y=j,z=k)} in the table Y vs. Z are:

\deqn{ p^{low}_{YZ}(j,k) = \sum_{i} p_X(i)\max (0; p_{Y|X}(j|i) + p_{Z|X}(k|i)-1 ) }{p(y=j,z=k) >= sum_i(p(x=i) * max(0; p(y=j|x=i) + p(z=k|x=i) - 1) )}
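A small numeric sketch of this lower bound, assuming hypothetical distributions with two categories for each of X, Y and Z:

    p.x   <- c(0.5, 0.5)                         # P(X=i)
    p.y.x <- rbind(c(0.2, 0.8), c(0.6, 0.4))     # row i holds P(Y=j | X=i)
    p.z.x <- rbind(c(0.7, 0.3), c(0.1, 0.9))     # row i holds P(Z=k | X=i)
    low.cx <- matrix(0, 2, 2)
    for(j in 1:2) for(k in 1:2)
      low.cx[j, k] <- sum(p.x * pmax(0, p.y.x[, j] + p.z.x[, k] - 1))
    low.cx    # p^low_YZ(j,k); the upper bound is computed analogously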

@@ -50,9 +50,9 @@ The relative frequencies \eqn{p_X(i)=n_i/n}{p(x=i)=n_i/n \bullet} are computed f
the relative frequencies \eqn{p_{Y|X}(j|i)=n_{ij}/n_{i \bullet}}{p(y=j|x=i)=n_ij/n_i.} are computed from the \code{tab.xy}, \cr
finally, \eqn{p_{Z|X}(k|i)=n_{ik}/n_{i \bullet}}{p(z=k|x=i)=n_ik/n_i.} are derived from \code{tab.xz}.

The underlying assumption is that the marginal distribution of the \bold{X} variables to be the same in all the input tables: \code{tab.x}, \code{tab.xy} and \code{tab.xz}. If this is not true a warning message will appear.
It is assumed that the marginal distribution of the \bold{X} variables is the same in all the input tables: \code{tab.x}, \code{tab.xy} and \code{tab.xz}. If this is not true a warning message will appear.

Note that the cells bounds for the the relative frequencies in the contingency table of Y vs.Z are also computed without considering the \bold{X} variables, in the following manner:
Note that the bounds for the relative frequencies in the contingency table of Y vs. Z are also computed without considering the \bold{X} variables:

\deqn{ \max\{0; p_{Y}(j) + p_{Z}(k)-1\} \leq p_{YZ}(j,k) \leq \min \{ p_{Y}(j); p_{Z}(k)\}}{max(0;p(y=j)+p(z=k)-1) <= p(y=j,z=k) <= min(p(y=j);p(z=k))}
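These unconditional bounds need only the two marginal distributions; a minimal sketch with toy values:

    p.y <- c(0.3, 0.7)    # marginal distribution of Y
    p.z <- c(0.4, 0.6)    # marginal distribution of Z
    low <- outer(p.y, p.z, function(a, b) pmax(0, a + b - 1))
    up  <- outer(p.y, p.z, pmin)
    low; up    # J x K matrices bounding p(y=j, z=k)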

@@ -63,7 +63,7 @@ Finally, the contingency table of Y vs. Z estimated under the Conditional Indepe
}

\value{
When \code{print.f="tables"} (default) a list with the following estimated tables of Y vs. Z:
When \code{print.f="tables"} (default) a list with the following tables:

\item{low.u}{
The estimated lower bounds for the relative frequencies in the table Y vs. Z without conditioning on the \bold{X} variables.
@@ -103,7 +103,7 @@ D'Orazio, M., Di Zio, M. and Scanu, M. (2006). \emph{Statistical Matching: Theor
}
\seealso{
\code{\link[StatMatch]{Fbwidths.by.x}}
\code{\link[StatMatch]{Fbwidths.by.x}}, \code{\link[StatMatch]{harmonize.x}}
}
\examples{
