version 3.4

cran · Feb 6, 2012 · e71f850 · e71f850
1 parent e072314
commit e71f850
Show file tree

Hide file tree

Showing 6 changed files with 91 additions and 27 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: rrBLUP
 Title: Ridge regression and other kernels for genomic selection
-Version: 3.3
+Version: 3.4
 Author: Jeffrey Endelman
 Maintainer: Jeffrey Endelman <j.endelman@gmail.com>
 Description: One application is to estimate marker effects by ridge
@@ -11,6 +11,6 @@ Description: One application is to estimate marker effects by ridge
 Suggests: multicore
 License: GPL-3
 LazyLoad: yes
-Packaged: 2012-01-31 22:43:49 UTC; jeffendelman
+Packaged: 2012-02-06 18:10:17 UTC; jeffendelman
 Repository: CRAN
-Date/Publication: 2012-02-01 08:13:28
+Date/Publication: 2012-02-06 21:06:52
diff --git a/MD5 b/MD5
@@ -1,13 +1,13 @@
-ddc7511f73397d636730db60ce32729e *DESCRIPTION
+1a25d39223133ee17259798e2b9c61a8 *DESCRIPTION
 9983baa53027ec7ecc5c44dc94318515 *NAMESPACE
-d03d66aa92ac4d55e491bf8ba6f2ec98 *NEWS
-405da42ca9cd4386b3fa43e4870579cd *R/A.mat.R
+c049921a49fc22c3a9864722c8e3426c *NEWS
+3426a5afa2b38e987e53b354c267f488 *R/A.mat.R
 53311049c222a3c2bbe1c6d9f61c9a2d *R/GWA.R
 f9b894abb6a3158ebfabe61486dccd94 *R/kinship.BLUP.R
 e6a37c2566e88f73c91bd18120dd5923 *R/mixed.solve.R
 b74e4e9e2c643e550a83c5ee4eb7e905 *inst/CITATION
-c116db1d93b71eef37fbb6217373efb5 *man/A.mat.Rd
+fbaf1e01452ed907bdb412ff35f3b067 *man/A.mat.Rd
 89ab7c69f6c5dbfca37b5220dd14525b *man/GWA.Rd
 a6728dd999a0ec924bb570459fdca197 *man/kinship.BLUP.Rd
 8b32359363bbf005fb6ee26ea7dd6725 *man/mixed.solve.Rd
-222baaa2a8757d1268862fa222132467 *man/rrBLUP-package.Rd
+3610cf9fd22ccdf4a09cad051325d988 *man/rrBLUP-package.Rd
diff --git a/NEWS b/NEWS
@@ -1,3 +1,7 @@
+Changes in 3.4:
+
+* Additional features for A.mat.
+
 Changes in 3.3:
 
 * Improved handling of missing genotype data in A.mat.

diff --git a/R/A.mat.R b/R/A.mat.R
@@ -1,10 +1,23 @@
-A.mat <- function(G,min.MAF=NULL,max.missing=NULL,tol=0.01,n.core=1){
+A.mat <- function(G,min.MAF=NULL,max.missing=NULL,impute=TRUE,tol=0.02,n.core=1,return.G=FALSE){
 
+tcrossprod.na <- function(V,W) {
+    crossprod.na <- function(x1, x2) {
+        mean(x1 * x2, na.rm = TRUE)
+    }
+    n1 <- nrow(V)
+    n2 <- nrow(W)
+    result <- matrix(0,n1,n2)
+    for (i in 1:n1) {
+    	result[i,] <- apply(W,1,crossprod.na,V[i,])
+    }
+    return(result)
+}
+
+
 impute2 <- function(W, cov.mat, mean.vec) {
 	n <- nrow(W)
 	m <- ncol(W)
 	S <- matrix(0,n,n)
-	W.sum <- rep(0,n)
 	for (i in 1:m) {
 		Wi <- matrix(W[,i],n,1)
 		missing <- which(is.na(Wi))
@@ -15,11 +28,11 @@ impute2 <- function(W, cov.mat, mean.vec) {
 			C <- cov.mat[missing,missing] - crossprod(cov.mat[not.NA,missing],Bt)
 			D <- tcrossprod(Wi)
 			D[missing,missing] <- D[missing,missing] + C
+			W[,i] <- Wi
 		} else {D <- tcrossprod(Wi)}
-		W.sum <- W.sum + Wi
 		S <- S + D
 	}	
-	return(list(S=S,W.sum=W.sum))
+	return(list(S=S,W.imp=W))
 }
 
 n <- nrow(G)
@@ -33,28 +46,56 @@ markers <- which((MAF >= min.MAF)&(frac.missing <= max.missing))
 m <- length(markers)
 sig.A <- sqrt(2 * mean(freq[markers] * (1 - freq[markers])))
 one <- matrix(1, n, 1)
+
+mono <- which(freq*(1-freq)==0)
+G[,mono] <- tcrossprod(one,matrix(freq[mono],length(mono),1))
+
 freq.mat <- tcrossprod(one, matrix(freq[markers], m, 1))
 W <- (G[, markers] + 1 - 2 *freq.mat )/sig.A
+
 if (!missing) {
-	return(tcrossprod(W)/m)
+	A <- tcrossprod(W)/m
+	rownames(A) <- rownames(G)
+	return(A)
 } else {
+    if (!impute) {
+       if (n.core > 1) {
+          library(multicore)
+          it <- split(1:n,factor(cut(1:n,n.core,labels=FALSE)))
+          A.list <- mclapply(it,function(ix){tcrossprod.na(W[ix,],W)})
+          A <- numeric(0)
+          for (i in 1:n.core) {A <- rbind(A,A.list[[i]])}
+       } else {
+          A <- tcrossprod.na(W,W)
+	   }	
+       rownames(A) <- rownames(G)
+       return(A)
+    } else {
+    #impute = TRUE	
     isna <- which(is.na(W)) 
 	W[isna] <- 0
 	if (m < n) {
 		print("There are fewer markers than lines: imputing with population means.")
 	}
-	if ((m < n)|(tol < 0)) {return(tcrossprod(W)/m)}
+	if ((m < n)|(tol < 0)) {
+		A <- tcrossprod(W)/m
+		rownames(A) <- rownames(G)
+		if (return.G) {
+			G[,markers] <- W*sig.A - 1 + 2*freq.mat
+			return(list(A=A,G.imp=G))		
+		} else {
+			return(A)
+		}
+	}
 
 	#do EM algorithm
 	mean.vec.new <- matrix(rowMeans(W),n,1)
 	cov.mat.new <- cov(t(W))
     W[isna] <- NA
 	A.new <- cov.mat.new + tcrossprod(mean.vec.new)
 	err <- tol+1
-	count <- 0
 	print("A.mat converging:")
 	while (err >= tol) {
-		count <- count + 1
 		A.old <- A.new
 		cov.mat.old <- cov.mat.new
 		mean.vec.old <- mean.vec.new
@@ -68,18 +109,25 @@ if (!missing) {
 		}
 		n.pieces <- length(pieces)
 		S <- matrix(0,n,n)
-		W.sum <- rep(0,n)
+		W.imp <- numeric(0)
 		for (i in 1:n.pieces) {
 			S <- S + pieces[[i]]$S
-			W.sum <- W.sum + pieces[[i]]$W.sum
+			W.imp <- cbind(W.imp,pieces[[i]]$W.imp)
 		}
-		mean.vec.new <- matrix(W.sum/m,n,1)
+		mean.vec.new <- matrix(rowMeans(W.imp),n,1)
 		cov.mat.new <- (S-tcrossprod(mean.vec.new)*m)/(m-1)
 		A.new <- cov.mat.new + tcrossprod(mean.vec.new)
 		err <- norm(A.old-A.new,type="F")/n
 		print(err,digits=3)
 	}
-	return(A.new)
-}
-}
+	rownames(A.new) <- rownames(G)
+	if (return.G) {
+		G[,markers] <- W.imp*sig.A - 1 + 2*freq.mat
+		return(list(A=A.new,G.imp=G))
+	} else {
+		return(A.new)
+	}
+  	} #else IMPUTE
+} #else MISSING
+} #A.mat
 
diff --git a/man/A.mat.Rd b/man/A.mat.Rd
@@ -8,7 +8,7 @@ Additive relationship matrix
 Calculates the realized additive relationship matrix.
 }
 \usage{
-A.mat(G,min.MAF=NULL,max.missing=NULL,tol=0.01,n.core=1)
+A.mat(G,min.MAF=NULL,max.missing=NULL,impute=TRUE,tol=0.02,n.core=1,return.G=FALSE)
 }
 
 \arguments{
@@ -22,20 +22,32 @@ Minimum minor allele frequency; default removes monomorphic markers.
 \item{max.missing}{
 Maximum proportion of missing data; default removes completely missing markers.
 }
+\item{impute}{
+If TRUE, missing genotypic data are imputed (see below).  If FALSE, A is calculated from pairwise complete observations, which does not guarantee positive semidefiniteness (this can cause problems with \code{\link{mixed.solve}}).
+}
 \item{tol}{
-Specifies convergence criterion for EM algorithm when there is missing data (see details). If tol < 0, missing data are imputed with the population mean.
+Specifies convergence criterion for imputing missing data with an EM algorithm (see details). If tol < 0, missing data are imputed with the population mean for each marker.
 }
 \item{n.core}{
 For Mac, Linux, and UNIX users, setting n.core > 1 will enable parallel execution on a machine with multiple cores. R package multicore must be installed for this to work.  Do not run multicore from within the R GUI; you must use the command line.   
 }
+\item{return.G}{
+If TRUE (and impute = TRUE), the imputed marker matrix is returned.  When the EM algorithm is used, the imputed alleles can lie outside the interval [-1,1].  Polymorphic markers that do not meet the min.MAF and max.missing criteria are not imputed.
+}
 }
 \details{
 The A matrix is calculated as \eqn{W W'/c}, where \eqn{W_{ik} = G_{ik} + 1 - 2 p_k} and \eqn{p_k} is the frequency of the 1 allele at marker k.  The normalization constant is \eqn{c = 2 \sum_k {p_k (1-p_k)}}.  
 
-When marker data are missing, an EM algorithm is used to impute genotypes and converge on the maximum likelihood solution for A.  The EM algorithm stops at iteration t when the RMS error = \eqn{n^{-1} \|A_{t} - A_{t-1}\|_2} < tol.  If the user passes a negative value for tol, or if the number of markers is less than the number of lines, missing alleles are imputed with the population mean for each marker.
+When marker data are missing, by default an EM algorithm is used to impute genotypes and converge on the maximum likelihood solution for A.  The EM algorithm stops at iteration t when the RMS error = \eqn{n^{-1} \|A_{t} - A_{t-1}\|_2} < tol.  If the user passes a negative value for tol, or if the number of markers is less than the number of lines, missing alleles are imputed with the population mean for each marker.
 }
 \value{
-\eqn{n \times n} additive relationship matrix
+If return.G = FALSE, the \eqn{n \times n} additive relationship matrix is returned.
+
+If return.G = TRUE, a list containing
+\describe{
+\item{A}{the A matrix}
+\item{G.imp}{the imputed G matrix}
+}
 }
 \examples{
 #random population of 200 lines with 1000 markers

diff --git a/man/rrBLUP-package.Rd b/man/rrBLUP-package.Rd
@@ -11,7 +11,7 @@ Use function \code{\link{GWA}} for association mapping.
 A number of improvements have been made concerning the handling of missing data since the original publication of the package.  The functions \code{\link{mixed.solve}}, \code{\link{kinship.BLUP}}, and \code{\link{GWA}} will tolerate missing phenotypic data: those observations are simply not used.  When genotypic data are missing, both \code{\link{kinship.BLUP}} (option "RR") and \code{\link{GWA}} rely on the EM algorithm in \code{\link{A.mat}}, which can also be used in conjunction with \code{\link{mixed.solve}}. The non-additive kernels in \code{\link{kinship.BLUP}} are based on \code{\link{dist}}, which uses pairwise complete observations.  
 }
 \section{Parallel computing}{
-For Mac, Linux, and UNIX users, R package multicore can be used in conjunction with rrBLUP to take advantage of multiple processors on a single machine.  For large data sets, especially when there is missing data, I highly recommend trying out this feature, which is available with \code{\link{kinship.BLUP}}, \code{\link{A.mat}}, and \code{\link{GWA}}.  You need R >= 2.14.1 for this to work properly, and you must also use R from the command line (not the GUI).  
+For Mac, Linux, and UNIX users, R package multicore can be used in conjunction with rrBLUP to take advantage of multiple processors on a single machine.  For large data sets, especially when there is missing data, I recommend trying this feature, which is available with \code{\link{kinship.BLUP}}, \code{\link{A.mat}}, and \code{\link{GWA}}.  You need R >= 2.14.1 for this to work properly, and you must also use R from the command line (not the GUI).  
 }
 \references{
 Endelman, J.B. 2011. Ridge regression and other kernels for genomic selection with R package rrBLUP. Plant Genome 4:250-255.