version 2.7

cran · Dec 1, 2011 · e100c93 · e100c93
1 parent fa3c345
commit e100c93
Show file tree

Hide file tree

Showing 11 changed files with 114 additions and 58 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,11 +1,12 @@
 Package: rrBLUP
 Title: Ridge regression and other kernels for genomic selection
-Version: 2.5
+Version: 2.7
 Author: Jeffrey Endelman 
 Maintainer: Jeffrey Endelman <j.endelman@gmail.com>
 Description: One application is to estimate marker effects by ridge regression; alternatively, BLUPs can be calculated based on kinship.  The genetic covariance between lines can be modeled using a marker-based, additive relationship matrix, or via Gaussian and exponential kernels.  
+Depends: Matrix
 License: GPL-3
 LazyLoad: yes
-Packaged: 2011-11-23 15:18:07 UTC; jeffendelman
+Packaged: 2011-12-01 21:44:59 UTC; jeffendelman
 Repository: CRAN
-Date/Publication: 2011-11-23 16:04:50
+Date/Publication: 2011-12-02 16:23:08
diff --git a/MD5 b/MD5
@@ -1,13 +1,13 @@
-ebd0f441dd7cc78ac5317311a1315f55 *DESCRIPTION
-725cf45b6b4fc35a46619125c9a56ed5 *NAMESPACE
-c0b87a2036de8c52f506910c514b523e *R/A.mat.R
+7ffecd7c7d19252b56998ccd147228c4 *DESCRIPTION
+cb2bbc1c5b8ede3c305c7f88a3534ab4 *NAMESPACE
+1b905944441d29705c33c76a49c037b0 *R/A.mat.R
 ed40c1ee328c5c1a360f8f19e1c3da6f *R/GWA.R
 1eb18cf7496f7a40675c50be85207f5b *R/impute.R
-12d0dfead32ae78381bb56fe1b09573e *R/kinship.BLUP.R
-2a44ea8e534f08600a1cb26458a8ae28 *R/mixed.solve.R
+17386c1cd7a17192e8fd416a61490556 *R/kinship.BLUP.R
+e6a37c2566e88f73c91bd18120dd5923 *R/mixed.solve.R
 b74e4e9e2c643e550a83c5ee4eb7e905 *inst/CITATION
-ac7f617ca9597ea9a6c7dae0442cf0b6 *man/A.mat.Rd
-be5fbfb20ec993da916e34e007aabbe0 *man/GWA.Rd
-5f2221a320a663d862e94a88b6b6f8db *man/impute.Rd
-4c7aa162fd21063299e894fed121f018 *man/kinship.BLUP.Rd
-db24d688f1817159d86971b2500c5a54 *man/mixed.solve.Rd
+cdbdc329a3d94433d03b1cfbc9b4fe22 *man/A.mat.Rd
+30f584df388d3bd33156b8b0d23c3f1e *man/GWA.Rd
+10f9448d8e12fd7ae964728e953fb2aa *man/impute.Rd
+9e7862d767ca15b2e4416b9bce8254b0 *man/kinship.BLUP.Rd
+8b32359363bbf005fb6ee26ea7dd6725 *man/mixed.solve.Rd
diff --git a/NAMESPACE b/NAMESPACE
@@ -1 +1,2 @@
+importFrom(Matrix,nearPD)
 export(mixed.solve,impute,GWA,kinship.BLUP,A.mat)
diff --git a/R/A.mat.R b/R/A.mat.R
@@ -1,22 +1,60 @@
-A.mat <- function(G,method,min.MAF=0.01) {
-  method = toupper(method)
-  freq <- colMeans(G+1)/2
-  markers <- which((freq >= min.MAF)&(1-freq >= min.MAF))
-  freq <- freq[markers]
-  G <- G[,markers]
-  m <- ncol(G)
-  n <- nrow(G)
-
-  if (method=="IBS") {
-  A <- tcrossprod(G)/m+1
-  } else if (method=="UAR") {
-  X <- G + 1
-  Y <- t(t(X)/sqrt(2*freq*(1-freq)))
-  c <- sqrt(2*freq/(1-freq))
-  z <- matrix(colSums(t(Y)*c),n,1)
-  one <- matrix(rep(1,n),n,1)
-  q <- sum(c^2)
-  A <- (tcrossprod(Y)-tcrossprod(one,z)-tcrossprod(z,one)+tcrossprod(one)*q)/m
-  } else {stop("Invalid method")}
-  A
-}
+A.mat <- function(G,method="IBS",min.MAF=0.01,max.missing=1,PD=TRUE) {
+
+	crossprod.na <- function(x1,x2) {
+		mean(x1 * x2,na.rm=TRUE)		
+	}
+	tcrossprod.na <- function(G) {
+	     apply(G,1,function(x,G){apply(G,1,crossprod.na,x)},G)
+	}
+
+	method <- toupper(method)
+	n <- nrow(G)
+
+	if (length(which(is.na(G))) > 0) {
+
+	freq <- apply(G+1,2,function(x){mean(x,na.rm=TRUE)})/2
+	frac.missing <- apply(G,2,function(x){length(which(is.na(x)))})/n
+	markers <- which((freq >= min.MAF)&(1-freq >= min.MAF)&(frac.missing < max.missing))
+  	freq <- freq[markers]
+  	G <- G[,markers]
+  	m <- ncol(G)
+
+	if (method=="IBS") {
+		A <- tcrossprod.na(G)+1
+	} else if (method=="UAR") {
+		X <- G + 1
+		Y <- t(t(X)/sqrt(2*freq*(1-freq)))
+		c <- sqrt(2*freq/(1-freq))
+		z <- matrix(colMeans(t(Y)*c,na.rm=TRUE),n,1)
+		one <- matrix(rep(1,n),n,1)
+		q <- mean(c^2)
+		A <- tcrossprod.na(Y)-tcrossprod(one,z)-tcrossprod(z,one)+tcrossprod(one)*q
+	} else {stop("Invalid method")}
+
+    if (PD) {
+     as.matrix(nearPD(A)$mat)
+	} else {A}
+
+	} else {
+
+	freq <- colMeans(G+1)/2
+	markers <- which((freq >= min.MAF)&(1-freq >= min.MAF))
+	freq <- freq[markers]
+	G <- G[,markers]
+	m <- ncol(G)
+
+	if (method=="IBS") {
+		A <- tcrossprod(G)/m+1
+	} else if (method=="UAR") {
+		X <- G + 1
+		Y <- t(t(X)/sqrt(2*freq*(1-freq)))
+		c <- sqrt(2*freq/(1-freq))
+		z <- matrix(colSums(t(Y)*c),n,1)
+		one <- matrix(rep(1,n),n,1)
+		q <- sum(c^2)
+		A <- (tcrossprod(Y)-tcrossprod(one,z)-tcrossprod(z,one)+tcrossprod(one)*q)/m
+	} else {stop("Invalid method")}		
+
+    A  
+  } # if missing data	
+}
diff --git a/R/kinship.BLUP.R b/R/kinship.BLUP.R
@@ -41,7 +41,7 @@ Z <- cbind(Z.train,matrix(rep(0,n.obs*n.pred),n.obs,n.pred))
 G <- rbind(G.train,G.pred)
 
 if (K.method == "RR") {
-   K <- tcrossprod(G,G)/m
+   K <- A.mat(G,min.MAF=0)
    soln <- mixed.solve(y=y,X=X,Z=Z,K=K,method=mixed.method)
    if (n.pred > 0) {
      list(g.train=soln$u[1:n.train],g.pred=soln$u[n.train+1:n.pred],beta=soln$beta)
@@ -75,4 +75,4 @@ if (K.method == "RR") {
   }
 
 } #if K.method
-} #function 
+} #function 
diff --git a/R/mixed.solve.R b/R/mixed.solve.R
@@ -1,7 +1,10 @@
-mixed.solve <- function (y, Z, K = NULL, X = NULL, method = "REML", bounds = c(1e-09,1e+09), SE = FALSE, return.Hinv = FALSE) {
+mixed.solve <- function (y, Z = NULL, K = NULL, X = NULL, method = "REML", bounds = c(1e-09,1e+09), SE = FALSE, return.Hinv = FALSE) {
 pi <- 3.14159
 n <- length(y)
 y <- matrix(y,n,1)
+
+not.NA <- which(!is.na(y))
+
 if (is.null(X)) {
 p <- 1
 X <- matrix(rep(1,n),n,1)
@@ -11,6 +14,9 @@ if (is.null(p)) {
 p <- 1
 X <- matrix(X,length(X),1)
 }
+if (is.null(Z)) {
+Z <- diag(n)
+}
 m <- ncol(Z)
 if (is.null(m)) {
 m <- 1
@@ -22,6 +28,12 @@ if (!is.null(K)) {
 	stopifnot(nrow(K) == m)
 	stopifnot(ncol(K) == m)
 }
+
+Z <- Z[not.NA,]
+X <- X[not.NA,]
+n <- length(not.NA)
+y <- matrix(y[not.NA],n,1)
+
 XtX <- crossprod(X, X)
 rank.X <- qr(XtX)$rank
 stopifnot(p == rank.X)
@@ -133,4 +145,4 @@ if (!SE) {
     list(Vu = Vu.opt, Ve = Ve.opt, beta = as.vector(beta), beta.SE = as.vector(beta.SE), u = as.vector(u), u.SE = as.vector(u.SE), LL = LL)
   }
 }
-}
+}
diff --git a/man/A.mat.Rd b/man/A.mat.Rd
@@ -8,25 +8,33 @@ Additive relationship matrix
 Calculates an additive relationship matrix.
 }
 \usage{
-A.mat(G,method,min.MAF=0.01)
+A.mat(G,method="IBS",min.MAF=0.01,max.missing=1,PD=TRUE)
 }
 
 \arguments{
   \item{G}{
 Matrix (\eqn{n \times m}) of unphased genotypes for \eqn{n} lines and \eqn{m} biallelic markers, 
-coded as \{-1,0,1\} = \{aa,Aa,AA\}.  Missing data are not allowed (\code{\link{impute}} first). 
+coded as \{-1,0,1\} = \{aa,Aa,AA\}. Fractional (imputed) and missing values (NA) are allowed.
 }
 \item{method}{
 Must be either "IBS" or "UAR", depending on what reference population is desired (see details).
 }
 \item{min.MAF}{
 Minimum minor allele frequency; default is 0.01.
 }
+\item{max.missing}{
+Maximum proportion of entries that can be missing for each marker.  Markers above this limit are not used.
+}
+\item{PD}{
+This option only applies when alleles are missing, in which case A is not positive semidefinite by construction.  If PD=TRUE, the function \code{\link{nearPD}} is used to return the closest positive definite matrix, which can then be used in \code{\link{mixed.solve}}
+}
 }
 \details{
 For method "IBS", \eqn{A = 1 + G G' / m }, which means the coefficient of coancestry equals the identity-by-state (IBS).  The assumption that two alleles IBS are also identical by descent (IBD) is consistent with coalescence theory, in which all alleles are derived from a common ancestor but at different times in the past (Powell et al., 2010).
 
 For method "UAR" (Unified Additive Relationship), \eqn{A = W W'/m}, where \eqn{W_{ij} = (X_{ij} - 2 p_i)/\sqrt{2 p_i (1 - p_i)}} and \eqn{X = G + 1}.  This \eqn{A} matrix has the property that the average of its off-diagonal elements is zero, which can be interpreted as meaning the IBD relationships are relative to the current population as the reference population (Powell et al., 2010).  In the raw UAR model of Powell et al. (2010), the diagonal elements of \eqn{A} are calculated as \eqn{1 + F}, where \eqn{F} is the correlation between the two gametes in a diploid individual.  This model, although unbiased, is in general not positive semi-definite.  The diagonal elements of \eqn{A = W W'/m} are slightly different than the raw UAR model of Powell et al. (2010), but the two models have the same average F for the population.
+
+When alleles are missing, each pairwise calculation is based only on the markers that are present for both lines.
 }
 \value{
 \eqn{n \times n} additive relationship matrix
@@ -42,6 +50,6 @@ for (i in 1:200) {
 }
 
 #Additive relationship matrix
-A <- A.mat(G,method="UAR")
+A <- A.mat(G)
 
 }
diff --git a/man/GWA.Rd b/man/GWA.Rd
@@ -21,7 +21,7 @@ GWA(y, G, Z=NULL, X=NULL, K=NULL, min.MAF=0.05,
 
 \arguments{
   \item{y}{
-Vector (\eqn{n \times 1}) of observations
+Vector (\eqn{n \times 1}) of observations.  Missing values (NA) are omitted.
 }
   \item{G}{
 Matrix (\eqn{t \times m}) of genotypes for \eqn{t} lines with \eqn{m} bi-allelic markers.
@@ -56,7 +56,7 @@ The use of a minimum MAF is typically adequate to ensure the problem is well-pos
 if an error message indicates the problem is singular, set check.rank to TRUE.  This will slow 
 down the algorithm but should fix the error.
 
-Missing data are not allowed (\code{\link{impute}} first).
+Missing marker data are not allowed (\code{\link{impute}} first).
 }
 \value{
 Returns \eqn{m \times 1} vector of the marker scores, which equal \eqn{-log_{10}}(p-value)

diff --git a/man/impute.Rd b/man/impute.Rd
@@ -19,7 +19,7 @@ Matrix (\eqn{n \times m}) of unphased genotypes for \eqn{n} lines and \eqn{m} ma
 Number of nearest neighbors to use for imputation.  If not specified, the population mean is used.
 }
   \item{D}{
-Distance matrix (\eqn{n \times n}) between lines.  If not specified, calculated by \code{dist}.
+Distance matrix (\eqn{n \times n}) between lines.  If not specified, calculated by \code{\link{dist}}.
 }
 }
 \details{

diff --git a/man/kinship.BLUP.Rd b/man/kinship.BLUP.Rd
@@ -14,15 +14,15 @@ kinship.BLUP(y, G.train, G.pred=NULL, X=NULL, Z.train=NULL,
 
 \arguments{
   \item{y}{
-Vector (\eqn{n.obs \times 1}) of observations
+Vector (\eqn{n.obs \times 1}) of observations.  Missing values (NA) are omitted (see \code{\link{mixed.solve}}).
 }
   \item{G.train}{
 Matrix (\eqn{n.train \times m}) of unphased genotypes for the training population: \eqn{n.train} lines with \eqn{m} bi-allelic markers.
-Genotypes should be coded as \{-1,0,1\} = \{aa,Aa,AA\}; fractional (imputed) alleles are allowed.  
+Genotypes should be coded as \{-1,0,1\} = \{aa,Aa,AA\}; fractional (imputed) and missing (NA) alleles are allowed.  
 }
   \item{G.pred}{
 Matrix (\eqn{n.pred \times m}) of unphased genotypes for the prediction population: \eqn{n.pred} lines with \eqn{m} bi-allelic markers.
-Genotypes should be coded as \{-1,0,1\} = \{aa,Aa,AA\}; fractional (imputed) alleles are allowed.  
+Genotypes should be coded as \{-1,0,1\} = \{aa,Aa,AA\}; fractional (imputed) and missing (NA) alleles are allowed.  
 }
   \item{X}{
 Design matrix (\eqn{n.obs \times p}) of fixed effects.  If not passed, a vector of 1's is used 
@@ -33,8 +33,7 @@ to model the intercept.
 the identity matrix is used.
 }
    \item{K.method}{
-"RR" (default) is ridge regression (\eqn{K = G G'/m}), "GAUSS" is a Gaussian kernel (\eqn{K = e^{-D^2/\theta^2}}) and 
-"EXP" is an exponential kernel (\eqn{K = e^{-D/\theta}}), where Euclidean distances \eqn{D} are calculated with \code{dist}.
+"RR" (default) is ridge regression, for which K is the identity-by-state additive relationship matrix computed with \code{\link{A.mat}}.  The option "GAUSS" is a Gaussian kernel (\eqn{K = e^{-D^2/\theta^2}}) and "EXP" is an exponential kernel (\eqn{K = e^{-D/\theta}}), where Euclidean distances \eqn{D} are computed with \code{\link{dist}}.
 }
  
   \item{n.profile}{
@@ -44,9 +43,6 @@ For K.method = "GAUSS" or "EXP", the number of points to use in the log-likeliho
 Either "REML" (default) or "ML".
 }
 }
-\details{
-Missing alleles (NA) are permitted with GAUSS and EXP but not with RR (\code{\link{impute}} first).
-}
 \value{
 \describe{
 \item{$g.train}{BLUP solution for the training set}
@@ -79,7 +75,7 @@ y <- g + rnorm(200,mean=0,sd=sqrt((1-h2)/h2*var(g)))
 
 #split in half for training and prediction
 train <- 1:100
-pred <- 1:100
+pred <- 101:200
 ans <- kinship.BLUP(y=y[train],G.train=G[train,],G.pred=G[pred,],K.method="GAUSS")
 
 #correlation accuracy

diff --git a/man/mixed.solve.Rd b/man/mixed.solve.Rd
@@ -15,16 +15,16 @@ of mixed models, in which there is a single variance component other than the re
 has a close relationship with ridge regression (ridge parameter \eqn{\lambda = \sigma_e^2 / \sigma^2_u}).
 }
 \usage{
-mixed.solve(y, Z, K=NULL, X=NULL, method="REML", 
+mixed.solve(y, Z=NULL, K=NULL, X=NULL, method="REML", 
         bounds=c(1e-09, 1e+09), SE=FALSE, return.Hinv=FALSE)
 }
 
 \arguments{
   \item{y}{
-Vector (\eqn{n \times 1}) of observations.
+Vector (\eqn{n \times 1}) of observations.  Missing values (NA) are omitted, along with the corresponding rows of X and Z. 
 }
   \item{Z}{
-Design matrix (\eqn{n \times m}) for the random effects.
+Design matrix (\eqn{n \times m}) for the random effects.  If not passed, assumed to be the identity matrix.
 }
   \item{K}{
 Covariance matrix (\eqn{m \times m}) for random effects; must be positive semi-definite.  If not passed, assumed to 
@@ -99,7 +99,7 @@ ans <- mixed.solve(y,Z=G)  #By default K = I
 accuracy <- cor(u,ans$u)
 
 #predict breeding values
-ans <- mixed.solve(y,Z=diag(200),K=tcrossprod(G,G))
+ans <- mixed.solve(y,K=A.mat(G))
 accuracy <- cor(g,ans$u)
 
 }