diff --git a/DESCRIPTION b/DESCRIPTION index 7010d3a..ac40275 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,11 +1,12 @@ Package: rrBLUP Title: Ridge regression and other kernels for genomic selection -Version: 2.5 +Version: 2.7 Author: Jeffrey Endelman Maintainer: Jeffrey Endelman Description: One application is to estimate marker effects by ridge regression; alternatively, BLUPs can be calculated based on kinship. The genetic covariance between lines can be modeled using a marker-based, additive relationship matrix, or via Gaussian and exponential kernels. +Depends: Matrix License: GPL-3 LazyLoad: yes -Packaged: 2011-11-23 15:18:07 UTC; jeffendelman +Packaged: 2011-12-01 21:44:59 UTC; jeffendelman Repository: CRAN -Date/Publication: 2011-11-23 16:04:50 +Date/Publication: 2011-12-02 16:23:08 diff --git a/MD5 b/MD5 index 16fa3f5..a477471 100644 --- a/MD5 +++ b/MD5 @@ -1,13 +1,13 @@ -ebd0f441dd7cc78ac5317311a1315f55 *DESCRIPTION -725cf45b6b4fc35a46619125c9a56ed5 *NAMESPACE -c0b87a2036de8c52f506910c514b523e *R/A.mat.R +7ffecd7c7d19252b56998ccd147228c4 *DESCRIPTION +cb2bbc1c5b8ede3c305c7f88a3534ab4 *NAMESPACE +1b905944441d29705c33c76a49c037b0 *R/A.mat.R ed40c1ee328c5c1a360f8f19e1c3da6f *R/GWA.R 1eb18cf7496f7a40675c50be85207f5b *R/impute.R -12d0dfead32ae78381bb56fe1b09573e *R/kinship.BLUP.R -2a44ea8e534f08600a1cb26458a8ae28 *R/mixed.solve.R +17386c1cd7a17192e8fd416a61490556 *R/kinship.BLUP.R +e6a37c2566e88f73c91bd18120dd5923 *R/mixed.solve.R b74e4e9e2c643e550a83c5ee4eb7e905 *inst/CITATION -ac7f617ca9597ea9a6c7dae0442cf0b6 *man/A.mat.Rd -be5fbfb20ec993da916e34e007aabbe0 *man/GWA.Rd -5f2221a320a663d862e94a88b6b6f8db *man/impute.Rd -4c7aa162fd21063299e894fed121f018 *man/kinship.BLUP.Rd -db24d688f1817159d86971b2500c5a54 *man/mixed.solve.Rd +cdbdc329a3d94433d03b1cfbc9b4fe22 *man/A.mat.Rd +30f584df388d3bd33156b8b0d23c3f1e *man/GWA.Rd +10f9448d8e12fd7ae964728e953fb2aa *man/impute.Rd +9e7862d767ca15b2e4416b9bce8254b0 *man/kinship.BLUP.Rd +8b32359363bbf005fb6ee26ea7dd6725 *man/mixed.solve.Rd diff --git a/NAMESPACE b/NAMESPACE index b0bcc42..234cbaa 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -1 +1,2 @@ +importFrom(Matrix,nearPD) export(mixed.solve,impute,GWA,kinship.BLUP,A.mat) diff --git a/R/A.mat.R b/R/A.mat.R old mode 100755 new mode 100644 index 414fc25..63adec1 --- a/R/A.mat.R +++ b/R/A.mat.R @@ -1,22 +1,60 @@ -A.mat <- function(G,method,min.MAF=0.01) { - method = toupper(method) - freq <- colMeans(G+1)/2 - markers <- which((freq >= min.MAF)&(1-freq >= min.MAF)) - freq <- freq[markers] - G <- G[,markers] - m <- ncol(G) - n <- nrow(G) - - if (method=="IBS") { - A <- tcrossprod(G)/m+1 - } else if (method=="UAR") { - X <- G + 1 - Y <- t(t(X)/sqrt(2*freq*(1-freq))) - c <- sqrt(2*freq/(1-freq)) - z <- matrix(colSums(t(Y)*c),n,1) - one <- matrix(rep(1,n),n,1) - q <- sum(c^2) - A <- (tcrossprod(Y)-tcrossprod(one,z)-tcrossprod(z,one)+tcrossprod(one)*q)/m - } else {stop("Invalid method")} - A -} +A.mat <- function(G,method="IBS",min.MAF=0.01,max.missing=1,PD=TRUE) { + + crossprod.na <- function(x1,x2) { + mean(x1 * x2,na.rm=TRUE) + } + tcrossprod.na <- function(G) { + apply(G,1,function(x,G){apply(G,1,crossprod.na,x)},G) + } + + method <- toupper(method) + n <- nrow(G) + + if (length(which(is.na(G))) > 0) { + + freq <- apply(G+1,2,function(x){mean(x,na.rm=TRUE)})/2 + frac.missing <- apply(G,2,function(x){length(which(is.na(x)))})/n + markers <- which((freq >= min.MAF)&(1-freq >= min.MAF)&(frac.missing < max.missing)) + freq <- freq[markers] + G <- G[,markers] + m <- ncol(G) + + if (method=="IBS") { + A <- tcrossprod.na(G)+1 + } else if (method=="UAR") { + X <- G + 1 + Y <- t(t(X)/sqrt(2*freq*(1-freq))) + c <- sqrt(2*freq/(1-freq)) + z <- matrix(colMeans(t(Y)*c,na.rm=TRUE),n,1) + one <- matrix(rep(1,n),n,1) + q <- mean(c^2) + A <- tcrossprod.na(Y)-tcrossprod(one,z)-tcrossprod(z,one)+tcrossprod(one)*q + } else {stop("Invalid method")} + + if (PD) { + as.matrix(nearPD(A)$mat) + } else {A} + + } else { + + freq <- colMeans(G+1)/2 + markers <- which((freq >= min.MAF)&(1-freq >= min.MAF)) + freq <- freq[markers] + G <- G[,markers] + m <- ncol(G) + + if (method=="IBS") { + A <- tcrossprod(G)/m+1 + } else if (method=="UAR") { + X <- G + 1 + Y <- t(t(X)/sqrt(2*freq*(1-freq))) + c <- sqrt(2*freq/(1-freq)) + z <- matrix(colSums(t(Y)*c),n,1) + one <- matrix(rep(1,n),n,1) + q <- sum(c^2) + A <- (tcrossprod(Y)-tcrossprod(one,z)-tcrossprod(z,one)+tcrossprod(one)*q)/m + } else {stop("Invalid method")} + + A + } # if missing data +} diff --git a/R/kinship.BLUP.R b/R/kinship.BLUP.R index 9c1ae3e..f6cad78 100755 --- a/R/kinship.BLUP.R +++ b/R/kinship.BLUP.R @@ -41,7 +41,7 @@ Z <- cbind(Z.train,matrix(rep(0,n.obs*n.pred),n.obs,n.pred)) G <- rbind(G.train,G.pred) if (K.method == "RR") { - K <- tcrossprod(G,G)/m + K <- A.mat(G,min.MAF=0) soln <- mixed.solve(y=y,X=X,Z=Z,K=K,method=mixed.method) if (n.pred > 0) { list(g.train=soln$u[1:n.train],g.pred=soln$u[n.train+1:n.pred],beta=soln$beta) @@ -75,4 +75,4 @@ if (K.method == "RR") { } } #if K.method -} #function \ No newline at end of file +} #function diff --git a/R/mixed.solve.R b/R/mixed.solve.R index 042b1b8..cdaa0f6 100755 --- a/R/mixed.solve.R +++ b/R/mixed.solve.R @@ -1,7 +1,10 @@ -mixed.solve <- function (y, Z, K = NULL, X = NULL, method = "REML", bounds = c(1e-09,1e+09), SE = FALSE, return.Hinv = FALSE) { +mixed.solve <- function (y, Z = NULL, K = NULL, X = NULL, method = "REML", bounds = c(1e-09,1e+09), SE = FALSE, return.Hinv = FALSE) { pi <- 3.14159 n <- length(y) y <- matrix(y,n,1) + +not.NA <- which(!is.na(y)) + if (is.null(X)) { p <- 1 X <- matrix(rep(1,n),n,1) @@ -11,6 +14,9 @@ if (is.null(p)) { p <- 1 X <- matrix(X,length(X),1) } +if (is.null(Z)) { +Z <- diag(n) +} m <- ncol(Z) if (is.null(m)) { m <- 1 @@ -22,6 +28,12 @@ if (!is.null(K)) { stopifnot(nrow(K) == m) stopifnot(ncol(K) == m) } + +Z <- Z[not.NA,] +X <- X[not.NA,] +n <- length(not.NA) +y <- matrix(y[not.NA],n,1) + XtX <- crossprod(X, X) rank.X <- qr(XtX)$rank stopifnot(p == rank.X) @@ -133,4 +145,4 @@ if (!SE) { list(Vu = Vu.opt, Ve = Ve.opt, beta = as.vector(beta), beta.SE = as.vector(beta.SE), u = as.vector(u), u.SE = as.vector(u.SE), LL = LL) } } -} \ No newline at end of file +} diff --git a/man/A.mat.Rd b/man/A.mat.Rd index 0a52c1c..4e63581 100755 --- a/man/A.mat.Rd +++ b/man/A.mat.Rd @@ -8,13 +8,13 @@ Additive relationship matrix Calculates an additive relationship matrix. } \usage{ -A.mat(G,method,min.MAF=0.01) +A.mat(G,method="IBS",min.MAF=0.01,max.missing=1,PD=TRUE) } \arguments{ \item{G}{ Matrix (\eqn{n \times m}) of unphased genotypes for \eqn{n} lines and \eqn{m} biallelic markers, -coded as \{-1,0,1\} = \{aa,Aa,AA\}. Missing data are not allowed (\code{\link{impute}} first). +coded as \{-1,0,1\} = \{aa,Aa,AA\}. Fractional (imputed) and missing values (NA) are allowed. } \item{method}{ Must be either "IBS" or "UAR", depending on what reference population is desired (see details). @@ -22,11 +22,19 @@ Must be either "IBS" or "UAR", depending on what reference population is desired \item{min.MAF}{ Minimum minor allele frequency; default is 0.01. } +\item{max.missing}{ +Maximum proportion of entries that can be missing for each marker. Markers above this limit are not used. +} +\item{PD}{ +This option only applies when alleles are missing, in which case A is not positive semidefinite by construction. If PD=TRUE, the function \code{\link{nearPD}} is used to return the closest positive definite matrix, which can then be used in \code{\link{mixed.solve}} +} } \details{ For method "IBS", \eqn{A = 1 + G G' / m }, which means the coefficient of coancestry equals the identity-by-state (IBS). The assumption that two alleles IBS are also identical by descent (IBD) is consistent with coalescence theory, in which all alleles are derived from a common ancestor but at different times in the past (Powell et al., 2010). For method "UAR" (Unified Additive Relationship), \eqn{A = W W'/m}, where \eqn{W_{ij} = (X_{ij} - 2 p_i)/\sqrt{2 p_i (1 - p_i)}} and \eqn{X = G + 1}. This \eqn{A} matrix has the property that the average of its off-diagonal elements is zero, which can be interpreted as meaning the IBD relationships are relative to the current population as the reference population (Powell et al., 2010). In the raw UAR model of Powell et al. (2010), the diagonal elements of \eqn{A} are calculated as \eqn{1 + F}, where \eqn{F} is the correlation between the two gametes in a diploid individual. This model, although unbiased, is in general not positive semi-definite. The diagonal elements of \eqn{A = W W'/m} are slightly different than the raw UAR model of Powell et al. (2010), but the two models have the same average F for the population. + +When alleles are missing, each pairwise calculation is based only on the markers that are present for both lines. } \value{ \eqn{n \times n} additive relationship matrix @@ -42,6 +50,6 @@ for (i in 1:200) { } #Additive relationship matrix -A <- A.mat(G,method="UAR") +A <- A.mat(G) } diff --git a/man/GWA.Rd b/man/GWA.Rd index 43a8a08..8086cbf 100755 --- a/man/GWA.Rd +++ b/man/GWA.Rd @@ -21,7 +21,7 @@ GWA(y, G, Z=NULL, X=NULL, K=NULL, min.MAF=0.05, \arguments{ \item{y}{ -Vector (\eqn{n \times 1}) of observations +Vector (\eqn{n \times 1}) of observations. Missing values (NA) are omitted. } \item{G}{ Matrix (\eqn{t \times m}) of genotypes for \eqn{t} lines with \eqn{m} bi-allelic markers. @@ -56,7 +56,7 @@ The use of a minimum MAF is typically adequate to ensure the problem is well-pos if an error message indicates the problem is singular, set check.rank to TRUE. This will slow down the algorithm but should fix the error. -Missing data are not allowed (\code{\link{impute}} first). +Missing marker data are not allowed (\code{\link{impute}} first). } \value{ Returns \eqn{m \times 1} vector of the marker scores, which equal \eqn{-log_{10}}(p-value) diff --git a/man/impute.Rd b/man/impute.Rd index 4fdc82d..cbfe6f1 100755 --- a/man/impute.Rd +++ b/man/impute.Rd @@ -19,7 +19,7 @@ Matrix (\eqn{n \times m}) of unphased genotypes for \eqn{n} lines and \eqn{m} ma Number of nearest neighbors to use for imputation. If not specified, the population mean is used. } \item{D}{ -Distance matrix (\eqn{n \times n}) between lines. If not specified, calculated by \code{dist}. +Distance matrix (\eqn{n \times n}) between lines. If not specified, calculated by \code{\link{dist}}. } } \details{ diff --git a/man/kinship.BLUP.Rd b/man/kinship.BLUP.Rd index 5d40920..8c9d7d4 100755 --- a/man/kinship.BLUP.Rd +++ b/man/kinship.BLUP.Rd @@ -14,15 +14,15 @@ kinship.BLUP(y, G.train, G.pred=NULL, X=NULL, Z.train=NULL, \arguments{ \item{y}{ -Vector (\eqn{n.obs \times 1}) of observations +Vector (\eqn{n.obs \times 1}) of observations. Missing values (NA) are omitted (see \code{\link{mixed.solve}}). } \item{G.train}{ Matrix (\eqn{n.train \times m}) of unphased genotypes for the training population: \eqn{n.train} lines with \eqn{m} bi-allelic markers. -Genotypes should be coded as \{-1,0,1\} = \{aa,Aa,AA\}; fractional (imputed) alleles are allowed. +Genotypes should be coded as \{-1,0,1\} = \{aa,Aa,AA\}; fractional (imputed) and missing (NA) alleles are allowed. } \item{G.pred}{ Matrix (\eqn{n.pred \times m}) of unphased genotypes for the prediction population: \eqn{n.pred} lines with \eqn{m} bi-allelic markers. -Genotypes should be coded as \{-1,0,1\} = \{aa,Aa,AA\}; fractional (imputed) alleles are allowed. +Genotypes should be coded as \{-1,0,1\} = \{aa,Aa,AA\}; fractional (imputed) and missing (NA) alleles are allowed. } \item{X}{ Design matrix (\eqn{n.obs \times p}) of fixed effects. If not passed, a vector of 1's is used @@ -33,8 +33,7 @@ to model the intercept. the identity matrix is used. } \item{K.method}{ -"RR" (default) is ridge regression (\eqn{K = G G'/m}), "GAUSS" is a Gaussian kernel (\eqn{K = e^{-D^2/\theta^2}}) and -"EXP" is an exponential kernel (\eqn{K = e^{-D/\theta}}), where Euclidean distances \eqn{D} are calculated with \code{dist}. +"RR" (default) is ridge regression, for which K is the identity-by-state additive relationship matrix computed with \code{\link{A.mat}}. The option "GAUSS" is a Gaussian kernel (\eqn{K = e^{-D^2/\theta^2}}) and "EXP" is an exponential kernel (\eqn{K = e^{-D/\theta}}), where Euclidean distances \eqn{D} are computed with \code{\link{dist}}. } \item{n.profile}{ @@ -44,9 +43,6 @@ For K.method = "GAUSS" or "EXP", the number of points to use in the log-likeliho Either "REML" (default) or "ML". } } -\details{ -Missing alleles (NA) are permitted with GAUSS and EXP but not with RR (\code{\link{impute}} first). -} \value{ \describe{ \item{$g.train}{BLUP solution for the training set} @@ -79,7 +75,7 @@ y <- g + rnorm(200,mean=0,sd=sqrt((1-h2)/h2*var(g))) #split in half for training and prediction train <- 1:100 -pred <- 1:100 +pred <- 101:200 ans <- kinship.BLUP(y=y[train],G.train=G[train,],G.pred=G[pred,],K.method="GAUSS") #correlation accuracy diff --git a/man/mixed.solve.Rd b/man/mixed.solve.Rd index 1355e5e..ed85235 100755 --- a/man/mixed.solve.Rd +++ b/man/mixed.solve.Rd @@ -15,16 +15,16 @@ of mixed models, in which there is a single variance component other than the re has a close relationship with ridge regression (ridge parameter \eqn{\lambda = \sigma_e^2 / \sigma^2_u}). } \usage{ -mixed.solve(y, Z, K=NULL, X=NULL, method="REML", +mixed.solve(y, Z=NULL, K=NULL, X=NULL, method="REML", bounds=c(1e-09, 1e+09), SE=FALSE, return.Hinv=FALSE) } \arguments{ \item{y}{ -Vector (\eqn{n \times 1}) of observations. +Vector (\eqn{n \times 1}) of observations. Missing values (NA) are omitted, along with the corresponding rows of X and Z. } \item{Z}{ -Design matrix (\eqn{n \times m}) for the random effects. +Design matrix (\eqn{n \times m}) for the random effects. If not passed, assumed to be the identity matrix. } \item{K}{ Covariance matrix (\eqn{m \times m}) for random effects; must be positive semi-definite. If not passed, assumed to @@ -99,7 +99,7 @@ ans <- mixed.solve(y,Z=G) #By default K = I accuracy <- cor(u,ans$u) #predict breeding values -ans <- mixed.solve(y,Z=diag(200),K=tcrossprod(G,G)) +ans <- mixed.solve(y,K=A.mat(G)) accuracy <- cor(g,ans$u) }