diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..c05e962 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,17 @@ +Package: FUNLDA +Type: Package +Title: Genomic Latent Dirichlet Allocation +Description: A tool for fitting latent Dirichlet allocation models to genomic annotation data. +Version: 1.0 +Date: 2017-06-26 +Author: Daniel Backenroth and Iuliana Ionita-Laza +Maintainer: Daniel Backenroth +License: GPL (>= 2) +Imports: Rcpp (>= 0.12.2) +LinkingTo: Rcpp, RcppArmadillo +LazyData: true +RoxygenNote: 6.0.1 +NeedsCompilation: yes +Packaged: 2017-07-06 13:40:52 UTC; db2175 +Repository: CRAN +Date/Publication: 2017-07-07 04:52:52 UTC diff --git a/MD5 b/MD5 new file mode 100644 index 0000000..21d9b75 --- /dev/null +++ b/MD5 @@ -0,0 +1,22 @@ +ef25f3000c5db96556f19d2a086bdd41 *DESCRIPTION +c0b219166fed468c36e095bbdf768d5c *NAMESPACE +0bd4c6e0488266fe4ccc30125308197e *R/FunLDA.R +b5d429f3bd5f5c95fa58bda9b7a64454 *R/RcppExports.R +628e08f4ae439d45a6ff1c2429bf6f86 *R/data.R +9999740f97072d628484816cf3488ade *R/ebmme.R +d0d253afd511ba598b6a1c3d5bfcc79f *R/help.R +a5d536425f704efb3fe4d9885ca18962 *data/tissuenew.RData +21d8c657a9091e5f3b9e19d8c00ee787 *data/training.RData +c4f122c5edd100b038a0cc2d5881bd75 *man/FUNLDA.Rd +48b1d099e90f47bd6f5e3d6e95cf42b2 *man/FitLDAModel.Rd +f65a367be37872ad280c022714e8efc4 *man/FitLDAModelNewTissues.Rd +141a347399ac43130bfa4429c60c9ce7 *man/Predict.Rd +f786b9357d4afaf6e500db982881aa42 *man/tissuenew.Rd +5a9b82d4629cc2bb4b2175ae5306d6ab *man/training.Rd +2a6f9e9e044a78154d3cfda5936d6f48 *src/Makevars +8d46d69896a1f1d31ed76035a0e49d67 *src/Makevars.win +33e80dc519fe3bbc4a571e252ea0d23a *src/RcppExports.cpp +695a279cd82d411d2cf09a00d458a346 *src/ebmme_cpp_mine_binned2.cpp +e0669d61e598b7ee6778834b65bd2da7 *src/packagename_init.c +6c4589ed2dfa308d9382601c2c04244d *src/utils.cpp +a612da8cd6d0799816296517d2888c14 *src/utils.h diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..1b6a59e --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,7 @@ +# Generated by roxygen2: do not edit by hand + +export(FitLDAModel) +export(FitLDAModelNewTissues) +export(Predict) +importFrom(Rcpp,sourceCpp) +useDynLib(FUNLDA, .registration = TRUE) diff --git a/R/FunLDA.R b/R/FunLDA.R new file mode 100644 index 0000000..6fa68df --- /dev/null +++ b/R/FunLDA.R @@ -0,0 +1,163 @@ +#' Predict posterior probabilities for variants to be in +#' each cluster in a fitted LDA model +#' +#' @param predict.data A data frame with character-valued columns +#' rs and cat and numeric-valued columns with annotations. +#' Each row contains data for one SNP in one tissue. +#' rs is an ID for the SNP, which need not be unique, and cat +#' is an ID for each tissue, which must match an ID +#' for which training data was included when fitting the +#' LDA model. Aannotation columns must have column names matching +#' those supplied when fitting the LDA model. +#' @param summary.c A fitted LDA model created using +#' \code{\link{FitLDAModel}} or \code{\link{FitLDAModelNewTissues}}. +#' @return p.labeled, a data frame with one row per training variant +#' with the posterior probability of each variant to be in each +#' each cluster (with column names CLUSTER1,...) and columns +#' cat and rs. +#' @examples +#' \dontrun{ +#' data(training) +#' summary.c <- FitLDAModel(training.data=training, nclust=3, +#' kde.nbins=100, iters=50, inner.iters=50) +#' pred <- Predict(predict.data=training, summary.c=summary.c) +#' } +#' @export +Predict <- function(predict.data, summary.c){ + predict.data <- as.data.frame(predict.data) + z <- predict.data[, colnames(summary.c$data)] + cat.dict <- summary.c$categories$ncat + names(cat.dict) <- summary.c$categories$cat + cat <- cat.dict[predict.data$cat] + pred <- ebmme.Predict(data=z, cat=cat, + block_id=summary.c$block.id, + nclust=summary.c$nclust, + levels=summary.c$levels.data, + fs_binned=summary.c$fs_binned, + a=summary.c$a) + p.labeled <- as.data.frame(t(pred$p)) + colnames(p.labeled) <- paste0("CLUSTER", 1:ncol(p.labeled)) + p.labeled <- cbind(p.labeled, predict.data[, c("cat", "rs")]) + return(p.labeled) +} + +#' Update the tissues in a fitted LDA model +#' +#' @param newtissue.data A data frame with character-valued columns +#' rs and cat and numeric-valued columns with annotations. +#' Each row contains data for one SNP in one tissue +#' rs is an ID for the SNP, which need not be unique, and +#' cat is an ID for each tissue, which must match an ID +#' for which training data was included when fitting the +#' LDA model. Annotation columns must have column names matching +#' those supplied when fitting the LDA model. +#' @inheritParams Predict +#' @inheritParams FitLDAModel +#' @return a fitted LDA model, as returned by \code{\link{FitLDAModel}}, +#' that can be used with \code{\link{Predict}} for +#' the tissues in newtissue.data +#' @examples +#' \dontrun{ +#' data(training) +#' summary.c <- FitLDAModel(training.data=training, nclust=3, +#' kde.nbins=100, iters=50, inner.iters=50) +#' data(tissuenew) +#' newtissues <- FitLDAModelNewTissues(newtissue.data=tissuenew, +#' summary.c=summary.c, +#' inner.iters=50) +#' } +#' @export +FitLDAModelNewTissues <- function(newtissue.data, summary.c, inner.iters=200){ + newtissue.data <- as.data.frame(newtissue.data) + z <- newtissue.data[, colnames(summary.c$data)] + cat <- newtissue.data$cat + ncat <- as.numeric(as.factor(cat)) - 1 + df <- unique(data.frame(cat=cat, ncat=ncat)) + pred <- ebmme.FitNewTissue(data=z, cat=ncat, + block_id=summary.c$block.id, + nclust=summary.c$nclust, + levels=summary.c$levels.data, + fs_binned=summary.c$fs_binned, + inner.iters=inner.iters) + p.labeled <- as.data.frame(t(pred$p)) + colnames(p.labeled) <- paste0("CLUSTER", 1:ncol(p.labeled)) + p.labeled <- cbind(p.labeled, newtissue.data[, c("cat", "rs")]) + a.labeled <- as.data.frame(t(pred$a)) + colnames(a.labeled) <- paste0("CLUSTER", 1:ncol(a.labeled)) + a.labeled$ncat <- 0:max(df$ncat) + a.labeled <- merge(a.labeled, df) + a.labeled <- a.labeled[, !colnames(a.labeled)=="ncat"] + summary.c$a <- pred$a + summary.c$p <- pred$p + summary.c$data <- z + summary.c$cat <- ncat + summary.c$all_as <- NULL + summary.c$data.labeled <- NULL + summary.c$categories <- df + summary.c$a.labeled <- a.labeled + summary.c$p.labeled <- p.labeled + return(summary.c) +} + +#' fit an LDA model +#' @param training.data A data frame with character-valued columns +#' rs and cat and numeric-valued columns with annotations. +#' Each row is data for one SNP in one tissue. +#' rs is an ID for the SNP, which need not be unique, and +#' cat is an ID for each tissue. +#' @param nclust Integer specifying the number of clusters to estimate +#' @param kde.nbins Integer specifying how many bins to use for binning +#' each annotation +#' @param iters Integer specifying number of outer iterations +#' @param inner.iters Integer specifying number of inner iterations +#' @return A fitted LDA model, i.e., a list (apart from elements +#' used internally) with elements +#' \describe{ +#' \item{p.labeled}{a data frame with one row per +#' training variant, with +#' the posterior probability for each +#' variant to be in +#' each cluster in columns CLUSTER1,... and +#' also with columns cat and rs} +#' \item{a.labeled}{a data frame with one row per tissue +#' with membership vectors for each tissue +#' with columns cat and CLUSTER1,...} +#' } +#' @examples +#' \dontrun{ +#' data(training) +#' summary.c <- FitLDAModel(training.data=training, nclust=3, +#' kde.nbins=100, iters=50, inner.iters=50) +#' } +#' @export +FitLDAModel <- function(training.data, nclust=9, kde.nbins=1000, iters=250, + inner.iters=200){ + training.data <- as.data.frame(training.data) + cat <- training.data$cat + ncat <- as.numeric(as.factor(cat)) - 1 + df <- unique(data.frame(cat=cat, ncat=ncat)) + training.data <- as.data.frame(training.data) + z <- training.data[, !colnames(training.data) %in% c("rs", "cat")] + summary.c <- ebmme.lda(data= z, cat=ncat, block.id = 1:ncol(z), + iters = iters, + inner.iters = inner.iters, nclust=nclust, + kde.nbins=kde.nbins) + # fix up p and a + training.data$order <- 1:nrow(training.data) + data.labeled <- merge(training.data, df) + data.labeled <- data.labeled[order(data.labeled$order), ] + data.labeled <- data.labeled[, !colnames(data.labeled) %in% "order"] + summary.c$data.labeled <- data.labeled + summary.c$categories <- df + a.labeled <- as.data.frame(t(summary.c$a)) + colnames(a.labeled) <- paste0("CLUSTER", 1:ncol(a.labeled)) + a.labeled$ncat <- 0:max(df$ncat) + a.labeled <- merge(a.labeled, df) + a.labeled <- a.labeled[, !colnames(a.labeled)=="ncat"] + summary.c$a.labeled <- a.labeled + p.labeled <- as.data.frame(t(summary.c$p)) + colnames(p.labeled) <- paste0("CLUSTER", 1:ncol(p.labeled)) + p.labeled <- cbind(p.labeled, data.labeled[, c("cat", "rs")]) + summary.c$p.labeled <- p.labeled + summary.c +} diff --git a/R/RcppExports.R b/R/RcppExports.R new file mode 100644 index 0000000..656a7ee --- /dev/null +++ b/R/RcppExports.R @@ -0,0 +1,14 @@ +# This file was generated by Rcpp::compileAttributes +# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +newtissue <- function(cat_, block_id_, nclust_, bins_, fs_binned_, alpha_, inner_iters_) { + .Call('FUNLDA_newtissue', PACKAGE = 'FUNLDA', cat_, block_id_, nclust_, bins_, fs_binned_, alpha_, inner_iters_) +} + +ebmme_cpp_binned <- function(dat_, cat_, block_id_, H1_inv_, p_, alpha_, iters_, inner_iters_, nclust_, bins_, bin_data_, kde_nbins_) { + .Call('FUNLDA_ebmme_cpp_binned', PACKAGE = 'FUNLDA', dat_, cat_, block_id_, H1_inv_, p_, alpha_, iters_, inner_iters_, nclust_, bins_, bin_data_, kde_nbins_) +} + +predictlogsum <- function(cat_, block_id_, nclust_, bins_, fs_binned_, a_) { + .Call('FUNLDA_predictlogsum', PACKAGE = 'FUNLDA', cat_, block_id_, nclust_, bins_, fs_binned_, a_) +} diff --git a/R/data.R b/R/data.R new file mode 100644 index 0000000..1d28bf8 --- /dev/null +++ b/R/data.R @@ -0,0 +1,12 @@ +#' Toy training data +#' +#' A dataset containing genomic annotations for 4800 variants, 200 +#' variants for each of 24 tissues +"training" + +#' Data for a tissue not used to to fit the LDA model +#' +#' A dataset containing genomic annotations for 200 variants +#' for a tissue not included in dataset training +"tissuenew" + diff --git a/R/ebmme.R b/R/ebmme.R new file mode 100755 index 0000000..8cfb81e --- /dev/null +++ b/R/ebmme.R @@ -0,0 +1,121 @@ +ebmme.FitNewTissue <- function(data, cat, block_id, nclust, levels, + fs_binned, inner.iters){ + + bins <- matrix(NA, nrow=nrow(data), ncol=ncol(data)) + for (i in 1:ncol(data)){ + levs <- unlist(strsplit(levels[, i], ",")) + levs <- gsub("\\(", "", levs) + levs <- gsub("]", "", levs) + levs <- sort(as.numeric(unique(levs))) + levs[1] <- min(levs[1], min(data[, i])) + levs[length(levs)] <- max(levs[length(levs)], max(data[, i])) + bins[, i] <- cut(data[, i], breaks=levs, include.lowest=T) + bins[, i] <- bins[, i] - 1 + } + fs_binned_num <- as.numeric(fs_binned) + fs_binned_num[fs_binned_num==0] = min(fs_binned_num[fs_binned_num>0]) + dims <- dim(fs_binned) + fs_binned <- array(fs_binned_num, dim=dims) + alpha<-vector('numeric',nclust) + for (k in 1:nclust){ + alpha[k] <- 1; + } + newtissue <- newtissue(cat, block_id, nclust, bins, fs_binned, alpha, inner.iters) + p <- newtissue$p + a <- newtissue$a + list(p=p, a=a) +} + +ebmme.Predict <- function(data, cat, block_id, nclust, levels, + fs_binned, a){ + + bins <- matrix(NA, nrow=nrow(data), ncol=ncol(data)) + for (i in 1:ncol(data)){ + levs <- unlist(strsplit(levels[, i], ",")) + levs <- gsub("\\(", "", levs) + levs <- gsub("]", "", levs) + levs <- sort(as.numeric(unique(levs))) + levs[1] <- min(levs[1], min(data[, i])) + levs[length(levs)] <- max(levs[length(levs)], max(data[, i])) + bins[, i] <- cut(data[, i], breaks=levs, include.lowest=T) + bins[, i] <- bins[, i] - 1 + } + fs_binned_num <- as.numeric(fs_binned) + fs_binned_num[fs_binned_num==0] = min(fs_binned_num[fs_binned_num>0]) + dims <- dim(fs_binned) + fs_binned <- array(fs_binned_num, dim=dims) + prediction <- predictlogsum(cat, block_id, nclust, bins, fs_binned, a) + p <- prediction$p + a <- prediction$a + f <- prediction$f + list(p=p, a=a, f=f) +} + +# cat must start with 0 and consist of consecutive integers for the different categories +ebmme.lda <- function(data, cat, block.id, iters = 100, + inner.iters = 100, + nclust=10, kde.nbins=NULL) +{ + set.seed(1) + data <- as.matrix(data) + m <- nrow(data) + k <- ncol(data) + B <- length(unique(block.id)) + + H.inv <- matrix(0, k, k) + for (l in 1:k){ + data.l <- as.matrix( data[, l] ) + h <- 0.9*m^-0.2*min(stats::sd(data.l), stats::IQR(data.l)/1.34) + H.inv[l, l] <- 1/h + } + + H1.inv <- H.inv + rm(H.inv) + # initialize p + p <- matrix(0, nclust, m) + for (k in 1:nclust) + for (i in 1:m) + p[k,i]=stats::runif(1); + for (k in 1:nclust) + for (i in 1:m) + p[k,i]=p[k,i]/sum(p[,i]); + + + # initialize alpha + alpha<-vector('numeric',nclust) + for (k in 1:nclust){ + alpha[k] <- 1; + } + + bins <- matrix(NA, nrow=nrow(data), ncol=ncol(data)) + bin.data <- matrix(NA, nrow=kde.nbins, ncol=ncol(data)) + levels.data <- matrix(NA, nrow=kde.nbins, ncol=ncol(data)) + colnames(levels.data) <- colnames(data) + for (i in 1:ncol(data)){ + # factor information is lost when put into matrix + cc <- cut(data[, i], breaks=kde.nbins) + bins[, i] <- cc + bottoms <- unlist(strsplit(levels(cc), ","))[c(T,F)] + tops <- unlist(strsplit(levels(cc), ","))[c(F,T)] + bottoms <- as.numeric(gsub("\\(", "", bottoms)) + tops <- as.numeric(gsub("]", "", tops)) + bin.data[, i] <- (bottoms + tops) / 2 + levels.data[, i] <- levels(cc) + } + # need to change array indexing for C++, bin number and category number should start from 0 + bins <- bins - 1 + fit <- ebmme_cpp_binned(data, cat, block.id, H1.inv, p, alpha, iters, + inner.iters, nclust, + bins, bin.data, kde.nbins) + p <- as.matrix( fit$p ) + alpha <- as.vector( fit$alpha ) + a <- as.matrix( fit$a ) + f <- as.matrix( fit$f ) + return(list(p=p, alpha=alpha, + a=a, f=f, fs_binned=fit$fs_binned, + bins=bins, bin.data=bin.data, kde.nbins=kde.nbins, + data=data, cat=cat, block.id=block.id, H1.inv=H1.inv, + iters=iters, inner.iters=inner.iters, + nclust=nclust, + levels.data=levels.data, all_as=fit$all_as)) +} diff --git a/R/help.R b/R/help.R new file mode 100644 index 0000000..75c479c --- /dev/null +++ b/R/help.R @@ -0,0 +1,7 @@ +#'FUNLDA - a latent Dirichlet allocation package for genomic annotations +#' @docType package +#' @name FUNLDA +#' @useDynLib FUNLDA, .registration = TRUE +#' @importFrom Rcpp sourceCpp +NULL + diff --git a/data/tissuenew.RData b/data/tissuenew.RData new file mode 100644 index 0000000..463ffe2 Binary files /dev/null and b/data/tissuenew.RData differ diff --git a/data/training.RData b/data/training.RData new file mode 100644 index 0000000..b0d337b Binary files /dev/null and b/data/training.RData differ diff --git a/man/FUNLDA.Rd b/man/FUNLDA.Rd new file mode 100644 index 0000000..ead75ac --- /dev/null +++ b/man/FUNLDA.Rd @@ -0,0 +1,10 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/help.R +\docType{package} +\name{FUNLDA} +\alias{FUNLDA} +\alias{FUNLDA-package} +\title{FUNLDA - a latent Dirichlet allocation package for genomic annotations} +\description{ +FUNLDA - a latent Dirichlet allocation package for genomic annotations +} diff --git a/man/FitLDAModel.Rd b/man/FitLDAModel.Rd new file mode 100644 index 0000000..1f2b6bb --- /dev/null +++ b/man/FitLDAModel.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FunLDA.R +\name{FitLDAModel} +\alias{FitLDAModel} +\title{fit an LDA model} +\usage{ +FitLDAModel(training.data, nclust = 9, kde.nbins = 1000, iters = 250, + inner.iters = 200) +} +\arguments{ +\item{training.data}{A data frame with character-valued columns +rs and cat and numeric-valued columns with annotations. +Each row is data for one SNP in one tissue. +rs is an ID for the SNP, which need not be unique, and +cat is an ID for each tissue.} + +\item{nclust}{Integer specifying the number of clusters to estimate} + +\item{kde.nbins}{Integer specifying how many bins to use for binning +each annotation} + +\item{iters}{Integer specifying number of outer iterations} + +\item{inner.iters}{Integer specifying number of inner iterations} +} +\value{ +A fitted LDA model, i.e., a list (apart from elements + used internally) with elements + \describe{ + \item{p.labeled}{a data frame with one row per + training variant, with + the posterior probability for each + variant to be in + each cluster in columns CLUSTER1,... and + also with columns cat and rs} + \item{a.labeled}{a data frame with one row per tissue + with membership vectors for each tissue + with columns cat and CLUSTER1,...} + } +} +\description{ +fit an LDA model +} +\examples{ +\dontrun{ + data(training) + summary.c <- FitLDAModel(training.data=training, nclust=3, + kde.nbins=100, iters=50, inner.iters=50) +} +} diff --git a/man/FitLDAModelNewTissues.Rd b/man/FitLDAModelNewTissues.Rd new file mode 100644 index 0000000..8ed14b0 --- /dev/null +++ b/man/FitLDAModelNewTissues.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FunLDA.R +\name{FitLDAModelNewTissues} +\alias{FitLDAModelNewTissues} +\title{Update the tissues in a fitted LDA model} +\usage{ +FitLDAModelNewTissues(newtissue.data, summary.c, inner.iters = 200) +} +\arguments{ +\item{newtissue.data}{A data frame with character-valued columns +rs and cat and numeric-valued columns with annotations. +Each row contains data for one SNP in one tissue +rs is an ID for the SNP, which need not be unique, and +cat is an ID for each tissue, which must match an ID +for which training data was included when fitting the +LDA model. Annotation columns must have column names matching +those supplied when fitting the LDA model.} + +\item{summary.c}{A fitted LDA model created using +\code{\link{FitLDAModel}} or \code{\link{FitLDAModelNewTissues}}.} + +\item{inner.iters}{Integer specifying number of inner iterations} +} +\value{ +a fitted LDA model, as returned by \code{\link{FitLDAModel}}, + that can be used with \code{\link{Predict}} for + the tissues in newtissue.data +} +\description{ +Update the tissues in a fitted LDA model +} +\examples{ +\dontrun{ + data(training) + summary.c <- FitLDAModel(training.data=training, nclust=3, + kde.nbins=100, iters=50, inner.iters=50) + data(tissuenew) + newtissues <- FitLDAModelNewTissues(newtissue.data=tissuenew, + summary.c=summary.c, + inner.iters=50) +} +} diff --git a/man/Predict.Rd b/man/Predict.Rd new file mode 100644 index 0000000..57e16e6 --- /dev/null +++ b/man/Predict.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FunLDA.R +\name{Predict} +\alias{Predict} +\title{Predict posterior probabilities for variants to be in +each cluster in a fitted LDA model} +\usage{ +Predict(predict.data, summary.c) +} +\arguments{ +\item{predict.data}{A data frame with character-valued columns +rs and cat and numeric-valued columns with annotations. +Each row contains data for one SNP in one tissue. +rs is an ID for the SNP, which need not be unique, and cat +is an ID for each tissue, which must match an ID +for which training data was included when fitting the +LDA model. Aannotation columns must have column names matching +those supplied when fitting the LDA model.} + +\item{summary.c}{A fitted LDA model created using +\code{\link{FitLDAModel}} or \code{\link{FitLDAModelNewTissues}}.} +} +\value{ +p.labeled, a data frame with one row per training variant + with the posterior probability of each variant to be in each + each cluster (with column names CLUSTER1,...) and columns + cat and rs. +} +\description{ +Predict posterior probabilities for variants to be in +each cluster in a fitted LDA model +} +\examples{ +\dontrun{ + data(training) + summary.c <- FitLDAModel(training.data=training, nclust=3, + kde.nbins=100, iters=50, inner.iters=50) + pred <- Predict(predict.data=training, summary.c=summary.c) +} +} diff --git a/man/tissuenew.Rd b/man/tissuenew.Rd new file mode 100644 index 0000000..95e3373 --- /dev/null +++ b/man/tissuenew.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{tissuenew} +\alias{tissuenew} +\title{Data for a tissue not used to to fit the LDA model} +\format{An object of class \code{data.table} (inherits from \code{data.frame}) with 200 rows and 7 columns.} +\usage{ +tissuenew +} +\description{ +A dataset containing genomic annotations for 200 variants +for a tissue different than the 24 tissues variants for which +are included in training +} +\keyword{datasets} diff --git a/man/training.Rd b/man/training.Rd new file mode 100644 index 0000000..8460a90 --- /dev/null +++ b/man/training.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{training} +\alias{training} +\title{Toy training data} +\format{An object of class \code{data.table} (inherits from \code{data.frame}) with 4800 rows and 7 columns.} +\usage{ +training +} +\description{ +A dataset containing genomic annotations for 4800 variants, 200 +variants for each of 24 tissues +} +\keyword{datasets} diff --git a/src/Makevars b/src/Makevars new file mode 100644 index 0000000..22ebc63 --- /dev/null +++ b/src/Makevars @@ -0,0 +1 @@ +PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) diff --git a/src/Makevars.win b/src/Makevars.win new file mode 100644 index 0000000..9542baf --- /dev/null +++ b/src/Makevars.win @@ -0,0 +1,2 @@ + +PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp new file mode 100644 index 0000000..7204a15 --- /dev/null +++ b/src/RcppExports.cpp @@ -0,0 +1,62 @@ +// This file was generated by Rcpp::compileAttributes +// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#include + +using namespace Rcpp; + +// newtissue +RcppExport SEXP newtissue(SEXP cat_, SEXP block_id_, SEXP nclust_, SEXP bins_, SEXP fs_binned_, SEXP alpha_, SEXP inner_iters_); +RcppExport SEXP FUNLDA_newtissue(SEXP cat_SEXP, SEXP block_id_SEXP, SEXP nclust_SEXP, SEXP bins_SEXP, SEXP fs_binned_SEXP, SEXP alpha_SEXP, SEXP inner_iters_SEXP) { +BEGIN_RCPP + Rcpp::RObject __result; + Rcpp::RNGScope __rngScope; + Rcpp::traits::input_parameter< SEXP >::type cat_(cat_SEXP); + Rcpp::traits::input_parameter< SEXP >::type block_id_(block_id_SEXP); + Rcpp::traits::input_parameter< SEXP >::type nclust_(nclust_SEXP); + Rcpp::traits::input_parameter< SEXP >::type bins_(bins_SEXP); + Rcpp::traits::input_parameter< SEXP >::type fs_binned_(fs_binned_SEXP); + Rcpp::traits::input_parameter< SEXP >::type alpha_(alpha_SEXP); + Rcpp::traits::input_parameter< SEXP >::type inner_iters_(inner_iters_SEXP); + __result = Rcpp::wrap(newtissue(cat_, block_id_, nclust_, bins_, fs_binned_, alpha_, inner_iters_)); + return __result; +END_RCPP +} +// ebmme_cpp_binned +RcppExport SEXP ebmme_cpp_binned(SEXP dat_, SEXP cat_, SEXP block_id_, SEXP H1_inv_, SEXP p_, SEXP alpha_, SEXP iters_, SEXP inner_iters_, SEXP nclust_, SEXP bins_, SEXP bin_data_, SEXP kde_nbins_); +RcppExport SEXP FUNLDA_ebmme_cpp_binned(SEXP dat_SEXP, SEXP cat_SEXP, SEXP block_id_SEXP, SEXP H1_inv_SEXP, SEXP p_SEXP, SEXP alpha_SEXP, SEXP iters_SEXP, SEXP inner_iters_SEXP, SEXP nclust_SEXP, SEXP bins_SEXP, SEXP bin_data_SEXP, SEXP kde_nbins_SEXP) { +BEGIN_RCPP + Rcpp::RObject __result; + Rcpp::RNGScope __rngScope; + Rcpp::traits::input_parameter< SEXP >::type dat_(dat_SEXP); + Rcpp::traits::input_parameter< SEXP >::type cat_(cat_SEXP); + Rcpp::traits::input_parameter< SEXP >::type block_id_(block_id_SEXP); + Rcpp::traits::input_parameter< SEXP >::type H1_inv_(H1_inv_SEXP); + Rcpp::traits::input_parameter< SEXP >::type p_(p_SEXP); + Rcpp::traits::input_parameter< SEXP >::type alpha_(alpha_SEXP); + Rcpp::traits::input_parameter< SEXP >::type iters_(iters_SEXP); + Rcpp::traits::input_parameter< SEXP >::type inner_iters_(inner_iters_SEXP); + Rcpp::traits::input_parameter< SEXP >::type nclust_(nclust_SEXP); + Rcpp::traits::input_parameter< SEXP >::type bins_(bins_SEXP); + Rcpp::traits::input_parameter< SEXP >::type bin_data_(bin_data_SEXP); + Rcpp::traits::input_parameter< SEXP >::type kde_nbins_(kde_nbins_SEXP); + __result = Rcpp::wrap(ebmme_cpp_binned(dat_, cat_, block_id_, H1_inv_, p_, alpha_, iters_, inner_iters_, nclust_, bins_, bin_data_, kde_nbins_)); + return __result; +END_RCPP +} +// predictlogsum +RcppExport SEXP predictlogsum(SEXP cat_, SEXP block_id_, SEXP nclust_, SEXP bins_, SEXP fs_binned_, SEXP a_); +RcppExport SEXP FUNLDA_predictlogsum(SEXP cat_SEXP, SEXP block_id_SEXP, SEXP nclust_SEXP, SEXP bins_SEXP, SEXP fs_binned_SEXP, SEXP a_SEXP) { +BEGIN_RCPP + Rcpp::RObject __result; + Rcpp::RNGScope __rngScope; + Rcpp::traits::input_parameter< SEXP >::type cat_(cat_SEXP); + Rcpp::traits::input_parameter< SEXP >::type block_id_(block_id_SEXP); + Rcpp::traits::input_parameter< SEXP >::type nclust_(nclust_SEXP); + Rcpp::traits::input_parameter< SEXP >::type bins_(bins_SEXP); + Rcpp::traits::input_parameter< SEXP >::type fs_binned_(fs_binned_SEXP); + Rcpp::traits::input_parameter< SEXP >::type a_(a_SEXP); + __result = Rcpp::wrap(predictlogsum(cat_, block_id_, nclust_, bins_, fs_binned_, a_)); + return __result; +END_RCPP +} diff --git a/src/ebmme_cpp_mine_binned2.cpp b/src/ebmme_cpp_mine_binned2.cpp new file mode 100755 index 0000000..ec114f3 --- /dev/null +++ b/src/ebmme_cpp_mine_binned2.cpp @@ -0,0 +1,316 @@ +#include "utils.h" +#include + +// [[Rcpp::depends(RcppArmadillo)]] + +using namespace Rcpp; +using namespace arma; + + +// [[Rcpp::export]] +RcppExport SEXP newtissue(SEXP cat_, SEXP block_id_, + SEXP nclust_, + SEXP bins_, + SEXP fs_binned_, + SEXP alpha_, + SEXP inner_iters_) +{ + mat bins = as(bins_); + int m = bins.n_rows; + int inner_iters = as(inner_iters_); + + vec cat = as(cat_); + vec cat_id = unique(cat); + int ncat = cat_id.n_elem; + int nclust = as(nclust_); + + mat p = zeros(nclust,m); + vec block_id = as(block_id_); + vec ublock_id = unique(block_id); + int B = ublock_id.n_elem; + + cube fs = zeros(m, B, nclust); + + mat f = zeros(m, nclust); + cube fs_binned = as(fs_binned_); + + + vec alpha = as(alpha_); + mat a = zeros(nclust,ncat); + + mat w_sum = zeros(nclust,ncat); + + for (int i = 0; i < m; i++){ + for (int k = 0; k < nclust; k++){ + for (int b=0; b < B; b++){ + fs(i, b, k) = fs_binned(bins(i, b), b, k); + } + } + } + for (int i = 0; i < m; i++){ + for (int k = 0; k < nclust; k++){ + f(i, k) = accu(log(fs.slice(k).row(i))); + } + } + for(int i = 0; i < inner_iters; i++){ + vec tsum=zeros(m); + for (int ct=0; ct < ncat; ct++) + for (int k=0; k < nclust; k++) + { + w_sum(k,ct)=0; + for(int j = 0; j < m; j++) + if (cat(j)==ct) w_sum(k,ct)=w_sum(k,ct)+p(k,j); + } + + for (int k=0; k < nclust; k++){ + for (int ct=0; ct < ncat; ct++){ + a(k,ct) = alpha(k) + w_sum(k,ct); + } + } + for(int j = 0; j < m; j++){ + for (int k=0; k < nclust; k++){ + p(k,j) = digamma(a(k,cat(j))) + f(j,k); + } + double maxv = max(p.col(j)); + for (int k=0; k < nclust; k++){ + p(k,j) = p(k,j) - maxv; + } + } + for(int j = 0; j < m; j++){ + double s = accu(exp(p.col(j))); + for (int k=0; k(dat_); + vec cat = as(cat_); + mat bins = as(bins_); + mat bin_data = as(bin_data_); + int m = dat.n_rows; + + vec cat_id = unique(cat); + int ncat = cat_id.n_elem; + + int iters = as(iters_); + int inner_iters = as(inner_iters_); + int nclust = as(nclust_); + int nbins = as(kde_nbins_); + + mat H1_inv = as(H1_inv_); + mat p = as(p_); + + vec alpha = as(alpha_); + mat a = zeros(nclust,ncat); + cube all_as = zeros(nclust,ncat,iters); + + vec alpha0=zeros(nclust); + alpha0 = alpha; + + mat w_sum = zeros(nclust,ncat); + + vec block_id = as(block_id_); + vec ublock_id = unique(block_id); + int B = ublock_id.n_elem; + + mat f = zeros(m, nclust); + + vec df = zeros(nclust); + mat d2f = zeros(nclust, nclust); + mat d2f_inv = zeros(nclust, nclust); + + vec lb = zeros(iters); + + cube fs_binned_save; + + for (int iter = 0; iter < iters; iter++){ + Rcpp::checkUserInterrupt(); + Rcout << '.'; + cube fs = zeros(m, B, nclust); + cube fs_binned = zeros(nbins, B, nclust); + vec p_sum = sum(p, 1); + for (int b = 0; b < B; b++){ + mat p_binned = zeros(nclust, nbins); + for (int i = 0; i < m; i++){ + for (int k = 0; k < nclust; k++){ + p_binned(k, bins(i, b)) += p(k, i); + } + } + uvec index = find(block_id == ublock_id(b)); + mat H1_inv_b = H1_inv.submat(index, index); + for(int i = 0; i < nbins; i++){ + rowvec zi = bin_data.row(i); + for(int j = 0; j < nbins; j++){ + rowvec zj = bin_data.row(j); + vec x = zi(index) - zj(index); + if (!x.has_nan()){ + for (int k = 0; k < nclust; k++){ + fs_binned(i, b, k) += p_binned(k, j) * exp(-.5 * x(0) * H1_inv_b(0,0) * x(0)); + } + } + } + } + } + + for (int k = 0; k < nclust; k++){ + for (int b=0; b < B; b++){ + uvec index = find(block_id == ublock_id(b)); + mat H1_inv_b = H1_inv.submat(index, index); + double det_H1_inv_b = arma::det(H1_inv_b); + for (int i = 0; i < nbins; i++){ + fs_binned(i, b, k) *= sqrt(det_H1_inv_b) / p_sum(k); + } + } + } + fs_binned_save = fs_binned; + for (int i = 0; i < m; i++){ + for (int k = 0; k < nclust; k++){ + for (int b=0; b < B; b++){ + fs(i, b, k) = fs_binned(bins(i, b), b, k); + } + } + } + + for (int i = 0; i < m; i++){ + for (int k = 0; k < nclust; k++){ + f(i, k) = prod(fs.slice(k).row(i)); + } + } + + // the inner loop to estimate the variational parameters + for(int i = 0; i < inner_iters; i++){ + Rcpp::checkUserInterrupt(); + vec tsum=zeros(m); + for (int ct=0; ct < ncat; ct++) + for (int k=0; k < nclust; k++) + { + w_sum(k,ct)=0; + for(int j = 0; j < m; j++) + if (cat(j)==ct) w_sum(k,ct)=w_sum(k,ct)+p(k,j); + } + + for (int k=0; k < nclust; k++){ + for (int ct=0; ct < ncat; ct++){ + a(k,ct) = alpha(k) + w_sum(k,ct); + all_as(k,ct,iter) = a(k,ct); + } + } + for (int k=0; k < nclust; k++) + for(int j = 0; j < m; j++) + { + p(k,j) = exp(digamma(a(k,cat(j)))) * f(j,k); + tsum(j)=tsum(j)+p(k,j); + } + + + for (int k=0; k(cube_); + return List::create(Named("cube") = d); +} + + +// [[Rcpp::export]] +RcppExport SEXP predictlogsum(SEXP cat_, SEXP block_id_, + SEXP nclust_, + SEXP bins_, + SEXP fs_binned_, SEXP a_) +{ + mat bins = as(bins_); + int m = bins.n_rows; + + vec cat = as(cat_); + + int nclust = as(nclust_); + + mat p = zeros(nclust,m); + vec block_id = as(block_id_); + vec ublock_id = unique(block_id); + int B = ublock_id.n_elem; + + cube fs = zeros(m, B, nclust); + + mat f = zeros(m, nclust); + cube fs_binned = as(fs_binned_); + mat a = as(a_); + for (int i = 0; i < m; i++){ + for (int k = 0; k < nclust; k++){ + for (int b=0; b < B; b++){ + fs(i, b, k) = fs_binned(bins(i, b), b, k); + } + } + } + for (int i = 0; i < m; i++){ + for (int k = 0; k < nclust; k++){ + f(i, k) = accu(log(fs.slice(k).row(i))); + } + } + for(int j = 0; j < m; j++){ + for (int k=0; k < nclust; k++){ + p(k,j) = digamma(a(k,cat(j))) + f(j,k); + } + double maxv = max(p.col(j)); + for (int k=0; k < nclust; k++){ + p(k,j) = p(k,j) - maxv; + } + } + for(int j = 0; j < m; j++){ + double s = accu(exp(p.col(j))); + for (int k=0; k +#include +#include // for NULL +#include + +/* FIXME: + * Check these declarations against the C/Fortran source code. + * */ + +/* .Call calls */ +extern SEXP FUNLDA_ebmme_cpp_binned(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP FUNLDA_newtissue(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP FUNLDA_predictlogsum(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); + +static const R_CallMethodDef CallEntries[] = { + {"FUNLDA_ebmme_cpp_binned", (DL_FUNC) &FUNLDA_ebmme_cpp_binned, 12}, + {"FUNLDA_newtissue", (DL_FUNC) &FUNLDA_newtissue, 7}, + {"FUNLDA_predictlogsum", (DL_FUNC) &FUNLDA_predictlogsum, 6}, + {NULL, NULL, 0} +}; + +void R_init_FUNLDA(DllInfo *dll) +{ + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); +} + diff --git a/src/utils.cpp b/src/utils.cpp new file mode 100755 index 0000000..cb70f6c --- /dev/null +++ b/src/utils.cpp @@ -0,0 +1,66 @@ +#include "utils.h" + +double gamma(double x) +{ + double y = Rf_gammafn(x); + return y; +} + +double lgamma(double x) +{ + double y = Rf_lgammafn(x); + return y; +} + +double digamma(double x) +{ + double y = Rf_digamma(x); + return y; +} + +double trigamma(double x) +{ + double y = Rf_trigamma(x); + return y; +} + +vec gamma_vec(vec x) +{ + int n = x.n_elem; + vec y = zeros(n); + for(int i = 0; i < n; i++){ + y(i) = Rf_gammafn(x(i)); + } + return y; +} + +vec lgamma_vec(vec x) +{ + int n = x.n_elem; + vec y = zeros(n); + for(int i = 0; i < n; i++){ + y(i) = Rf_lgammafn(x(i)); + } + return(y); +} + +vec digamma_vec(vec x) +{ + int n = x.n_elem; + vec y = zeros(n); + for(int i = 0; i < n; i++){ + y(i) = Rf_digamma(x(i)); + } + return y; +} + +vec trigamma_vec(vec x) +{ + int n = x.n_elem; + vec y = zeros(n); + for(int i = 0; i < n; i++){ + y(i) = Rf_trigamma(x(i)); + } + return y; +} + diff --git a/src/utils.h b/src/utils.h new file mode 100755 index 0000000..a80d159 --- /dev/null +++ b/src/utils.h @@ -0,0 +1,27 @@ +#ifndef UTILS_H +#define UTILS_H + +#include +#include + +using namespace Rcpp ; +using namespace arma ; +using namespace std ; + +double gamma(double x); + +double lgamma(double x); + +double digamma(double x); + +double trigamma(double x); + +vec gamma_vec(vec x); + +vec lgamma_vec(vec x); + +vec digamma_vec(vec x); + +vec trigamma_vec(vec x); + +#endif