Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 99b1d1e
Showing
23 changed files
with
1,037 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
Package: FUNLDA | ||
Type: Package | ||
Title: Genomic Latent Dirichlet Allocation | ||
Description: A tool for fitting latent Dirichlet allocation models to genomic annotation data. | ||
Version: 1.0 | ||
Date: 2017-06-26 | ||
Author: Daniel Backenroth and Iuliana Ionita-Laza | ||
Maintainer: Daniel Backenroth <db2175@cumc.columbia.edu> | ||
License: GPL (>= 2) | ||
Imports: Rcpp (>= 0.12.2) | ||
LinkingTo: Rcpp, RcppArmadillo | ||
LazyData: true | ||
RoxygenNote: 6.0.1 | ||
NeedsCompilation: yes | ||
Packaged: 2017-07-06 13:40:52 UTC; db2175 | ||
Repository: CRAN | ||
Date/Publication: 2017-07-07 04:52:52 UTC |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
ef25f3000c5db96556f19d2a086bdd41 *DESCRIPTION | ||
c0b219166fed468c36e095bbdf768d5c *NAMESPACE | ||
0bd4c6e0488266fe4ccc30125308197e *R/FunLDA.R | ||
b5d429f3bd5f5c95fa58bda9b7a64454 *R/RcppExports.R | ||
628e08f4ae439d45a6ff1c2429bf6f86 *R/data.R | ||
9999740f97072d628484816cf3488ade *R/ebmme.R | ||
d0d253afd511ba598b6a1c3d5bfcc79f *R/help.R | ||
a5d536425f704efb3fe4d9885ca18962 *data/tissuenew.RData | ||
21d8c657a9091e5f3b9e19d8c00ee787 *data/training.RData | ||
c4f122c5edd100b038a0cc2d5881bd75 *man/FUNLDA.Rd | ||
48b1d099e90f47bd6f5e3d6e95cf42b2 *man/FitLDAModel.Rd | ||
f65a367be37872ad280c022714e8efc4 *man/FitLDAModelNewTissues.Rd | ||
141a347399ac43130bfa4429c60c9ce7 *man/Predict.Rd | ||
f786b9357d4afaf6e500db982881aa42 *man/tissuenew.Rd | ||
5a9b82d4629cc2bb4b2175ae5306d6ab *man/training.Rd | ||
2a6f9e9e044a78154d3cfda5936d6f48 *src/Makevars | ||
8d46d69896a1f1d31ed76035a0e49d67 *src/Makevars.win | ||
33e80dc519fe3bbc4a571e252ea0d23a *src/RcppExports.cpp | ||
695a279cd82d411d2cf09a00d458a346 *src/ebmme_cpp_mine_binned2.cpp | ||
e0669d61e598b7ee6778834b65bd2da7 *src/packagename_init.c | ||
6c4589ed2dfa308d9382601c2c04244d *src/utils.cpp | ||
a612da8cd6d0799816296517d2888c14 *src/utils.h |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Generated by roxygen2: do not edit by hand | ||
|
||
export(FitLDAModel) | ||
export(FitLDAModelNewTissues) | ||
export(Predict) | ||
importFrom(Rcpp,sourceCpp) | ||
useDynLib(FUNLDA, .registration = TRUE) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,163 @@ | ||
#' Predict posterior probabilities for variants to be in | ||
#' each cluster in a fitted LDA model | ||
#' | ||
#' @param predict.data A data frame with character-valued columns | ||
#' rs and cat and numeric-valued columns with annotations. | ||
#' Each row contains data for one SNP in one tissue. | ||
#' rs is an ID for the SNP, which need not be unique, and cat | ||
#' is an ID for each tissue, which must match an ID | ||
#' for which training data was included when fitting the | ||
#' LDA model. Aannotation columns must have column names matching | ||
#' those supplied when fitting the LDA model. | ||
#' @param summary.c A fitted LDA model created using | ||
#' \code{\link{FitLDAModel}} or \code{\link{FitLDAModelNewTissues}}. | ||
#' @return p.labeled, a data frame with one row per training variant | ||
#' with the posterior probability of each variant to be in each | ||
#' each cluster (with column names CLUSTER1,...) and columns | ||
#' cat and rs. | ||
#' @examples | ||
#' \dontrun{ | ||
#' data(training) | ||
#' summary.c <- FitLDAModel(training.data=training, nclust=3, | ||
#' kde.nbins=100, iters=50, inner.iters=50) | ||
#' pred <- Predict(predict.data=training, summary.c=summary.c) | ||
#' } | ||
#' @export | ||
Predict <- function(predict.data, summary.c){ | ||
predict.data <- as.data.frame(predict.data) | ||
z <- predict.data[, colnames(summary.c$data)] | ||
cat.dict <- summary.c$categories$ncat | ||
names(cat.dict) <- summary.c$categories$cat | ||
cat <- cat.dict[predict.data$cat] | ||
pred <- ebmme.Predict(data=z, cat=cat, | ||
block_id=summary.c$block.id, | ||
nclust=summary.c$nclust, | ||
levels=summary.c$levels.data, | ||
fs_binned=summary.c$fs_binned, | ||
a=summary.c$a) | ||
p.labeled <- as.data.frame(t(pred$p)) | ||
colnames(p.labeled) <- paste0("CLUSTER", 1:ncol(p.labeled)) | ||
p.labeled <- cbind(p.labeled, predict.data[, c("cat", "rs")]) | ||
return(p.labeled) | ||
} | ||
|
||
#' Update the tissues in a fitted LDA model | ||
#' | ||
#' @param newtissue.data A data frame with character-valued columns | ||
#' rs and cat and numeric-valued columns with annotations. | ||
#' Each row contains data for one SNP in one tissue | ||
#' rs is an ID for the SNP, which need not be unique, and | ||
#' cat is an ID for each tissue, which must match an ID | ||
#' for which training data was included when fitting the | ||
#' LDA model. Annotation columns must have column names matching | ||
#' those supplied when fitting the LDA model. | ||
#' @inheritParams Predict | ||
#' @inheritParams FitLDAModel | ||
#' @return a fitted LDA model, as returned by \code{\link{FitLDAModel}}, | ||
#' that can be used with \code{\link{Predict}} for | ||
#' the tissues in newtissue.data | ||
#' @examples | ||
#' \dontrun{ | ||
#' data(training) | ||
#' summary.c <- FitLDAModel(training.data=training, nclust=3, | ||
#' kde.nbins=100, iters=50, inner.iters=50) | ||
#' data(tissuenew) | ||
#' newtissues <- FitLDAModelNewTissues(newtissue.data=tissuenew, | ||
#' summary.c=summary.c, | ||
#' inner.iters=50) | ||
#' } | ||
#' @export | ||
FitLDAModelNewTissues <- function(newtissue.data, summary.c, inner.iters=200){ | ||
newtissue.data <- as.data.frame(newtissue.data) | ||
z <- newtissue.data[, colnames(summary.c$data)] | ||
cat <- newtissue.data$cat | ||
ncat <- as.numeric(as.factor(cat)) - 1 | ||
df <- unique(data.frame(cat=cat, ncat=ncat)) | ||
pred <- ebmme.FitNewTissue(data=z, cat=ncat, | ||
block_id=summary.c$block.id, | ||
nclust=summary.c$nclust, | ||
levels=summary.c$levels.data, | ||
fs_binned=summary.c$fs_binned, | ||
inner.iters=inner.iters) | ||
p.labeled <- as.data.frame(t(pred$p)) | ||
colnames(p.labeled) <- paste0("CLUSTER", 1:ncol(p.labeled)) | ||
p.labeled <- cbind(p.labeled, newtissue.data[, c("cat", "rs")]) | ||
a.labeled <- as.data.frame(t(pred$a)) | ||
colnames(a.labeled) <- paste0("CLUSTER", 1:ncol(a.labeled)) | ||
a.labeled$ncat <- 0:max(df$ncat) | ||
a.labeled <- merge(a.labeled, df) | ||
a.labeled <- a.labeled[, !colnames(a.labeled)=="ncat"] | ||
summary.c$a <- pred$a | ||
summary.c$p <- pred$p | ||
summary.c$data <- z | ||
summary.c$cat <- ncat | ||
summary.c$all_as <- NULL | ||
summary.c$data.labeled <- NULL | ||
summary.c$categories <- df | ||
summary.c$a.labeled <- a.labeled | ||
summary.c$p.labeled <- p.labeled | ||
return(summary.c) | ||
} | ||
|
||
#' fit an LDA model | ||
#' @param training.data A data frame with character-valued columns | ||
#' rs and cat and numeric-valued columns with annotations. | ||
#' Each row is data for one SNP in one tissue. | ||
#' rs is an ID for the SNP, which need not be unique, and | ||
#' cat is an ID for each tissue. | ||
#' @param nclust Integer specifying the number of clusters to estimate | ||
#' @param kde.nbins Integer specifying how many bins to use for binning | ||
#' each annotation | ||
#' @param iters Integer specifying number of outer iterations | ||
#' @param inner.iters Integer specifying number of inner iterations | ||
#' @return A fitted LDA model, i.e., a list (apart from elements | ||
#' used internally) with elements | ||
#' \describe{ | ||
#' \item{p.labeled}{a data frame with one row per | ||
#' training variant, with | ||
#' the posterior probability for each | ||
#' variant to be in | ||
#' each cluster in columns CLUSTER1,... and | ||
#' also with columns cat and rs} | ||
#' \item{a.labeled}{a data frame with one row per tissue | ||
#' with membership vectors for each tissue | ||
#' with columns cat and CLUSTER1,...} | ||
#' } | ||
#' @examples | ||
#' \dontrun{ | ||
#' data(training) | ||
#' summary.c <- FitLDAModel(training.data=training, nclust=3, | ||
#' kde.nbins=100, iters=50, inner.iters=50) | ||
#' } | ||
#' @export | ||
FitLDAModel <- function(training.data, nclust=9, kde.nbins=1000, iters=250, | ||
inner.iters=200){ | ||
training.data <- as.data.frame(training.data) | ||
cat <- training.data$cat | ||
ncat <- as.numeric(as.factor(cat)) - 1 | ||
df <- unique(data.frame(cat=cat, ncat=ncat)) | ||
training.data <- as.data.frame(training.data) | ||
z <- training.data[, !colnames(training.data) %in% c("rs", "cat")] | ||
summary.c <- ebmme.lda(data= z, cat=ncat, block.id = 1:ncol(z), | ||
iters = iters, | ||
inner.iters = inner.iters, nclust=nclust, | ||
kde.nbins=kde.nbins) | ||
# fix up p and a | ||
training.data$order <- 1:nrow(training.data) | ||
data.labeled <- merge(training.data, df) | ||
data.labeled <- data.labeled[order(data.labeled$order), ] | ||
data.labeled <- data.labeled[, !colnames(data.labeled) %in% "order"] | ||
summary.c$data.labeled <- data.labeled | ||
summary.c$categories <- df | ||
a.labeled <- as.data.frame(t(summary.c$a)) | ||
colnames(a.labeled) <- paste0("CLUSTER", 1:ncol(a.labeled)) | ||
a.labeled$ncat <- 0:max(df$ncat) | ||
a.labeled <- merge(a.labeled, df) | ||
a.labeled <- a.labeled[, !colnames(a.labeled)=="ncat"] | ||
summary.c$a.labeled <- a.labeled | ||
p.labeled <- as.data.frame(t(summary.c$p)) | ||
colnames(p.labeled) <- paste0("CLUSTER", 1:ncol(p.labeled)) | ||
p.labeled <- cbind(p.labeled, data.labeled[, c("cat", "rs")]) | ||
summary.c$p.labeled <- p.labeled | ||
summary.c | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# This file was generated by Rcpp::compileAttributes | ||
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 | ||
|
||
newtissue <- function(cat_, block_id_, nclust_, bins_, fs_binned_, alpha_, inner_iters_) { | ||
.Call('FUNLDA_newtissue', PACKAGE = 'FUNLDA', cat_, block_id_, nclust_, bins_, fs_binned_, alpha_, inner_iters_) | ||
} | ||
|
||
ebmme_cpp_binned <- function(dat_, cat_, block_id_, H1_inv_, p_, alpha_, iters_, inner_iters_, nclust_, bins_, bin_data_, kde_nbins_) { | ||
.Call('FUNLDA_ebmme_cpp_binned', PACKAGE = 'FUNLDA', dat_, cat_, block_id_, H1_inv_, p_, alpha_, iters_, inner_iters_, nclust_, bins_, bin_data_, kde_nbins_) | ||
} | ||
|
||
predictlogsum <- function(cat_, block_id_, nclust_, bins_, fs_binned_, a_) { | ||
.Call('FUNLDA_predictlogsum', PACKAGE = 'FUNLDA', cat_, block_id_, nclust_, bins_, fs_binned_, a_) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
#' Toy training data | ||
#' | ||
#' A dataset containing genomic annotations for 4800 variants, 200 | ||
#' variants for each of 24 tissues | ||
"training" | ||
|
||
#' Data for a tissue not used to to fit the LDA model | ||
#' | ||
#' A dataset containing genomic annotations for 200 variants | ||
#' for a tissue not included in dataset training | ||
"tissuenew" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
ebmme.FitNewTissue <- function(data, cat, block_id, nclust, levels, | ||
fs_binned, inner.iters){ | ||
|
||
bins <- matrix(NA, nrow=nrow(data), ncol=ncol(data)) | ||
for (i in 1:ncol(data)){ | ||
levs <- unlist(strsplit(levels[, i], ",")) | ||
levs <- gsub("\\(", "", levs) | ||
levs <- gsub("]", "", levs) | ||
levs <- sort(as.numeric(unique(levs))) | ||
levs[1] <- min(levs[1], min(data[, i])) | ||
levs[length(levs)] <- max(levs[length(levs)], max(data[, i])) | ||
bins[, i] <- cut(data[, i], breaks=levs, include.lowest=T) | ||
bins[, i] <- bins[, i] - 1 | ||
} | ||
fs_binned_num <- as.numeric(fs_binned) | ||
fs_binned_num[fs_binned_num==0] = min(fs_binned_num[fs_binned_num>0]) | ||
dims <- dim(fs_binned) | ||
fs_binned <- array(fs_binned_num, dim=dims) | ||
alpha<-vector('numeric',nclust) | ||
for (k in 1:nclust){ | ||
alpha[k] <- 1; | ||
} | ||
newtissue <- newtissue(cat, block_id, nclust, bins, fs_binned, alpha, inner.iters) | ||
p <- newtissue$p | ||
a <- newtissue$a | ||
list(p=p, a=a) | ||
} | ||
|
||
ebmme.Predict <- function(data, cat, block_id, nclust, levels, | ||
fs_binned, a){ | ||
|
||
bins <- matrix(NA, nrow=nrow(data), ncol=ncol(data)) | ||
for (i in 1:ncol(data)){ | ||
levs <- unlist(strsplit(levels[, i], ",")) | ||
levs <- gsub("\\(", "", levs) | ||
levs <- gsub("]", "", levs) | ||
levs <- sort(as.numeric(unique(levs))) | ||
levs[1] <- min(levs[1], min(data[, i])) | ||
levs[length(levs)] <- max(levs[length(levs)], max(data[, i])) | ||
bins[, i] <- cut(data[, i], breaks=levs, include.lowest=T) | ||
bins[, i] <- bins[, i] - 1 | ||
} | ||
fs_binned_num <- as.numeric(fs_binned) | ||
fs_binned_num[fs_binned_num==0] = min(fs_binned_num[fs_binned_num>0]) | ||
dims <- dim(fs_binned) | ||
fs_binned <- array(fs_binned_num, dim=dims) | ||
prediction <- predictlogsum(cat, block_id, nclust, bins, fs_binned, a) | ||
p <- prediction$p | ||
a <- prediction$a | ||
f <- prediction$f | ||
list(p=p, a=a, f=f) | ||
} | ||
|
||
# cat must start with 0 and consist of consecutive integers for the different categories | ||
ebmme.lda <- function(data, cat, block.id, iters = 100, | ||
inner.iters = 100, | ||
nclust=10, kde.nbins=NULL) | ||
{ | ||
set.seed(1) | ||
data <- as.matrix(data) | ||
m <- nrow(data) | ||
k <- ncol(data) | ||
B <- length(unique(block.id)) | ||
|
||
H.inv <- matrix(0, k, k) | ||
for (l in 1:k){ | ||
data.l <- as.matrix( data[, l] ) | ||
h <- 0.9*m^-0.2*min(stats::sd(data.l), stats::IQR(data.l)/1.34) | ||
H.inv[l, l] <- 1/h | ||
} | ||
|
||
H1.inv <- H.inv | ||
rm(H.inv) | ||
# initialize p | ||
p <- matrix(0, nclust, m) | ||
for (k in 1:nclust) | ||
for (i in 1:m) | ||
p[k,i]=stats::runif(1); | ||
for (k in 1:nclust) | ||
for (i in 1:m) | ||
p[k,i]=p[k,i]/sum(p[,i]); | ||
|
||
|
||
# initialize alpha | ||
alpha<-vector('numeric',nclust) | ||
for (k in 1:nclust){ | ||
alpha[k] <- 1; | ||
} | ||
|
||
bins <- matrix(NA, nrow=nrow(data), ncol=ncol(data)) | ||
bin.data <- matrix(NA, nrow=kde.nbins, ncol=ncol(data)) | ||
levels.data <- matrix(NA, nrow=kde.nbins, ncol=ncol(data)) | ||
colnames(levels.data) <- colnames(data) | ||
for (i in 1:ncol(data)){ | ||
# factor information is lost when put into matrix | ||
cc <- cut(data[, i], breaks=kde.nbins) | ||
bins[, i] <- cc | ||
bottoms <- unlist(strsplit(levels(cc), ","))[c(T,F)] | ||
tops <- unlist(strsplit(levels(cc), ","))[c(F,T)] | ||
bottoms <- as.numeric(gsub("\\(", "", bottoms)) | ||
tops <- as.numeric(gsub("]", "", tops)) | ||
bin.data[, i] <- (bottoms + tops) / 2 | ||
levels.data[, i] <- levels(cc) | ||
} | ||
# need to change array indexing for C++, bin number and category number should start from 0 | ||
bins <- bins - 1 | ||
fit <- ebmme_cpp_binned(data, cat, block.id, H1.inv, p, alpha, iters, | ||
inner.iters, nclust, | ||
bins, bin.data, kde.nbins) | ||
p <- as.matrix( fit$p ) | ||
alpha <- as.vector( fit$alpha ) | ||
a <- as.matrix( fit$a ) | ||
f <- as.matrix( fit$f ) | ||
return(list(p=p, alpha=alpha, | ||
a=a, f=f, fs_binned=fit$fs_binned, | ||
bins=bins, bin.data=bin.data, kde.nbins=kde.nbins, | ||
data=data, cat=cat, block.id=block.id, H1.inv=H1.inv, | ||
iters=iters, inner.iters=inner.iters, | ||
nclust=nclust, | ||
levels.data=levels.data, all_as=fit$all_as)) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
#'FUNLDA - a latent Dirichlet allocation package for genomic annotations | ||
#' @docType package | ||
#' @name FUNLDA | ||
#' @useDynLib FUNLDA, .registration = TRUE | ||
#' @importFrom Rcpp sourceCpp | ||
NULL | ||
|
Binary file not shown.
Binary file not shown.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Oops, something went wrong.