Skip to content

Commit

Permalink
version 1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
Daniel Backenroth authored and cran-robot committed Jul 7, 2017
0 parents commit 99b1d1e
Show file tree
Hide file tree
Showing 23 changed files with 1,037 additions and 0 deletions.
17 changes: 17 additions & 0 deletions DESCRIPTION
@@ -0,0 +1,17 @@
Package: FUNLDA
Type: Package
Title: Genomic Latent Dirichlet Allocation
Description: A tool for fitting latent Dirichlet allocation models to genomic annotation data.
Version: 1.0
Date: 2017-06-26
Author: Daniel Backenroth and Iuliana Ionita-Laza
Maintainer: Daniel Backenroth <db2175@cumc.columbia.edu>
License: GPL (>= 2)
Imports: Rcpp (>= 0.12.2)
LinkingTo: Rcpp, RcppArmadillo
LazyData: true
RoxygenNote: 6.0.1
NeedsCompilation: yes
Packaged: 2017-07-06 13:40:52 UTC; db2175
Repository: CRAN
Date/Publication: 2017-07-07 04:52:52 UTC
22 changes: 22 additions & 0 deletions MD5
@@ -0,0 +1,22 @@
ef25f3000c5db96556f19d2a086bdd41 *DESCRIPTION
c0b219166fed468c36e095bbdf768d5c *NAMESPACE
0bd4c6e0488266fe4ccc30125308197e *R/FunLDA.R
b5d429f3bd5f5c95fa58bda9b7a64454 *R/RcppExports.R
628e08f4ae439d45a6ff1c2429bf6f86 *R/data.R
9999740f97072d628484816cf3488ade *R/ebmme.R
d0d253afd511ba598b6a1c3d5bfcc79f *R/help.R
a5d536425f704efb3fe4d9885ca18962 *data/tissuenew.RData
21d8c657a9091e5f3b9e19d8c00ee787 *data/training.RData
c4f122c5edd100b038a0cc2d5881bd75 *man/FUNLDA.Rd
48b1d099e90f47bd6f5e3d6e95cf42b2 *man/FitLDAModel.Rd
f65a367be37872ad280c022714e8efc4 *man/FitLDAModelNewTissues.Rd
141a347399ac43130bfa4429c60c9ce7 *man/Predict.Rd
f786b9357d4afaf6e500db982881aa42 *man/tissuenew.Rd
5a9b82d4629cc2bb4b2175ae5306d6ab *man/training.Rd
2a6f9e9e044a78154d3cfda5936d6f48 *src/Makevars
8d46d69896a1f1d31ed76035a0e49d67 *src/Makevars.win
33e80dc519fe3bbc4a571e252ea0d23a *src/RcppExports.cpp
695a279cd82d411d2cf09a00d458a346 *src/ebmme_cpp_mine_binned2.cpp
e0669d61e598b7ee6778834b65bd2da7 *src/packagename_init.c
6c4589ed2dfa308d9382601c2c04244d *src/utils.cpp
a612da8cd6d0799816296517d2888c14 *src/utils.h
7 changes: 7 additions & 0 deletions NAMESPACE
@@ -0,0 +1,7 @@
# Generated by roxygen2: do not edit by hand

export(FitLDAModel)
export(FitLDAModelNewTissues)
export(Predict)
importFrom(Rcpp,sourceCpp)
useDynLib(FUNLDA, .registration = TRUE)
163 changes: 163 additions & 0 deletions R/FunLDA.R
@@ -0,0 +1,163 @@
#' Predict posterior probabilities for variants to be in
#' each cluster in a fitted LDA model
#'
#' @param predict.data A data frame with character-valued columns
#' rs and cat and numeric-valued columns with annotations.
#' Each row contains data for one SNP in one tissue.
#' rs is an ID for the SNP, which need not be unique, and cat
#' is an ID for each tissue, which must match an ID
#' for which training data was included when fitting the
#' LDA model. Aannotation columns must have column names matching
#' those supplied when fitting the LDA model.
#' @param summary.c A fitted LDA model created using
#' \code{\link{FitLDAModel}} or \code{\link{FitLDAModelNewTissues}}.
#' @return p.labeled, a data frame with one row per training variant
#' with the posterior probability of each variant to be in each
#' each cluster (with column names CLUSTER1,...) and columns
#' cat and rs.
#' @examples
#' \dontrun{
#' data(training)
#' summary.c <- FitLDAModel(training.data=training, nclust=3,
#' kde.nbins=100, iters=50, inner.iters=50)
#' pred <- Predict(predict.data=training, summary.c=summary.c)
#' }
#' @export
Predict <- function(predict.data, summary.c){
predict.data <- as.data.frame(predict.data)
z <- predict.data[, colnames(summary.c$data)]
cat.dict <- summary.c$categories$ncat
names(cat.dict) <- summary.c$categories$cat
cat <- cat.dict[predict.data$cat]
pred <- ebmme.Predict(data=z, cat=cat,
block_id=summary.c$block.id,
nclust=summary.c$nclust,
levels=summary.c$levels.data,
fs_binned=summary.c$fs_binned,
a=summary.c$a)
p.labeled <- as.data.frame(t(pred$p))
colnames(p.labeled) <- paste0("CLUSTER", 1:ncol(p.labeled))
p.labeled <- cbind(p.labeled, predict.data[, c("cat", "rs")])
return(p.labeled)
}

#' Update the tissues in a fitted LDA model
#'
#' @param newtissue.data A data frame with character-valued columns
#' rs and cat and numeric-valued columns with annotations.
#' Each row contains data for one SNP in one tissue
#' rs is an ID for the SNP, which need not be unique, and
#' cat is an ID for each tissue, which must match an ID
#' for which training data was included when fitting the
#' LDA model. Annotation columns must have column names matching
#' those supplied when fitting the LDA model.
#' @inheritParams Predict
#' @inheritParams FitLDAModel
#' @return a fitted LDA model, as returned by \code{\link{FitLDAModel}},
#' that can be used with \code{\link{Predict}} for
#' the tissues in newtissue.data
#' @examples
#' \dontrun{
#' data(training)
#' summary.c <- FitLDAModel(training.data=training, nclust=3,
#' kde.nbins=100, iters=50, inner.iters=50)
#' data(tissuenew)
#' newtissues <- FitLDAModelNewTissues(newtissue.data=tissuenew,
#' summary.c=summary.c,
#' inner.iters=50)
#' }
#' @export
FitLDAModelNewTissues <- function(newtissue.data, summary.c, inner.iters=200){
newtissue.data <- as.data.frame(newtissue.data)
z <- newtissue.data[, colnames(summary.c$data)]
cat <- newtissue.data$cat
ncat <- as.numeric(as.factor(cat)) - 1
df <- unique(data.frame(cat=cat, ncat=ncat))
pred <- ebmme.FitNewTissue(data=z, cat=ncat,
block_id=summary.c$block.id,
nclust=summary.c$nclust,
levels=summary.c$levels.data,
fs_binned=summary.c$fs_binned,
inner.iters=inner.iters)
p.labeled <- as.data.frame(t(pred$p))
colnames(p.labeled) <- paste0("CLUSTER", 1:ncol(p.labeled))
p.labeled <- cbind(p.labeled, newtissue.data[, c("cat", "rs")])
a.labeled <- as.data.frame(t(pred$a))
colnames(a.labeled) <- paste0("CLUSTER", 1:ncol(a.labeled))
a.labeled$ncat <- 0:max(df$ncat)
a.labeled <- merge(a.labeled, df)
a.labeled <- a.labeled[, !colnames(a.labeled)=="ncat"]
summary.c$a <- pred$a
summary.c$p <- pred$p
summary.c$data <- z
summary.c$cat <- ncat
summary.c$all_as <- NULL
summary.c$data.labeled <- NULL
summary.c$categories <- df
summary.c$a.labeled <- a.labeled
summary.c$p.labeled <- p.labeled
return(summary.c)
}

#' fit an LDA model
#' @param training.data A data frame with character-valued columns
#' rs and cat and numeric-valued columns with annotations.
#' Each row is data for one SNP in one tissue.
#' rs is an ID for the SNP, which need not be unique, and
#' cat is an ID for each tissue.
#' @param nclust Integer specifying the number of clusters to estimate
#' @param kde.nbins Integer specifying how many bins to use for binning
#' each annotation
#' @param iters Integer specifying number of outer iterations
#' @param inner.iters Integer specifying number of inner iterations
#' @return A fitted LDA model, i.e., a list (apart from elements
#' used internally) with elements
#' \describe{
#' \item{p.labeled}{a data frame with one row per
#' training variant, with
#' the posterior probability for each
#' variant to be in
#' each cluster in columns CLUSTER1,... and
#' also with columns cat and rs}
#' \item{a.labeled}{a data frame with one row per tissue
#' with membership vectors for each tissue
#' with columns cat and CLUSTER1,...}
#' }
#' @examples
#' \dontrun{
#' data(training)
#' summary.c <- FitLDAModel(training.data=training, nclust=3,
#' kde.nbins=100, iters=50, inner.iters=50)
#' }
#' @export
FitLDAModel <- function(training.data, nclust=9, kde.nbins=1000, iters=250,
inner.iters=200){
training.data <- as.data.frame(training.data)
cat <- training.data$cat
ncat <- as.numeric(as.factor(cat)) - 1
df <- unique(data.frame(cat=cat, ncat=ncat))
training.data <- as.data.frame(training.data)
z <- training.data[, !colnames(training.data) %in% c("rs", "cat")]
summary.c <- ebmme.lda(data= z, cat=ncat, block.id = 1:ncol(z),
iters = iters,
inner.iters = inner.iters, nclust=nclust,
kde.nbins=kde.nbins)
# fix up p and a
training.data$order <- 1:nrow(training.data)
data.labeled <- merge(training.data, df)
data.labeled <- data.labeled[order(data.labeled$order), ]
data.labeled <- data.labeled[, !colnames(data.labeled) %in% "order"]
summary.c$data.labeled <- data.labeled
summary.c$categories <- df
a.labeled <- as.data.frame(t(summary.c$a))
colnames(a.labeled) <- paste0("CLUSTER", 1:ncol(a.labeled))
a.labeled$ncat <- 0:max(df$ncat)
a.labeled <- merge(a.labeled, df)
a.labeled <- a.labeled[, !colnames(a.labeled)=="ncat"]
summary.c$a.labeled <- a.labeled
p.labeled <- as.data.frame(t(summary.c$p))
colnames(p.labeled) <- paste0("CLUSTER", 1:ncol(p.labeled))
p.labeled <- cbind(p.labeled, data.labeled[, c("cat", "rs")])
summary.c$p.labeled <- p.labeled
summary.c
}
14 changes: 14 additions & 0 deletions R/RcppExports.R
@@ -0,0 +1,14 @@
# This file was generated by Rcpp::compileAttributes
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

newtissue <- function(cat_, block_id_, nclust_, bins_, fs_binned_, alpha_, inner_iters_) {
.Call('FUNLDA_newtissue', PACKAGE = 'FUNLDA', cat_, block_id_, nclust_, bins_, fs_binned_, alpha_, inner_iters_)
}

ebmme_cpp_binned <- function(dat_, cat_, block_id_, H1_inv_, p_, alpha_, iters_, inner_iters_, nclust_, bins_, bin_data_, kde_nbins_) {
.Call('FUNLDA_ebmme_cpp_binned', PACKAGE = 'FUNLDA', dat_, cat_, block_id_, H1_inv_, p_, alpha_, iters_, inner_iters_, nclust_, bins_, bin_data_, kde_nbins_)
}

predictlogsum <- function(cat_, block_id_, nclust_, bins_, fs_binned_, a_) {
.Call('FUNLDA_predictlogsum', PACKAGE = 'FUNLDA', cat_, block_id_, nclust_, bins_, fs_binned_, a_)
}
12 changes: 12 additions & 0 deletions R/data.R
@@ -0,0 +1,12 @@
#' Toy training data
#'
#' A dataset containing genomic annotations for 4800 variants, 200
#' variants for each of 24 tissues
"training"

#' Data for a tissue not used to to fit the LDA model
#'
#' A dataset containing genomic annotations for 200 variants
#' for a tissue not included in dataset training
"tissuenew"

121 changes: 121 additions & 0 deletions R/ebmme.R
@@ -0,0 +1,121 @@
ebmme.FitNewTissue <- function(data, cat, block_id, nclust, levels,
fs_binned, inner.iters){

bins <- matrix(NA, nrow=nrow(data), ncol=ncol(data))
for (i in 1:ncol(data)){
levs <- unlist(strsplit(levels[, i], ","))
levs <- gsub("\\(", "", levs)
levs <- gsub("]", "", levs)
levs <- sort(as.numeric(unique(levs)))
levs[1] <- min(levs[1], min(data[, i]))
levs[length(levs)] <- max(levs[length(levs)], max(data[, i]))
bins[, i] <- cut(data[, i], breaks=levs, include.lowest=T)
bins[, i] <- bins[, i] - 1
}
fs_binned_num <- as.numeric(fs_binned)
fs_binned_num[fs_binned_num==0] = min(fs_binned_num[fs_binned_num>0])
dims <- dim(fs_binned)
fs_binned <- array(fs_binned_num, dim=dims)
alpha<-vector('numeric',nclust)
for (k in 1:nclust){
alpha[k] <- 1;
}
newtissue <- newtissue(cat, block_id, nclust, bins, fs_binned, alpha, inner.iters)
p <- newtissue$p
a <- newtissue$a
list(p=p, a=a)
}

ebmme.Predict <- function(data, cat, block_id, nclust, levels,
fs_binned, a){

bins <- matrix(NA, nrow=nrow(data), ncol=ncol(data))
for (i in 1:ncol(data)){
levs <- unlist(strsplit(levels[, i], ","))
levs <- gsub("\\(", "", levs)
levs <- gsub("]", "", levs)
levs <- sort(as.numeric(unique(levs)))
levs[1] <- min(levs[1], min(data[, i]))
levs[length(levs)] <- max(levs[length(levs)], max(data[, i]))
bins[, i] <- cut(data[, i], breaks=levs, include.lowest=T)
bins[, i] <- bins[, i] - 1
}
fs_binned_num <- as.numeric(fs_binned)
fs_binned_num[fs_binned_num==0] = min(fs_binned_num[fs_binned_num>0])
dims <- dim(fs_binned)
fs_binned <- array(fs_binned_num, dim=dims)
prediction <- predictlogsum(cat, block_id, nclust, bins, fs_binned, a)
p <- prediction$p
a <- prediction$a
f <- prediction$f
list(p=p, a=a, f=f)
}

# cat must start with 0 and consist of consecutive integers for the different categories
ebmme.lda <- function(data, cat, block.id, iters = 100,
inner.iters = 100,
nclust=10, kde.nbins=NULL)
{
set.seed(1)
data <- as.matrix(data)
m <- nrow(data)
k <- ncol(data)
B <- length(unique(block.id))

H.inv <- matrix(0, k, k)
for (l in 1:k){
data.l <- as.matrix( data[, l] )
h <- 0.9*m^-0.2*min(stats::sd(data.l), stats::IQR(data.l)/1.34)
H.inv[l, l] <- 1/h
}

H1.inv <- H.inv
rm(H.inv)
# initialize p
p <- matrix(0, nclust, m)
for (k in 1:nclust)
for (i in 1:m)
p[k,i]=stats::runif(1);
for (k in 1:nclust)
for (i in 1:m)
p[k,i]=p[k,i]/sum(p[,i]);


# initialize alpha
alpha<-vector('numeric',nclust)
for (k in 1:nclust){
alpha[k] <- 1;
}

bins <- matrix(NA, nrow=nrow(data), ncol=ncol(data))
bin.data <- matrix(NA, nrow=kde.nbins, ncol=ncol(data))
levels.data <- matrix(NA, nrow=kde.nbins, ncol=ncol(data))
colnames(levels.data) <- colnames(data)
for (i in 1:ncol(data)){
# factor information is lost when put into matrix
cc <- cut(data[, i], breaks=kde.nbins)
bins[, i] <- cc
bottoms <- unlist(strsplit(levels(cc), ","))[c(T,F)]
tops <- unlist(strsplit(levels(cc), ","))[c(F,T)]
bottoms <- as.numeric(gsub("\\(", "", bottoms))
tops <- as.numeric(gsub("]", "", tops))
bin.data[, i] <- (bottoms + tops) / 2
levels.data[, i] <- levels(cc)
}
# need to change array indexing for C++, bin number and category number should start from 0
bins <- bins - 1
fit <- ebmme_cpp_binned(data, cat, block.id, H1.inv, p, alpha, iters,
inner.iters, nclust,
bins, bin.data, kde.nbins)
p <- as.matrix( fit$p )
alpha <- as.vector( fit$alpha )
a <- as.matrix( fit$a )
f <- as.matrix( fit$f )
return(list(p=p, alpha=alpha,
a=a, f=f, fs_binned=fit$fs_binned,
bins=bins, bin.data=bin.data, kde.nbins=kde.nbins,
data=data, cat=cat, block.id=block.id, H1.inv=H1.inv,
iters=iters, inner.iters=inner.iters,
nclust=nclust,
levels.data=levels.data, all_as=fit$all_as))
}
7 changes: 7 additions & 0 deletions R/help.R
@@ -0,0 +1,7 @@
#'FUNLDA - a latent Dirichlet allocation package for genomic annotations
#' @docType package
#' @name FUNLDA
#' @useDynLib FUNLDA, .registration = TRUE
#' @importFrom Rcpp sourceCpp
NULL

Binary file added data/tissuenew.RData
Binary file not shown.
Binary file added data/training.RData
Binary file not shown.
10 changes: 10 additions & 0 deletions man/FUNLDA.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 99b1d1e

Please sign in to comment.