version 0.2.0

cran · Jul 29, 2015 · 5678fab · 5678fab
1 parent 9308093
commit 5678fab
Show file tree

Hide file tree

Showing 23 changed files with 621 additions and 150 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,16 +1,26 @@
 Package: slfm
 Type: Package
-Title: Tools for fitting sparse latent factor model
-Version: 0.1
+Title: Tools for Fitting Sparse Latent Factor Model
+Version: 0.2.0
 Author: Joao Duarte and Vinicius Mayrink
 Maintainer: Joao Duarte <jdanielnd@gmail.com>
-Description: slfm is a set of tools to find coherent patterns in microarray data using a Bayesian sparse latent factor model. Considerable effort has been put into making slfm fast and memory efficient, turning it an interesting alternative to simpler methods in terms of execution time.
+Description: Set of tools to find coherent patterns in microarray data
+    using a Bayesian sparse latent factor model (Duarte and Mayrink 2015 -
+    http://link.springer.com/chapter/10.1007%2F978-3-319-12454-4_15).
+    Considerable effort has been put into making slfm fast and memory efficient,
+    turning it an interesting alternative to simpler methods in terms
+    of execution time. It implements versions of the SLFM using both type
+    of mixtures: using a degenerate distribution or a very concentrated
+    normal distribution for the spike part of the mixture. It also implements
+    additional functions to help pre-process the data and fit the model
+    for a large number of arrays.
 URL: https://github.com/jdanielnd/slfm
+BugReports: https://github.com/jdanielnd/slfm/issues
 Depends: R (>= 3.1.0)
-Imports: Rcpp (>= 0.11.0), coda
+Imports: Rcpp (>= 0.11.0), coda, lattice
 LinkingTo: Rcpp, RcppArmadillo
 License: GPL-2
-Packaged: 2014-05-08 11:44:40 UTC; jdanielnd
 NeedsCompilation: yes
+Packaged: 2015-07-29 00:03:10 UTC; jdanielnd
 Repository: CRAN
-Date/Publication: 2014-05-08 15:00:06
+Date/Publication: 2015-07-29 06:21:43
diff --git a/MD5 b/MD5
@@ -1,17 +1,21 @@
-67f632137f2c72344fd625e100a956f8 *DESCRIPTION
-842e742ed56f4760c893e34cc569b8c9 *NAMESPACE
-e9743678da83348d9a7e570d75f7e1cf *NEWS
-014adecf1f180367189bb57bbca5195f *R/RcppExports.R
-5f03ec1b26946fd9756df798882cb4d9 *R/class_interval.r
-dce71247a3c7006c24a7e75259758a68 *R/process_matrix.r
-e56aec246621d85de4cc226d3d5a3f96 *R/slfm-package.r
-6295402ccac87eb26b013500e8a98633 *R/slfm.r
-6df4e4bfbf4d4f524495699ba00dc1d1 *R/slfm_list.r
-c693f6ae8f9d620381d2369d5a3852df *README.md
-23a0867b7f440797a1cd5e50889435e6 *man/process_matrix.Rd
-63a06fe206d9f2a1c266c5c44c9b6f4d *man/slfm-package.Rd
-a5b8b0defcabdc9c48b9ac9a2108896c *man/slfm.Rd
-677de0b916ca383f9b522735fd5c2fb4 *man/slfm_list.Rd
-c818f4958497e27ca11f51adab2aa3d4 *src/Makevars.win
-3bb4238fe46039b75ec894824dffd73d *src/RcppExports.cpp
-3e25ef05cb2044a839656bfae7ad5510 *src/gibbs.cpp
+09c6391016c6b753e69a7534e10b8727 *DESCRIPTION
+87b17ef5c347ec4ca6d1641066700c30 *NAMESPACE
+bd3d6f4d64ee1b4f4804ad427a2745d2 *NEWS
+3e11a37d28ca0244100992886a82d3d5 *R/RcppExports.R
+c2bee89222b5b1d2f8a3b4ee0d5c59a6 *R/class_interval.r
+baabf72d4bc884b4c1a64efa0ab6929f *R/plot_matrix.r
+87f7eeceee7273541541ba5c589103ec *R/process_matrix.r
+43023f17cf4534c205f467c51594dff3 *R/slfm-package.r
+75aa158541ee3aed3c5084841a138dbc *R/slfm.r
+ccb2e56a2738d8fd42b70f0bf0adc061 *R/slfm_list.r
+540f05f21a09f459cc33f9d244ac919d *man/plot_matrix.Rd
+59958c4ef88f62f062a57ede17dd64c7 *man/process_matrix.Rd
+50feb9e98613c1d6ee4e2ca0aa62a5fd *man/slfm-package.Rd
+7ca8f4fcb3b7aab69b8abaabecf6c60c *man/slfm.Rd
+c34c4e9de5e7ea3372dd64a3f187bc2f *man/slfm_list.Rd
+bcc42aaeb0a69ec1b279b42021751d66 *src/Makevars
+3ef757384e0c95c1492b069cf4548b01 *src/Makevars.win
+a3d13222e4eea03a3a5673c2a529bfe0 *src/RcppExports.cpp
+3bc74fd8d172c42465d00b496b4753de *src/gibbs.cpp
+940a1289008060e66afcc95dda92d9a6 *src/slfm_MDN.cpp
+42a422b6d0c3b3d44a779cad0174ecc0 *src/slfm_MNN.cpp
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,8 +1,21 @@
+# Generated by roxygen2 (4.1.1): do not edit by hand
+
+export(plot_matrix)
 export(process_matrix)
 export(slfm)
 export(slfm_list)
 importFrom(Rcpp,evalCpp)
 importFrom(coda,HPDinterval)
 importFrom(coda,as.mcmc)
+importFrom(grDevices,colorRampPalette)
+importFrom(lattice,levelplot)
+importFrom(stats,cov)
+importFrom(stats,median)
+importFrom(stats,quantile)
+importFrom(stats,window)
 importFrom(tools,file_path_sans_ext)
+importFrom(utils,read.table)
+importFrom(utils,setTxtProgressBar)
+importFrom(utils,txtProgressBar)
+importFrom(utils,write.table)
 useDynLib(slfm)
diff --git a/NEWS b/NEWS
@@ -1,4 +1,15 @@
-slfm 0.1
+slfm 0.2.0
+===========
+
+* alpha inference now uses just the interested part of the chain
+
+* user can now choose which type of mixture to use (degenerate TRUE or FALSE)
+
+* user can now select a lag for MCMC chain
+
+* new `plot_matrix` function
+
+slfm 0.1.0
 ===========
 
 * new `slfm` function which fits a model to a matrix of microarray data

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -1,7 +1,15 @@
 # This file was generated by Rcpp::compileAttributes
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
-gibbs <- function(x, ite, a = 2.1, b = 1.1, gamma_a = 1, gamma_b = 1, omega = 10, omega_1 = 0.01) {
-    .Call('slfm_gibbs', PACKAGE = 'slfm', x, ite, a, b, gamma_a, gamma_b, omega, omega_1)
+gibbs <- function(x, ite, a = 2.1, b = 1.1, gamma_a = 1, gamma_b = 1, omega_0 = 0.01, omega_1 = 10, degenerate = FALSE) {
+    .Call('slfm_gibbs', PACKAGE = 'slfm', x, ite, a, b, gamma_a, gamma_b, omega_0, omega_1, degenerate)
+}
+
+slfm_MDN <- function(x, a = 2.1, b = 1.1, gamma_a = 1, gamma_b = 1, omega_1 = 10, burnin = 1000L, lag = 1L, npost = 500L) {
+    .Call('slfm_slfm_MDN', PACKAGE = 'slfm', x, a, b, gamma_a, gamma_b, omega_1, burnin, lag, npost)
+}
+
+slfm_MNN <- function(x, a = 2.1, b = 1.1, gamma_a = 1, gamma_b = 1, omega_0 = 0.01, omega_1 = 10, burnin = 1000L, lag = 1L, npost = 500L) {
+    .Call('slfm_slfm_MNN', PACKAGE = 'slfm', x, a, b, gamma_a, gamma_b, omega_0, omega_1, burnin, lag, npost)
 }
 
diff --git a/R/class_interval.r b/R/class_interval.r
@@ -9,7 +9,7 @@ class_interval_uni <- function(x, cv = 0.5) {
     return(clas)
 }
 
-class_interval <- function(x, cv = 0.5) {
+class_interval <- function(x) {
     apply(x, 1, class_interval_uni)
 }
 
@@ -21,4 +21,8 @@ format_classification <- function(x) {
   lvls[lvls == "A"] <- "Absent"
   levels(x) <- lvls
   x
+}
+
+class_mean <- function(x, cv=0.5) {
+  x[,1] > cv
 }
diff --git a/R/plot_matrix.r b/R/plot_matrix.r
@@ -0,0 +1,62 @@
+#' Plot matrix
+#'
+#' This function plots a data matrix
+#'
+#' @param y matrix to be ploted
+#' @param standardize.rows standardize matrix rows for plot
+#' @param reorder.rows reorder matrix rows based on pattern
+#' @param reorder.cols reorder matrix cols based on pattern
+#' @param high.contrast apply transformation to matrix to increase contrast
+#' @importFrom lattice levelplot
+#' @importFrom grDevices colorRampPalette
+#' @importFrom stats median
+#' @importFrom stats quantile
+#' @export
+
+plot_matrix <- function(y, standardize.rows = TRUE, reorder.rows = TRUE, reorder.cols = TRUE, high.contrast = TRUE) {
+
+  y <- as.matrix(y)
+  xl <- "Microarrays" 
+  yl <- "Probes"
+
+  if(standardize.rows) {
+    y <- t(apply(y,1,scale)) # standardize the rows of y
+  } 
+
+  if(reorder.rows) {
+    medrow <- apply(y,1,median) # median of the rows of y
+    y <- y[order(medrow),] # order the rows of y 
+  } 
+
+  if(reorder.cols) {
+    medcol <- apply(y,2,median) # median of the columns of y
+    y <- y[,order(medcol)]  # order the columns of y
+  }
+
+  if(high.contrast) {
+    y <- (abs(y)^(1/3))*sign(y)
+  } # higher contrast
+
+  mi <- as.numeric(quantile(y,0.001))
+  ma <- as.numeric(quantile(y,0.999))
+  nr <- nrow(y)
+  nc <- ncol(y)
+
+  if(nr==nc) { # Image of Correlation and Covariance matrices
+   xl <- ""
+   yl <- ""
+   mi <- ifelse(ma==1, -1, ma)
+  }
+
+  spr <- ifelse(nr<=5, 1, round(nr/10))
+  spc <- ifelse(nc<=5, 1, round(nc/10))
+  sc <- list(
+    x = list(at=c(seq(1,nc, spc),nc), labels=c(seq(1,nc,spc),nc)), 
+    y = list(draw=FALSE)
+  )
+  col.l <- colorRampPalette(c('blue','white','red'))
+  cbar <- seq(mi,ma,length.out=100)
+
+  levelplot(t(y[nr:1,]),col.regions=col.l,xlab=xl,ylab=yl,scales=sc,at=cbar,aspect="fill")
+
+}
diff --git a/R/process_matrix.r b/R/process_matrix.r
@@ -6,6 +6,9 @@
 #' @param path path containing the set of matrices to be processed
 #' @param output_path path to save the processed matrices
 #' @param sample_size number of matrices to be used on the principal component analysis
+#' @importFrom stats cov
+#' @importFrom utils read.table
+#' @importFrom utils write.table
 #' @export
 
 process_matrix <- function(path, output_path, sample_size) {

diff --git a/R/slfm-package.r b/R/slfm-package.r
@@ -1,15 +1,20 @@
 #' slfm: the sparse latent factor model package for R.
 #'
-#' slfm is a set of tools to fit sparse latent factor models to microarray data. This includes
-#' functions to:
+#' Set of tools to find coherent patterns in microarray data
+#' using a Bayesian sparse latent factor model (Duarte and Mayrink 2015 -
+#' http://link.springer.com/chapter/10.1007%2F978-3-319-12454-4_15).
+#' Considerable effort has been put into making slfm fast and memory efficient,
+#' turning it an interesting alternative to simpler methods in terms
+#' of execution time. It implements versions of the SLFM using both type
+#' of mixtures: using a degenerate distribution or a very concentrated
+#' normal distribution for the spike part of the mixture. It also implements
+#' additional functions to help pre-process the data and fit the model
+#' for a large number of arrays. Includes functions to:
 #' 
 #'   * pre-process a set of matrices
 #'   * fit models to a set of matrices
 #'   * detailed summary of model fit
 #' 
-#' Considerable effort has been put into making slfm fast and memory efficient, so as slfm is an
-#' attractive alternative to simpler methods in terms of execution time.
-#'
 #' @docType package
 #' @name slfm-package
 NULL
diff --git a/R/slfm.r b/R/slfm.r
@@ -2,66 +2,91 @@
 #'
 #' This function is used to fit a Bayesian sparse
 #' latent factor model.
+#' 
+#' @references
+#' 1. Duarte, J. D. N. and Mayrink, V. D. (2015). Factor analysis with mixture modeling to evaluate coherent patterns in microarray data. In Interdisciplinary Bayesian Statistics, volume 118 of Springer Proceedings in Mathematics & Statistics, pages 185-195. Springer International Publishing.
 #'
 #' @param x matrix with the pre-processed data
-#' @param ite number of iterations of the MCMC algorithm
 #' @param a prior shape parameter for Gamma distribution 
 #' @param b prior scale parameter for Gamma distribution
 #' @param gamma_a prior parameter for Beta distribution
 #' @param gamma_b prior parameter for Beta distribution
-#' @param omega prior variance of the slab component
-#' @param omega_1 prior variance of the spike component
+#' @param omega_0 prior variance of the spike component
+#' @param omega_1 prior variance of the slab component
+#' @param sample sample size after burn-in
 #' @param burnin burn-in size
+#' @param lag lag for MCMC
+#' @param degenerate use the degenerate version of mixture
 #' @return x: data matrix
-#' @return p_star: matrix of MCMC chains for p_star parameter
+#' @return q_star: matrix of MCMC chains for q_star parameter
 #' @return alpha: summary table of MCMC chains for alpha parameter
 #' @return lambda: summary table of MCMC chains for lambda parameter
 #' @return sigma: summary table of MCMC chains for sigma parameter
 #' @return classification: classification of each alpha (`present`, `marginal`, `absent`)
 #' @export
 #' @importFrom coda as.mcmc
+#' @importFrom stats window
 #' @importFrom Rcpp evalCpp
 #' @useDynLib slfm
 #' @examples
 #' mat <- matrix(rnorm(2000), nrow = 20)
-#' slfm(mat, ite = 1000)
+#' slfm(mat, sample = 1000)
 slfm <- function(
-  x, ite, a = 2.1, b = 1.1, gamma_a = 1, gamma_b = 1,
-  omega = 10, omega_1 = 0.01, burnin = round(0.25*ite)) {
+  x, a = 2.1, b = 1.1, gamma_a = 1, gamma_b = 1,
+  omega_0 = 0.01, omega_1 = 10, sample = 1000, burnin = round(0.25*sample), lag = 1, degenerate = FALSE) {
 
+  ite <- sample + burnin
+
   # Convert the x input to numeric matrix
   x <- data.matrix(x)
 
-  res <- gibbs(x, ite, a, b, gamma_a, gamma_b, omega, omega_1)
-  p_star_matrix <- coda::as.mcmc(res[["p_star"]][burnin:ite,])
-  alpha_matrix <- coda::as.mcmc(res[["alpha"]][burnin:ite,])
-  lambda_matrix <- coda::as.mcmc(res[["lambda"]][burnin:ite,])
-  sigma_matrix <- coda::as.mcmc(res[["sigma"]][burnin:ite,])
+  if(degenerate) {
+    res <- slfm_MDN(x, a, b, gamma_a, gamma_b, omega_1, burnin, lag, sample)
+  } else {
+    res <- slfm_MNN(x, a, b, gamma_a, gamma_b, omega_0, omega_1, burnin, lag, sample)
+  }
+
+  after_burnin <- (burnin + 1):ite
+
+  q_star_matrix <- coda::as.mcmc(res[["qstar"]][after_burnin,])
+  q_star_matrix <- window(q_star_matrix, thin = lag)
+  hpds_q_star <- coda::HPDinterval(q_star_matrix)
+  stats_q_star <- summary(q_star_matrix)$statistics
+  alpha_clas <- format_classification(class_interval(hpds_q_star))
+  alpha_clas_mean <- class_mean(stats_q_star)
 
-  stats_alpha <- summary(alpha_matrix)$statistics
-  hpds_alpha <- coda::HPDinterval(alpha_matrix)
-  table_alpha <- cbind(stats_alpha, hpds_alpha)[,-4]
-  colnames(table_alpha)[4:5] = c("Upper HPD", "Lower HPD")
+  alpha_matrix <- coda::as.mcmc(res[["alpha"]][after_burnin,])
+  alpha_matrix <- window(alpha_matrix, thin = lag)
 
+  table_alpha <- alpha_estimation(res[["alpha"]][after_burnin,], alpha_clas_mean, res[["qstar"]][after_burnin,])
+
+  lambda_matrix <- coda::as.mcmc(res[["lambda"]][after_burnin,])
+  lambda_matrix <- window(lambda_matrix, thin = lag)
   stats_lambda <- summary(lambda_matrix)$statistics
   hpds_lambda <- coda::HPDinterval(lambda_matrix)
   table_lambda <- cbind(stats_lambda, hpds_lambda)[,-4]
   colnames(table_lambda)[4:5] = c("Upper HPD", "Lower HPD")
 
+  sigma_matrix <- coda::as.mcmc(res[["sigma2"]][after_burnin,])
+  sigma_matrix <- window(sigma_matrix, thin = lag)
   stats_sigma <- summary(sigma_matrix)$statistics
   hpds_sigma <- coda::HPDinterval(sigma_matrix)
   table_sigma <- cbind(stats_sigma, hpds_sigma)[,-4]
   colnames(table_sigma)[4:5] = c("Upper HPD", "Lower HPD")
 
-  hpds_p_star <- coda::HPDinterval(p_star_matrix)
-  alpha_clas <- format_classification(class_interval(hpds_p_star))
+  z_matrix <- coda::as.mcmc(res[["z"]][after_burnin,])
+  z_matrix <- window(z_matrix, thin = lag)
 
   obj <- list(
     x = x,
-    p_star = p_star_matrix,
     alpha = table_alpha,
     lambda = table_lambda,
     sigma = table_sigma,
+    alpha_matrix = alpha_matrix,
+    lambda_matrix = lambda_matrix,
+    sigma_matrix = sigma_matrix,
+    q_star = q_star_matrix,
+    z_matrix = z_matrix,
     classification = alpha_clas)
   class(obj) <- "slfm"
   obj
@@ -76,4 +101,22 @@ print.slfm <- function(x) {
   cat("\n")
   cat("Classification:","\n")
   print(x$classification)
+}
+
+alpha_estimation <- function(x, alpha_clas, q_star_matrix) {
+  table_list <- lapply(1:length(alpha_clas), function(i) {
+    chain_indicator <- q_star_matrix[, i] > 0.5
+    if(alpha_clas[i]) {
+      chain <- x[chain_indicator, i]
+    } else {
+      chain <- x[!chain_indicator, i]
+    }
+    chain.mcmc <- coda::as.mcmc(chain)
+    stats <- summary(chain.mcmc)$statistics
+    hpds <- coda::HPDinterval(chain.mcmc)
+    table <- c(stats, hpds)[-4]
+  })
+  table <- do.call(rbind, table_list)
+  colnames(table)[4:5] = c("Upper HPD", "Lower HPD")
+  table
 }