version 0.2.1

cran · Mar 5, 2024 · eb506d2 · eb506d2
commit eb506d2
Show file tree

Hide file tree

Showing 23 changed files with 2,950 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,30 @@
+Package: EMgaussian
+Encoding: UTF-8
+Type: Package
+Title: Expectation-Maximization Algorithm for Multivariate Normal
+        (Gaussian) with Missing Data
+Version: 0.2.1
+Date: 2024-02-15
+Authors@R: 
+  c(person(given = c("Carl","F."),
+           family = "Falk",
+           role = c("cre","aut"),
+           email = "carl.falk@mcgill.ca"))
+Maintainer: Carl F. Falk <carl.falk@mcgill.ca>
+Description: Initially designed to distribute code for estimating the Gaussian
+  graphical model with Lasso regularization, also known as the graphical lasso
+  (glasso), using an Expectation-Maximization (EM) algorithm based on work by
+  Städler and Bühlmann (2012) <doi:10.1007/s11222-010-9219-7>. As a byproduct,
+  code for estimating means and covariances (or the precision matrix) under a
+  multivariate normal (Gaussian) distribution is also available.
+License: GPL (>= 3)
+Imports: Rcpp, matrixcalc, Matrix, lavaan, glasso, glassoFast, caret
+Suggests: testthat (>= 3.0.0), psych, bootnet, qgraph, cglasso
+LinkingTo: Rcpp, RcppArmadillo
+Config/testthat/edition: 3
+RoxygenNote: 7.3.1
+NeedsCompilation: yes
+Packaged: 2024-03-01 15:07:00 UTC; CarlFalkMcGill
+Author: Carl F. Falk [cre, aut]
+Repository: CRAN
+Date/Publication: 2024-03-04 10:50:08 UTC
diff --git a/MD5 b/MD5
@@ -0,0 +1,22 @@
+a178f9f5f70605eb5789e4c7ffd5bc6f *DESCRIPTION
+daafa4fda98cf384bd33ecd2a3d230fc *NAMESPACE
+cf8bdc119d422fea64335ea290ec6d01 *R/RcppExports.R
+e1d186a05a5c04b6a41a70bc032773c6 *R/emcov.R
+1dd577a4eeb64734cc808296667dd3e6 *R/emprec.R
+7cab095ff6848adedcf1952a8aa59cd5 *R/ggm.R
+b943a1546a1e17be4eea235ec7b4616b *build/partial.rdb
+ed2e9e01256568618b6951a2b224dd5c *man/EMggm.Rd
+e4c699a9c8ce8c73baa16518daf6ae42 *man/em.cov.Rd
+d260e8e6ad96ddb00fdd7c279b1f1fa1 *man/em.prec.Rd
+5d45932fd026b43e72112847fd9361e0 *man/rhogrid.Rd
+8cd9e64f2037947977bdcaed49c3b55f *man/startvals.cov.Rd
+e62585629948e7dd143efbad43c2a6eb *src/Makevars
+e62585629948e7dd143efbad43c2a6eb *src/Makevars.win
+e19a1f9d4db95428293ee05cd2e62cdf *src/RcppExports.cpp
+a60ed2b870b65be4b8619f7750dca635 *src/rcpp_EMcov.cpp
+dd8f744419f5780f285d0412154d07de *src/rcpp_EMprec.cpp
+63ec83a77b368f463f9278ee3f9ed82f *tests/testthat.R
+c0dd06c77dff422f2f36fc10d8649d8d *tests/testthat/_snaps/em.md
+42762f7a0c3b6eee6494f3ffaa5f24c8 *tests/testthat/_snaps/glasso.md
+da5685828204bc3fadd6de6dc8f57b3c *tests/testthat/test-em.R
+5db3f4e35b8193bce6f460bc9e78bde0 *tests/testthat/test-glasso.R
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,21 @@
+# Generated by roxygen2: do not edit by hand
+
+export(EMggm)
+export(em.cov)
+export(em.prec)
+export(rhogrid)
+export(startvals.cov)
+importFrom(Matrix,forceSymmetric)
+importFrom(Matrix,nearPD)
+importFrom(Rcpp,evalCpp)
+importFrom(caret,createFolds)
+importFrom(glasso,glasso)
+importFrom(glassoFast,glassoFast)
+importFrom(lavaan,lavCor)
+importFrom(lavaan,lav_matrix_vechr)
+importFrom(lavaan,lav_matrix_vechr_reverse)
+importFrom(matrixcalc,is.positive.definite)
+importFrom(matrixcalc,is.symmetric.matrix)
+importFrom(stats,cov)
+importFrom(stats,cov2cor)
+useDynLib(EMgaussian)
diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -0,0 +1,35 @@
+# Generated by using Rcpp::compileAttributes() -> do not edit by hand
+# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+imp1matcov <- function(D, muest, sigest) {
+    .Call('_EMgaussian_imp1matcov', PACKAGE = 'EMgaussian', D, muest, sigest)
+}
+
+imp2matcov <- function(D, sigest, t2) {
+    invisible(.Call('_EMgaussian_imp2matcov', PACKAGE = 'EMgaussian', D, sigest, t2))
+}
+
+EMcyclecov <- function(D, muest, sigest) {
+    .Call('_EMgaussian_EMcyclecov', PACKAGE = 'EMgaussian', D, muest, sigest)
+}
+
+nllcov <- function(d, muest, sigest) {
+    .Call('_EMgaussian_nllcov', PACKAGE = 'EMgaussian', d, muest, sigest)
+}
+
+imp1matprec <- function(D, muest, kest) {
+    .Call('_EMgaussian_imp1matprec', PACKAGE = 'EMgaussian', D, muest, kest)
+}
+
+imp2matprec <- function(D, kest, t2) {
+    invisible(.Call('_EMgaussian_imp2matprec', PACKAGE = 'EMgaussian', D, kest, t2))
+}
+
+EMcycleprec <- function(D, muest, kest) {
+    .Call('_EMgaussian_EMcycleprec', PACKAGE = 'EMgaussian', D, muest, kest)
+}
+
+nllprec <- function(d, muest, kest) {
+    .Call('_EMgaussian_nllprec', PACKAGE = 'EMgaussian', d, muest, kest)
+}
+
diff --git a/R/emcov.R b/R/emcov.R
@@ -0,0 +1,197 @@
+######################################################################
+##
+## Copyright 2019-2024 Carl F. Falk
+##
+## This program is free software: you can redistribute it and/or
+## modify it under the terms of the GNU General Public License as
+## published by the Free Software Foundation, either version 3 of
+## the License, or (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+## <http://www.gnu.org/licenses/>
+
+#' EM algorithm for multivariate normal, covariance matrix parameterization
+#' 
+#' @param dat Data frame or matrix that contains the raw data.
+#' @param max.iter Max number of EM cycles.
+#' @param tol Tolerance for change in parameter estimates across EM Cycles. If
+#'   all changes are less than \code{tol}, the algorithm terminates.
+#' @param start Starting value method (see details).
+#' @param debug (Experimental) set an integer value > 1 for some information as the algorithm runs.
+#' @param ... Space for additional arguments, not currently used.
+#' @details
+#' This function computes all means and covariances among a set of variables using
+#' the Expectation-Maximization (EM) algorithm to handle missing values, and assuming
+#' multivariate normality. The EM code was originally developed for the precision
+#' matrix parameterization (\code{\link{em.prec}}), i.e., the parameters are the
+#' means and the inverse of the covariance matrix. But, this is easily modifiable
+#' to handle a covariance matrix parameterization such that means and covariances
+#' are the model parameters.
+#' 
+#' For starting values, the function accepts either a list that has \code{mu} and
+#' \code{S} slots corresponding to the starting mean and covariance matrix. This
+#' is useful if the user would like to use custom starting values. Otherwise, a
+#' character corresponding to any of the options available in the
+#' \code{\link{startvals.cov}} function will be used to take a guess at starting values.
+#' 
+#' @return
+#' A list with the following:
+#' \itemize{
+#'  \item{\code{p.est}: all parameter estimates as a vector (means followed by unique elements of precision matrix).}
+#'  \item{\code{mu}: estimated means.}
+#'  \item{\code{S}: estimated covariance matrix.}
+#'  \item{\code{it}: number of EM cycles completed.}
+#'  \item{\code{conv}: boolean value indicating convergence (TRUE) or not (FALSE).}
+#' }
+#' 
+#' @importFrom stats cov
+#' @importFrom lavaan lav_matrix_vechr lav_matrix_vechr_reverse
+#' @importFrom matrixcalc is.positive.definite is.symmetric.matrix
+#' @importFrom Matrix nearPD forceSymmetric
+#' @importFrom Rcpp evalCpp
+#' @useDynLib EMgaussian
+#' @export
+#' @examples
+#' \donttest{
+#'   library(psych)
+#'   data(bfi)
+#'   test <- em.cov(bfi[,1:25])
+#' }
+#' 
+##########################################################################################
+# EM algorithm originally in Stadler & Buhlmann (2012), which is on missing data w/ Gaussian graphical model
+# Try implementing their algorithm. Section 2.3.2
+# Just remove the glasso part and we have a saturated mean and covariance matrix
+em.cov <- function(dat, max.iter = 500, tol=1e-5, start=c("diag","pairwise","listwise","full"), debug=0,
+                         ...){
+
+  dat <- as.matrix(dat)
+
+  # obtain starting values  
+  if(is.list(start) && is.vector(start$mustart) && is.matrix(start$covstart)){
+    mustart <- start$mustart
+    covstart <- start$covstart
+  } else if (is.character(start)) {
+    start <- startvals.cov(dat, start)
+    mustart <- start$mustart
+    covstart <- start$covstart
+  }
+
+  # track change in means and covariances
+  mu.est <- as.matrix(mustart)
+  S.est.mat <- covstart
+  S.est <- lav_matrix_vechr(S.est.mat)
+  p.est <- c(mustart, S.est)
+
+  # loop until convergence
+  conv<-FALSE
+  for(it in 1:max.iter){
+    if(debug>1){
+      print(paste0('iter ',it))
+    }
+
+    # Do one EM cycle
+    EMres <- EMcyclecov(dat, mu.est, S.est.mat)
+    mu.est <- EMres$mu
+    S.est.mat <- EMres$S
+
+    # Check S
+    if(!is.symmetric.matrix(S.est.mat)){
+      S.est.mat<-as.matrix(forceSymmetric(S.est.mat))
+      warnings("Non-symmetric S found after E step. Could indicate identification or estimation problem.")
+    }
+
+    # force positive-definite S
+    if(!is.positive.definite(S.est.mat)){
+      S.est.mat<-as.matrix(nearPD(S.est.mat)$mat)
+      warnings("Non-positive definite S found after E step. Could indicate identification or estimation problem.")
+    }
+
+    # Save estimates
+    S.est <- lav_matrix_vechr(S.est.mat)
+    p.old <- p.est
+    p.est <- c(mu.est,S.est)
+
+    # check for convergence
+    if(debug>2){print(max(abs(p.old-p.est)))}
+    if(max(abs(p.old-p.est)) < tol){
+      conv<-TRUE
+      break
+    }
+  }
+
+  out<-list(p.est=p.est, mu=mu.est, S=S.est.mat, it=it, conv=conv)
+  return(out)
+
+}
+
+
+#' Starting values for means and covariances
+#' 
+#' @param dat Data frame or matrix that contains the raw data.
+#' @param start Starting value method (see details).
+#' @details
+#' Attempts to figure out a starting values for the means and covariances for use
+#' with other functions that do the EM algorithm such as \code{\link{em.prec}} or
+#' \code{\link{em.cov}}. Note that means are determined univariately using all
+#' available cases. For covariances, several options are available:
+#' 
+#' - "diag" Use all available complete values to compute the variances of each variable and construct a diagonal covariance matrix.
+#' - "pairwise" Pairwise (co)variances will be used to construct the starting covariance matrix.
+#' - "listwise" Listwise deletion will be used and only those with complete data will contribute to the starting covariance matrix.
+#' - "full" Cheat and use \code{lavaan} to obtain direct maximum likelihood estimates of covariances. This defeats the purpose to some extent, but not that \code{lavaan} may be quite slow compared to this implementation.
+#' 
+#' @return
+#' A list consisting of:
+#' \itemize{
+#'   \item{\code{mustart} - starting values for means.}
+#'   \item{\code{covstart} - starting values for covariances.}
+#' }
+#' @importFrom stats cov
+#' @importFrom lavaan lavCor
+#' @importFrom matrixcalc is.positive.definite
+#' @importFrom Matrix nearPD
+#' @export
+#' @examples
+#' \donttest{
+#'   library(psych)
+#'   data(bfi)
+#'   startvals.cov(bfi[,1:25])
+#' }
+startvals.cov <- function(dat, start=c("diag","pairwise","listwise","full")){
+
+  start <- match.arg(start)
+
+  dat <- as.matrix(dat)
+
+  # obtain starting values
+  mustart <- colMeans(dat,na.rm=T)
+  if(start=="listwise"){
+    covstart <- cov(dat, use="complete.obs")    
+  } else if (start=="pairwise"){
+    covstart <- cov(dat,use="pairwise.complete.obs")
+  } else if (start=="full"){
+    covstart <- lavCor(as.data.frame(dat), missing="ml", output="cov")
+  } else if (start=="diag"){
+    covstart <- diag(diag(cov(dat,use="pairwise.complete.obs")))
+  }
+
+  if(any(is.na(covstart))){
+    covstart[is.na(covstart)]<-0
+    warning("Some NAs in starting vals for covariance matrix. Replacing with zeros.")
+  }
+
+  # force positive-definite covstart
+  if(!is.positive.definite(covstart)){
+    covstart<-as.matrix(nearPD(covstart)$mat)
+  }
+
+  out <- list(mustart = mustart,
+              covstart = covstart)
+
+  return(out)
+
+}