Skip to content

Commit

Permalink
version 0.2.1
Browse files Browse the repository at this point in the history
  • Loading branch information
falkcarl authored and cran-robot committed Mar 5, 2024
0 parents commit eb506d2
Show file tree
Hide file tree
Showing 23 changed files with 2,950 additions and 0 deletions.
30 changes: 30 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
Package: EMgaussian
Encoding: UTF-8
Type: Package
Title: Expectation-Maximization Algorithm for Multivariate Normal
(Gaussian) with Missing Data
Version: 0.2.1
Date: 2024-02-15
Authors@R:
c(person(given = c("Carl","F."),
family = "Falk",
role = c("cre","aut"),
email = "carl.falk@mcgill.ca"))
Maintainer: Carl F. Falk <carl.falk@mcgill.ca>
Description: Initially designed to distribute code for estimating the Gaussian
graphical model with Lasso regularization, also known as the graphical lasso
(glasso), using an Expectation-Maximization (EM) algorithm based on work by
Städler and Bühlmann (2012) <doi:10.1007/s11222-010-9219-7>. As a byproduct,
code for estimating means and covariances (or the precision matrix) under a
multivariate normal (Gaussian) distribution is also available.
License: GPL (>= 3)
Imports: Rcpp, matrixcalc, Matrix, lavaan, glasso, glassoFast, caret
Suggests: testthat (>= 3.0.0), psych, bootnet, qgraph, cglasso
LinkingTo: Rcpp, RcppArmadillo
Config/testthat/edition: 3
RoxygenNote: 7.3.1
NeedsCompilation: yes
Packaged: 2024-03-01 15:07:00 UTC; CarlFalkMcGill
Author: Carl F. Falk [cre, aut]
Repository: CRAN
Date/Publication: 2024-03-04 10:50:08 UTC
22 changes: 22 additions & 0 deletions MD5
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
a178f9f5f70605eb5789e4c7ffd5bc6f *DESCRIPTION
daafa4fda98cf384bd33ecd2a3d230fc *NAMESPACE
cf8bdc119d422fea64335ea290ec6d01 *R/RcppExports.R
e1d186a05a5c04b6a41a70bc032773c6 *R/emcov.R
1dd577a4eeb64734cc808296667dd3e6 *R/emprec.R
7cab095ff6848adedcf1952a8aa59cd5 *R/ggm.R
b943a1546a1e17be4eea235ec7b4616b *build/partial.rdb
ed2e9e01256568618b6951a2b224dd5c *man/EMggm.Rd
e4c699a9c8ce8c73baa16518daf6ae42 *man/em.cov.Rd
d260e8e6ad96ddb00fdd7c279b1f1fa1 *man/em.prec.Rd
5d45932fd026b43e72112847fd9361e0 *man/rhogrid.Rd
8cd9e64f2037947977bdcaed49c3b55f *man/startvals.cov.Rd
e62585629948e7dd143efbad43c2a6eb *src/Makevars
e62585629948e7dd143efbad43c2a6eb *src/Makevars.win
e19a1f9d4db95428293ee05cd2e62cdf *src/RcppExports.cpp
a60ed2b870b65be4b8619f7750dca635 *src/rcpp_EMcov.cpp
dd8f744419f5780f285d0412154d07de *src/rcpp_EMprec.cpp
63ec83a77b368f463f9278ee3f9ed82f *tests/testthat.R
c0dd06c77dff422f2f36fc10d8649d8d *tests/testthat/_snaps/em.md
42762f7a0c3b6eee6494f3ffaa5f24c8 *tests/testthat/_snaps/glasso.md
da5685828204bc3fadd6de6dc8f57b3c *tests/testthat/test-em.R
5db3f4e35b8193bce6f460bc9e78bde0 *tests/testthat/test-glasso.R
21 changes: 21 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Generated by roxygen2: do not edit by hand

export(EMggm)
export(em.cov)
export(em.prec)
export(rhogrid)
export(startvals.cov)
importFrom(Matrix,forceSymmetric)
importFrom(Matrix,nearPD)
importFrom(Rcpp,evalCpp)
importFrom(caret,createFolds)
importFrom(glasso,glasso)
importFrom(glassoFast,glassoFast)
importFrom(lavaan,lavCor)
importFrom(lavaan,lav_matrix_vechr)
importFrom(lavaan,lav_matrix_vechr_reverse)
importFrom(matrixcalc,is.positive.definite)
importFrom(matrixcalc,is.symmetric.matrix)
importFrom(stats,cov)
importFrom(stats,cov2cor)
useDynLib(EMgaussian)
35 changes: 35 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

imp1matcov <- function(D, muest, sigest) {
.Call('_EMgaussian_imp1matcov', PACKAGE = 'EMgaussian', D, muest, sigest)
}

imp2matcov <- function(D, sigest, t2) {
invisible(.Call('_EMgaussian_imp2matcov', PACKAGE = 'EMgaussian', D, sigest, t2))
}

EMcyclecov <- function(D, muest, sigest) {
.Call('_EMgaussian_EMcyclecov', PACKAGE = 'EMgaussian', D, muest, sigest)
}

nllcov <- function(d, muest, sigest) {
.Call('_EMgaussian_nllcov', PACKAGE = 'EMgaussian', d, muest, sigest)
}

imp1matprec <- function(D, muest, kest) {
.Call('_EMgaussian_imp1matprec', PACKAGE = 'EMgaussian', D, muest, kest)
}

imp2matprec <- function(D, kest, t2) {
invisible(.Call('_EMgaussian_imp2matprec', PACKAGE = 'EMgaussian', D, kest, t2))
}

EMcycleprec <- function(D, muest, kest) {
.Call('_EMgaussian_EMcycleprec', PACKAGE = 'EMgaussian', D, muest, kest)
}

nllprec <- function(d, muest, kest) {
.Call('_EMgaussian_nllprec', PACKAGE = 'EMgaussian', d, muest, kest)
}

197 changes: 197 additions & 0 deletions R/emcov.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
######################################################################
##
## Copyright 2019-2024 Carl F. Falk
##
## This program is free software: you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation, either version 3 of
## the License, or (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
## <http://www.gnu.org/licenses/>

#' EM algorithm for multivariate normal, covariance matrix parameterization
#'
#' @param dat Data frame or matrix that contains the raw data.
#' @param max.iter Max number of EM cycles.
#' @param tol Tolerance for change in parameter estimates across EM Cycles. If
#' all changes are less than \code{tol}, the algorithm terminates.
#' @param start Starting value method (see details).
#' @param debug (Experimental) set an integer value > 1 for some information as the algorithm runs.
#' @param ... Space for additional arguments, not currently used.
#' @details
#' This function computes all means and covariances among a set of variables using
#' the Expectation-Maximization (EM) algorithm to handle missing values, and assuming
#' multivariate normality. The EM code was originally developed for the precision
#' matrix parameterization (\code{\link{em.prec}}), i.e., the parameters are the
#' means and the inverse of the covariance matrix. But, this is easily modifiable
#' to handle a covariance matrix parameterization such that means and covariances
#' are the model parameters.
#'
#' For starting values, the function accepts either a list that has \code{mu} and
#' \code{S} slots corresponding to the starting mean and covariance matrix. This
#' is useful if the user would like to use custom starting values. Otherwise, a
#' character corresponding to any of the options available in the
#' \code{\link{startvals.cov}} function will be used to take a guess at starting values.
#'
#' @return
#' A list with the following:
#' \itemize{
#' \item{\code{p.est}: all parameter estimates as a vector (means followed by unique elements of precision matrix).}
#' \item{\code{mu}: estimated means.}
#' \item{\code{S}: estimated covariance matrix.}
#' \item{\code{it}: number of EM cycles completed.}
#' \item{\code{conv}: boolean value indicating convergence (TRUE) or not (FALSE).}
#' }
#'
#' @importFrom stats cov
#' @importFrom lavaan lav_matrix_vechr lav_matrix_vechr_reverse
#' @importFrom matrixcalc is.positive.definite is.symmetric.matrix
#' @importFrom Matrix nearPD forceSymmetric
#' @importFrom Rcpp evalCpp
#' @useDynLib EMgaussian
#' @export
#' @examples
#' \donttest{
#' library(psych)
#' data(bfi)
#' test <- em.cov(bfi[,1:25])
#' }
#'
##########################################################################################
# EM algorithm originally in Stadler & Buhlmann (2012), which is on missing data w/ Gaussian graphical model
# Try implementing their algorithm. Section 2.3.2
# Just remove the glasso part and we have a saturated mean and covariance matrix
em.cov <- function(dat, max.iter = 500, tol=1e-5, start=c("diag","pairwise","listwise","full"), debug=0,
...){

dat <- as.matrix(dat)

# obtain starting values
if(is.list(start) && is.vector(start$mustart) && is.matrix(start$covstart)){
mustart <- start$mustart
covstart <- start$covstart
} else if (is.character(start)) {
start <- startvals.cov(dat, start)
mustart <- start$mustart
covstart <- start$covstart
}

# track change in means and covariances
mu.est <- as.matrix(mustart)
S.est.mat <- covstart
S.est <- lav_matrix_vechr(S.est.mat)
p.est <- c(mustart, S.est)

# loop until convergence
conv<-FALSE
for(it in 1:max.iter){
if(debug>1){
print(paste0('iter ',it))
}

# Do one EM cycle
EMres <- EMcyclecov(dat, mu.est, S.est.mat)
mu.est <- EMres$mu
S.est.mat <- EMres$S

# Check S
if(!is.symmetric.matrix(S.est.mat)){
S.est.mat<-as.matrix(forceSymmetric(S.est.mat))
warnings("Non-symmetric S found after E step. Could indicate identification or estimation problem.")
}

# force positive-definite S
if(!is.positive.definite(S.est.mat)){
S.est.mat<-as.matrix(nearPD(S.est.mat)$mat)
warnings("Non-positive definite S found after E step. Could indicate identification or estimation problem.")
}

# Save estimates
S.est <- lav_matrix_vechr(S.est.mat)
p.old <- p.est
p.est <- c(mu.est,S.est)

# check for convergence
if(debug>2){print(max(abs(p.old-p.est)))}
if(max(abs(p.old-p.est)) < tol){
conv<-TRUE
break
}
}

out<-list(p.est=p.est, mu=mu.est, S=S.est.mat, it=it, conv=conv)
return(out)

}


#' Starting values for means and covariances
#'
#' @param dat Data frame or matrix that contains the raw data.
#' @param start Starting value method (see details).
#' @details
#' Attempts to figure out a starting values for the means and covariances for use
#' with other functions that do the EM algorithm such as \code{\link{em.prec}} or
#' \code{\link{em.cov}}. Note that means are determined univariately using all
#' available cases. For covariances, several options are available:
#'
#' - "diag" Use all available complete values to compute the variances of each variable and construct a diagonal covariance matrix.
#' - "pairwise" Pairwise (co)variances will be used to construct the starting covariance matrix.
#' - "listwise" Listwise deletion will be used and only those with complete data will contribute to the starting covariance matrix.
#' - "full" Cheat and use \code{lavaan} to obtain direct maximum likelihood estimates of covariances. This defeats the purpose to some extent, but not that \code{lavaan} may be quite slow compared to this implementation.
#'
#' @return
#' A list consisting of:
#' \itemize{
#' \item{\code{mustart} - starting values for means.}
#' \item{\code{covstart} - starting values for covariances.}
#' }
#' @importFrom stats cov
#' @importFrom lavaan lavCor
#' @importFrom matrixcalc is.positive.definite
#' @importFrom Matrix nearPD
#' @export
#' @examples
#' \donttest{
#' library(psych)
#' data(bfi)
#' startvals.cov(bfi[,1:25])
#' }
startvals.cov <- function(dat, start=c("diag","pairwise","listwise","full")){

start <- match.arg(start)

dat <- as.matrix(dat)

# obtain starting values
mustart <- colMeans(dat,na.rm=T)
if(start=="listwise"){
covstart <- cov(dat, use="complete.obs")
} else if (start=="pairwise"){
covstart <- cov(dat,use="pairwise.complete.obs")
} else if (start=="full"){
covstart <- lavCor(as.data.frame(dat), missing="ml", output="cov")
} else if (start=="diag"){
covstart <- diag(diag(cov(dat,use="pairwise.complete.obs")))
}

if(any(is.na(covstart))){
covstart[is.na(covstart)]<-0
warning("Some NAs in starting vals for covariance matrix. Replacing with zeros.")
}

# force positive-definite covstart
if(!is.positive.definite(covstart)){
covstart<-as.matrix(nearPD(covstart)$mat)
}

out <- list(mustart = mustart,
covstart = covstart)

return(out)

}

0 comments on commit eb506d2

Please sign in to comment.