Skip to content

Commit

Permalink
version 3.8-1
Browse files Browse the repository at this point in the history
  • Loading branch information
Anhui Huang authored and cran-robot committed May 4, 2023
1 parent 96047f0 commit 0e17322
Show file tree
Hide file tree
Showing 10 changed files with 242 additions and 73 deletions.
6 changes: 3 additions & 3 deletions DESCRIPTION
@@ -1,16 +1,16 @@
Package: sparseSEM
Type: Package
Title: Sparse-Aware Maximum Likelihood for Structural Equation Models
Version: 3.8
Version: 3.8-1
Date: 2023-04-17
Authors@R: c(person("Anhui", "Huang", role=c("aut","ctb", "cre" ), email="anhuihuang@gmail.com"))
Maintainer: Anhui Huang <anhuihuang@gmail.com>
Description: Provides Sparse-aware maximum likelihood for structural equation models in inferring network structure (topology).
License: GPL
Packaged: 2023-04-20 21:41:45 UTC; anhui
Packaged: 2023-04-30 03:29:22 UTC; anhui
NeedsCompilation: yes
Repository: CRAN
Suggests: knitr
VignetteBuilder: knitr
Date/Publication: 2023-04-21 12:40:22 UTC
Date/Publication: 2023-05-04 11:10:02 UTC
Author: Anhui Huang [aut, ctb, cre]
18 changes: 9 additions & 9 deletions MD5
@@ -1,8 +1,8 @@
1b05d8af566d0bffa415ba8a72a06ef7 *DESCRIPTION
b7e54aa02846438c9b4f0739037123ec *DESCRIPTION
bb9f69b9616594cc1ef2c9e4eb5e5ac6 *NAMESPACE
8cd359134055c6ecbaadf520941084f5 *R/elasticNetSML.R
a584cfc82d00e74072985292295b1182 *R/elasticNetSMLcv.R
bb70a33048c3ca09bf30ee48faeeaac6 *R/elasticNetSMLpoint.R
6e6de2a876f1c72a84eb604631aa375f *R/elasticNetSML.R
e947b0a889d59c929ec417dcbdf1ed0a *R/elasticNetSMLcv.R
55812cecde6c9c0ccd876bb614910679 *R/elasticNetSMLpoint.R
cbb8c7e8520229faa4c6a0dd8777fa0a *R/lassoSML.R
bad019d38fef5c94082020c9de652c61 *build/vignette.rds
8b29113c7289ac8136cc2af61567dd89 *data/B.rda
Expand All @@ -15,15 +15,15 @@ a0d563ead9fa839619f86608431501e4 *inst/doc/sparseSEM.pdf
e434b8cffaeee8b604eaccc4d6ed6777 *man/Missing.Rd
7a7bcf5e61f9256b3f7720efceed6a1c *man/X.Rd
6ca57e348ffdb690bfab41db260ba484 *man/Y.Rd
9e1594ff77dab78098bdef3ae3b48641 *man/elasticNetSML.Rd
842e3224e99cc0a3e43a9b44ecbfda59 *man/elasticNetSMLcv.Rd
06480c625ce4c106a10a28f029f5bd0c *man/elasticNetSMLpoint.Rd
28259514d7120e1e69aa63918a0d6432 *man/elasticNetSML.Rd
45c31997b1c0689a683b83929b79a32e *man/elasticNetSMLcv.Rd
4123592eff8b03878aad52e61326ef95 *man/elasticNetSMLpoint.Rd
64bb11823214b0162f2a55803bb0b834 *man/lassoSML.Rd
02d5e802a778d403cfe84adc2a93721a *man/sparseSEM-package.Rd
6302b4ccb9f7d35e86457a6729ac32a6 *man/sparseSEM-package.Rd
03cb2d11b8ddc5bd6b6f1f28edf771ee *src/Makevars
c265ecb513d3138a1b228acaf30e9f44 *src/lassoSEM.c
e508b1c76529602e9e7b66b0b970bb89 *src/lassoSEM.h
bedaf16b9268fb2d7d4627c0f0e85072 *src/lassoSMLv10beta.c
41df57cb9c1bc415555d2afbe90975c4 *src/lassoSMLv10beta.c
7c22b1b500cab25872924e218f1645f5 *src/registerDynamicSymbol.c
4e619e3576dcad79dd24601d0d7fe1bc *vignettes/Fig1_nCPU.png
eabdc3a9c03e6b931f78ddf5c1e23a11 *vignettes/Fig2_sigma01.png
Expand Down
4 changes: 2 additions & 2 deletions R/elasticNetSML.R
Expand Up @@ -2,7 +2,7 @@ elasticNetSML <-
function(Y,X,Missing,B,Verbose = 0){
M = nrow(Y);
N = ncol(Y);
if(Verbose>=0) cat("\telastic net SML version_1;",M, "Genes, ", N , "samples; Verbose: ", Verbose, "\n\n")
if(Verbose>=0) cat("\telastic net SML;",M, "Nodes, ", N , "samples; Verbose: ", Verbose, "\n\n")
f = matrix(1,M,1);
stat = rep(0,6);

Expand All @@ -28,7 +28,7 @@ function(Y,X,Missing,B,Verbose = 0){
Bout = matrix(output$B,nrow= M, ncol = M, byrow = F);
fout = matrix(output$f,nrow= M, ncol = 1, byrow = F);
stat = matrix(output$stat,nrow = 6,ncol = 1, byrow = F);
stat


SMLresult <- list(Bout,fout,stat,simTime[1]);
names(SMLresult) <-c("weight","F","statistics","simTime")
Expand Down
34 changes: 26 additions & 8 deletions R/elasticNetSMLcv.R
@@ -1,14 +1,25 @@

elasticNetSMLcv <- function(Y,X,Missing,B,alpha_factors = seq(1,0.05, -0.05), lambda_factors =10^seq(-0.2,-4,-0.2) ,Verbose = 0){
elasticNetSMLcv <- function(Y,X,Missing,B,alpha_factors = seq(1,0.05, -0.05), lambda_factors =10^seq(-0.2,-4,-0.2), kFold = 5, Verbose = 0){
M = nrow(Y);
N = ncol(Y);
if(Verbose>=0) cat("\telastic net SML version_1;",M, "Genes, ", N , "samples; Verbose: ", Verbose, "\n\n")
if(Verbose>=0) cat("\telastic net SML;",M, "Nodes, ", N , "samples; Verbose: ", Verbose, "\n\n")
f = matrix(1,M,1);
stat = rep(0,6);
#------------------------------------------------------R_package parameter
nAlpha = length(alpha_factors);
nLambda = length(lambda_factors);
mseStd = rep(0,nLambda*2);
mse = rep(0,nLambda*nAlpha);
mseSte = rep(0,nLambda*nAlpha);
mseStd = rep(0, nLambda*2);

parameters = matrix(0,0,2);
for (i in alpha_factors){
col1 = rep(i,nLambda);
col2 = lambda_factors;
para = cbind(col1,col2);
parameters = rbind(parameters,para)
}

#------------------------------------------------------R_package parameter

#dyn.load("elasticSMLv1.dll")
Expand All @@ -26,7 +37,10 @@ elasticNetSMLcv <- function(Y,X,Missing,B,alpha_factors = seq(1,0.05, -0.05), la
nAlpha = as.integer(nAlpha),
lambda = as.double(lambda_factors),
nLambda = as.integer(nLambda),
mse = as.double(mse),
mseSte = as.double(mseSte),
mseStd = as.double(mseStd),
kFold = as.integer(kFold),
verbose = as.integer(Verbose),
package = "sparseSEM");

Expand All @@ -39,14 +53,18 @@ elasticNetSMLcv <- function(Y,X,Missing,B,alpha_factors = seq(1,0.05, -0.05), la
fout = matrix(output$f,nrow= M, ncol = 1, byrow = F);
stat = matrix(output$stat,nrow = 6,ncol = 1, byrow = F);
#------------------------------------------------------R_package parameter
mseStd = matrix(output$mseStd,nrow= nLambda, ncol = 2, byrow = F);
mseStd = cbind(lambda_factors,mseStd);
colnames(mseStd)<-c("lambda","mean Error", "Std");
# mseStd = matrix(output$mseStd,nrow= nLambda, ncol = 2, byrow = F);
mse = matrix(output$mse,nrow = nAlpha*nLambda, ncol =1, byrow= F)
mseSte = matrix(output$mseSte,nrow = nAlpha*nLambda, ncol =1, byrow= F)


cvResults = cbind(parameters,mse,mseSte);
colnames(cvResults)<-c("alpha","lambda","mean Error", "Ste");
#------------------------------------------------------R_package parameter


SMLresult <- list(Bout,fout,stat,simTime[1],mseStd);
names(SMLresult) <-c("weight","F","statistics","simTime","residuals")
SMLresult <- list(Bout,fout,stat,simTime[1],cvResults);
names(SMLresult) <-c("weight","F","statistics","simTime","cv")
return(SMLresult)
}

2 changes: 1 addition & 1 deletion R/elasticNetSMLpoint.R
Expand Up @@ -2,7 +2,7 @@
elasticNetSMLpoint <- function(Y,X,Missing,B,alpha_factor = 1, lambda_factor =0.01 ,Verbose = 0){
M = nrow(Y);
N = ncol(Y);
if(Verbose>=0) cat("\telastic net SML version_1;",M, "Genes, ", N , "samples; Verbose: ", Verbose, "\n\n")
if(Verbose>=0) cat("\telastic net SML;",M, "Nodes, ", N , "samples; Verbose: ", Verbose, "\n\n")
f = matrix(1,M,1);
stat = rep(0,6);
#------------------------------------------------------R_package parameter
Expand Down
76 changes: 67 additions & 9 deletions man/elasticNetSML.Rd
Expand Up @@ -5,24 +5,63 @@
The Elastic Net penalty for SEM
}
\description{
For each alpha from 0.95 to 0.05 at a step of 0.05, the function perform 5 fold CV for
lambda_max to lambda_min in 20 step to determine the optimal alpha and lambda for the data. }
Fit the elastic Net penalized SEM model with input data X, Y: Y = BY + fX + e.

For users new to this package, elasticNetSML provides the simplified entry point:
the Missing matrix can be all 0 (none or unknown), and the B matrix can be all 0 (unknown connections in the network),
thus only Y and X are needed.

Internally, the function obtains the optimal hyperparameters (alpha, lambda) from
k-fold cross validation (CV) with fixed k = 5; the grid of candidate alphas and the number of
steps for candidate lambdas are also fixed. Specifically, for each alpha from 0.95 to 0.05 at
a step of 0.05, the function performs 5-fold CV from lambda_max to lambda_min in 20 steps
to determine the optimal alpha and lambda for the data.

Once users get the idea of what the package and this function do, the package also
provides functions with more flexibility that allow users to tune the parameters. Please
see the functions elasticNetSMLcv() and elasticNetSMLpoint().

Note that the function is computationally expensive. Generally, the following steps are followed: \cr
Step 1. SEM-ridge regression (L2 penalty) with k-fold CV: this step finds the optimal ridge hyperparameter rho; \cr
Step 2. fit the SEM-ridge regression model (L2 penalty) with rho from Step 1 to obtain the initial (non-sparse)
network structure (B_ridge); \cr
Step 3. SEM-elastic net regression with k-fold CV: this step finds the optimal hyperparameters (alpha, lambda); \cr
Step 4. fit the SEM-elastic net model with (alpha, lambda) from Step 3; \cr
Step 5. compute the PD and FDR statistics, and provide the output.


For large scale network inference, a standalone C/C++ software with openMPI for
parallel computation is also available upon request.

If you are interested in implementing the algorithm to integrate with Spark, please reach out to me.
}
\usage{
elasticNetSML(Y, X, Missing, B, Verbose = 0)
}
%- maybe also 'usage' for other objects documented here.
\arguments{
\item{Y}{
gene expression M by N matrix
The observed node response data with dimension of M by N. Y is normalized inside the function.
}
\item{X}{
cis_eQTL M by N matrix
The network node attribute matrix with dimension M by N, with M being the number of nodes,
and N being the number of samples. Theoretically, X can be an L by N matrix, with L being the total
number of node attributes. However, in the current implementation, each node allows one and only one attribute.
If you have more than one attribute for some nodes, please consider selecting the top one by either
correlation or principal component methods.
X is normalized inside the function.
}
\item{Missing}{
missing data in Y
M by N matrix corresponding to elements of Y. 0 denotes no missing, while 1 denotes missing.
If a node j in sample i has a missing label (Missing[j,i] = 1), the node response Y[j,i] is set to 0.
}
\item{B}{
true network topology if available
For a network with M nodes, B is the M by M adjacency matrix.
If the data is simulated/has a known true network topology (i.e., a known adjacency matrix), the Power
of detection (PD) and False Discovery Rate (FDR) are computed in the output parameter 'statistics'.

If the true network topology is unknown, B is optional, and the PD/FDR in output parameter
'statistics' should be ignored.
}
\item{Verbose}{
describe the information output from 0 - 10, larger number means more output
Expand All @@ -33,9 +72,28 @@ elasticNetSML(Y, X, Missing, B, Verbose = 0)
}
\value{

\item{Bout}{the matrix B from sparseSEM}
\item{fout}{f: the weight for matrix X}
\item{stat}{compute the power and FDR statistics if the ture topology is provided}
\item{Bout}{
the computed weights for the network topology. B[i,j] = 0 means there is no edge between nodes i and j;
B[i,j] != 0 denotes an (undirected) edge between nodes i and j.
}
\item{fout}{
f is a 1 by M array keeping the weights for X (in the SEM: Y = BY + FX + e). Theoretically, F can be an M by L matrix,
with M being the number of nodes, and L being the total number of node attributes. However, in the current implementation,
each node only allows one and only one attribute.
If you have more than one attributes for some nodes, please consider selecting the top one by either
correlation or principal component methods.

}
\item{stat}{
statistics is a 1 x 6 array keeping record of:
1. correct positive
2. total positive
3. false positive
4. positive detected
5. Power of detection (PD) = correct positive/total positive
6. False Discovery Rate (FDR) = false positive/positive detected

}
\item{simTime}{computational time}

}
Expand Down
64 changes: 49 additions & 15 deletions man/elasticNetSMLcv.Rd
Expand Up @@ -6,47 +6,80 @@
}
\description{
While elasticNetSML function has a set of default (alpha, lambda) and the optimal
one is chosen by 5 fold cv, elasticNetSMLcv tests the combination of a set of
one is chosen by k = 5 fold CV, elasticNetSMLcv tests the combinations of a set of
alphas and lambdas, and chooses one as the optimal pair of parameters.
elasticNetSMLcv should be combined with elasticNetSMLpoint to obtain the network
inference.
For each alpha from the set of alphas provided, the function perform 5 fold CV for

For each alpha from the set of alphas provided, the function performs k-fold CV for
each user-supplied lambda to determine the optimal alpha and lambda for the data. }
\usage{
elasticNetSMLcv(Y, X, Missing, B, alpha_factors,lambda_factors, Verbose)
elasticNetSMLcv(Y, X, Missing, B, alpha_factors,lambda_factors,kFold, Verbose)
}
%- maybe also 'usage' for other objects documented here.
\arguments{
\item{Y}{
gene expression M by N matrix
The observed node response data with dimension of M by N. Y is normalized inside the function.
}
\item{X}{
cis_eQTL M by N matrix
The network node attribute matrix with dimension M by N, with M being the number of nodes,
and N being the number of samples. Theoretically, X can be an L by N matrix, with L being the total
number of node attributes. However, in the current implementation, each node allows one and only one attribute.
If you have more than one attribute for some nodes, please consider selecting the top one by either
correlation or principal component methods.
X is normalized inside the function.
}
\item{Missing}{
missing data in Y
M by N matrix corresponding to elements of Y. 0 denotes no missing, while 1 denotes missing.
If a node j in sample i has a missing label (Missing[j,i] = 1), the node response Y[j,i] is set to 0.
}
\item{B}{
true network topology if available
For a network with M nodes, B is the M by M adjacency matrix.
If the data is simulated/has a known true network topology (i.e., a known adjacency matrix), the Power
of detection (PD) and False Discovery Rate (FDR) are computed in the output parameter 'statistics'.

If the true network topology is unknown, B is optional, and the PD/FDR in output parameter
'statistics' should be ignored.
}
\item{alpha_factors}{
alpha_factors: the set of alphas to be tested, and is in range of (0, 1);
The set of candidate alpha values. Default is seq(from = 1, to = 0.05, by = -0.05)
}
\item{lambda_factors}{
penalty lambda_factor: the set of lambda to be tested, and is in range of (0, 1);
The set of candidate lambda values. Default is 10^seq(from = -0.2, to = -4, by = -0.2)
}
\item{kFold}{
k-fold cross validation, default k=5
}
\item{Verbose}{
describe the information output from 0 - 10, larger number means more output
}

}
\details{
the function performs CV and parameter inference, and calculates the power and FDR
}
\value{

\item{Bout}{the matrix B from sparseSEM}
\item{fout}{f: the weight for matrix X}
\item{stat}{compute the power and FDR statistics if the ture topology is provided}
\item{Bout}{
the computed weights for the network topology. B[i,j] = 0 means there is no edge between nodes i and j;
B[i,j] != 0 denotes an (undirected) edge between nodes i and j.
}
\item{fout}{
f is a 1 by M array keeping the weights for X (in the SEM: Y = BY + FX + e). Theoretically, F can be an M by L matrix,
with M being the number of nodes, and L being the total number of node attributes. However, in the current implementation,
each node only allows one and only one attribute.
If you have more than one attributes for some nodes, please consider selecting the top one by either
correlation or principal component methods.

}
\item{stat}{
statistics is a 1 x 6 array keeping record of:
1. correct positive
2. total positive
3. false positive
4. positive detected
5. Power of detection (PD) = correct positive/total positive
6. False Discovery Rate (FDR) = false positive/positive detected

}
\item{simTime}{computational time}
\item{residual}{only meaningful for 1 alpha: \cr
col1: lambdas; \cr
Expand All @@ -64,6 +97,7 @@ elasticNetSMLcv(Y, X, Missing, B, alpha_factors,lambda_factors, Verbose)
2) elasticNetSMLcv: user supplied alphas (one or more), lambdas; compute the optimal parameters and network parameters \cr
3) elasticNetSMLpoint: user supplied one alpha and one lambda, compute the network parameters

The user is responsible for setting the random seed to guarantee repeatable results.
}

%% ~Make other sections like Warning with \section{Warning }{....} ~
Expand All @@ -75,7 +109,7 @@ elasticNetSMLcv(Y, X, Missing, B, alpha_factors,lambda_factors, Verbose)
data(X);
data(Missing);
\dontrun{OUT <- elasticNetSMLcv(Y, X, Missing, B, alpha_factors = c(0.75, 0.5, 0.25),
lambda_factors=c(0.1, 0.01, 0.001), Verbose = 1);
lambda_factors=c(0.1, 0.01, 0.001), kFold = 5, Verbose = 1);
}
}
% Add one or more standard keywords, see file 'KEYWORDS' in the
Expand Down

0 comments on commit 0e17322

Please sign in to comment.