version 0.2.1

cran · Dec 9, 2020 · 1e3096d · 1e3096d
1 parent a4ec7ad
commit 1e3096d
Show file tree

Hide file tree

Showing 15 changed files with 721 additions and 80 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: gsbm
 Type: Package
 Title: Estimate Parameters in the Generalized SBM
-Version: 0.1.1
+Version: 0.2.1
 Authors@R: c(person("Genevieve", "Robin", role = c("aut"), email = "genevieve.robin@inria.fr"),
   person("Solenne", "Gaucher", role = c("aut", "cre"), email = "solenne.gaucher@math.u-psud.fr"))
 Maintainer: Solenne Gaucher <solenne.gaucher@math.u-psud.fr>
@@ -11,12 +11,13 @@ Encoding: UTF-8
 LazyData: true
 RoxygenNote: 7.1.1
 Depends: R (>= 3.5.0)
-Imports: softImpute, RSpectra
-Suggests: knitr, rmarkdown, igraph, missSBM, RColorBrewer, Matrix
+Imports: softImpute, RSpectra, doParallel, Matrix, foreach
+Suggests: knitr, rmarkdown, igraph, missSBM, RColorBrewer, combinat,
+        testthat
 VignetteBuilder: knitr
 NeedsCompilation: no
-Packaged: 2020-07-09 14:58:34 UTC; Mathilde
+Packaged: 2020-12-08 11:27:58 UTC; Mathilde
 Author: Genevieve Robin [aut],
   Solenne Gaucher [aut, cre]
 Repository: CRAN
-Date/Publication: 2020-07-09 15:30:03 UTC
+Date/Publication: 2020-12-09 10:30:02 UTC
diff --git a/MD5 b/MD5
@@ -1,30 +1,32 @@
-835ff16c981970368443155b54be336e *DESCRIPTION
-95e4d40dcbac84615de626357dbc6025 *NAMESPACE
-6662814dad71b280d85b7a269e2a3191 *NEWS.md
+ad8f5978ec55d4c4b2f0ae1716a77c97 *DESCRIPTION
+148eb1c6412f98c43cc1768d4756667e *NAMESPACE
+eb8f1bd259e9d4c1018dc9cf587f362b *NEWS.md
 8cdccfe9a53a8a5f74161a795652fdc2 *R/PrimarySchool.R
 3320634fb68f02db006eabd5c2a181dc *R/blogosphere.R
 aea7baf9d052143808421a1f0019da37 *R/cross_validation.R
-a2035fd85404e3262ce8b4529e400fde *R/gsbm_mcgd.R
+b2175eb09ba8852b09ef7b25445c0628 *R/gsbm_mcgd.R
+0f81378c38c49403bbb99d5550973494 *R/gsbm_mcgd_parallel.R
 e30e19116c3fbe577515de4126558fca *R/les_miserables.R
 08237c350054a60addcb4cc151fd0c40 *README.md
 707c46ee4a9f262a6749cd8555d8a9cd *build/vignette.rds
 744ed95ada5aa4e5a67c7f42db396936 *data/PrimarySchool.RData
 ae50a8f2a07c919a73bb03be9793c5c8 *data/blogosphere.RData
 7ee26a6f1cfe9a0bf05d1aa40157c96b *data/les_miserables.RData
-8f92876665be71b05cc23f92f057a6de *inst/doc/PrimarySchool.R
-3ea91d2031c50ab397dece245e77cc34 *inst/doc/PrimarySchool.Rmd
-018895fe51c1d2848de14504046ed044 *inst/doc/PrimarySchool.html
+ea91f5b567103d674ea4808600a93c3f *inst/doc/PrimarySchool.R
+ee0b9cb0e76c0e2e7d1431f8e4eac063 *inst/doc/PrimarySchool.Rmd
+75ddd567fa5b2b2269d932357391a15d *inst/doc/PrimarySchool.html
 d774debe9d53de46fb848857145028d1 *inst/doc/blogosphere.R
 8a2a3d1db74e3b89111cde36777492fa *inst/doc/blogosphere.Rmd
 9fef1ede66e885c5f1720aa767293c82 *inst/doc/blogosphere.html
-9be011b77a215825543bc5ac850587e0 *inst/doc/les_miserables.R
-2f967a4a35b04bfe06e82bf3447d17be *inst/doc/les_miserables.Rmd
-5452fe681d95d93a435f620f3c3fe353 *inst/doc/les_miserables.html
+caf92677d5e9e88cef4147020b6846ff *inst/doc/les_miserables.R
+3eaf210648454500932b413b019771b8 *inst/doc/les_miserables.Rmd
+7cdd9e75f730dec98ac0addb8703e1a5 *inst/doc/les_miserables.html
 387d3146c267defe49f511cd5fb3f3f5 *man/PrimarySchool.Rd
 834669cfcb12d36058568906af07b265 *man/blogosphere.Rd
 c1198af29f06d597b2ab152f8b3fd0f0 *man/crossval.Rd
 21958ba85a99eb0570e20609164b6a17 *man/gsbm_mcgd.Rd
+7299c273ba49ecfbaeb0f23840926357 *man/gsbm_mcgd_parallel.Rd
 9d7704008efb5d4e32db59f35cb7a08e *man/les_miserables.Rd
-3ea91d2031c50ab397dece245e77cc34 *vignettes/PrimarySchool.Rmd
+ee0b9cb0e76c0e2e7d1431f8e4eac063 *vignettes/PrimarySchool.Rmd
 8a2a3d1db74e3b89111cde36777492fa *vignettes/blogosphere.Rmd
-2f967a4a35b04bfe06e82bf3447d17be *vignettes/les_miserables.Rmd
+3eaf210648454500932b413b019771b8 *vignettes/les_miserables.Rmd
diff --git a/NAMESPACE b/NAMESPACE
@@ -2,6 +2,12 @@
 
 export(crossval)
 export(gsbm_mcgd)
+export(gsbm_mcgd_parallel)
+import(Matrix)
+import(doParallel)
+import(foreach)
+import(parallel)
 importFrom(RSpectra,eigs_sym)
 importFrom(softImpute,softImpute)
 importFrom(stats,aggregate)
+importFrom(utils,globalVariables)
diff --git a/NEWS.md b/NEWS.md
@@ -2,6 +2,4 @@
 
 ## Minor update
 
-Computationally more efficient version of the function gsbm_mcgd
-
-New vignette on the application of GSBM to analyze interactions in a primary school
+Computationally more efficient version of the function gsbm_mcgd.
diff --git a/R/gsbm_mcgd.R b/R/gsbm_mcgd.R
@@ -70,9 +70,10 @@ gsbm_mcgd <- function(A, lambda1, lambda2, epsilon=0.1, U = NULL, maxit = 100, t
   Omega <- !is.na(A)
   if(is.null(S0)) {
     S0 <- matrix(0,n,n)
+    #S0 <- as(Matrix(0,n,n, sparse=T), "dgCMatrix")
   }
-  if(is.null(L0)) L0 <- matrix(0,n,n)
-  if(is.null(R0)) R0 <- sqrt((eigs_sym(L0, 1)$values)^2)
+  if(is.null(L0)) L0 <- matrix(0,n,n)#L0 <- as(Matrix(0,n,n, sparse=T), "dgCMatrix")
+  if(is.null(R0)) R0 <- 0
   if(is.null(U)) {
     U <- (1/2)*sum((A - S0)^2, na.rm = T)/lambda1
   }
@@ -95,6 +96,7 @@ gsbm_mcgd <- function(A, lambda1, lambda2, epsilon=0.1, U = NULL, maxit = 100, t
     flag <- TRUE
     while(flag){
       step <- 0.5*step
+      #mat <- t(pmax(1-step*lambda2/sqrt(colSums((S-step*G_S)^2, na.rm=T)),0)*t(S-G_S))
       mat <- sapply(1:n, function(j){
         max(1-step*lambda2/sqrt(sum((S[,j]-step*G_S[,j])^2, na.rm=T)),0)*(S[,j]-step*G_S[,j])
       })
@@ -103,7 +105,7 @@ gsbm_mcgd <- function(A, lambda1, lambda2, epsilon=0.1, U = NULL, maxit = 100, t
       flag <- obj > obj0
     }
     S <- mat
-    G_L <- -Omega*(A - S - t(S) - L) + epsilon*L
+    G_L <- -1*Omega*(A - S - t(S) - L) + epsilon*L
     obj0 <- (1/2)*sum((A0-S-t(S)-L)^2, na.rm = T)+lambda1*R+lambda2*sum(sqrt(colSums(S^2)))+epsilon*(norm(L, type="F")^2+norm(S, type="F")^2)
     step <- 2
     flag <- TRUE

diff --git a/R/gsbm_mcgd_parallel.R b/R/gsbm_mcgd_parallel.R
@@ -0,0 +1,101 @@
+#' Fit a Generalized Stochastic Block Model
+#'
+#' @description Given an adjacency matrix with missing observations, the function \code{gsbm_mgcd}
+#' robustly estimates the probabilities of connections between nodes.
+#'
+#' @param A nxn adjacency matrix
+#' @param lambda1 regularization parameter for nuclear norm penalty (positive number)
+#' @param lambda2 regularization parameter for 2,1-norm penalty (positive number)
+#' @param epsilon regularization parameter for the L2-norm penalty (positive number, if NULL, default method is applied)
+#' @param maxit maximum number of iterations (positive integer, if NULL, default method is applied)
+#' @param trace.it whether messages about convergence should be printed (boolean, if NULL, default is FALSE)
+#' @param step_S step size for the gradient step of S parameter (positive number)
+#' @param step_L step size for the gradient step of L parameter (positive number)
+#' @param n_cores number of cores to parallellize on (integer number, default is set with detectCores())
+#' @param save whether or not value of current estimates should be saved at each iteration (boolean)
+#' @param file if save is set to TRUE, name of the folder where current estimates should be saved (character string, file saved in file/L_iter.txt at iteration iter) 
+#'
+#' @return The estimate for the nxn matrix of probabilities of connections between nodes. It is
+#' given as the sum of a low-rank nxn matrix L, corresponding to connections between inlier
+#' nodes, and a column sparse nxn matrix S, corresponding to connections between outlier
+#' nodes and the rest of the network. The matrices L and S are such that
+#'
+#' E(A) = L - diag(L) + S + S'
+#'
+#' where E(A) is the expectation of the adjacency matrix, diag(L) is a nxn diagonal
+#' matrix with diagonal entries equal to those of L, and S' means S transposed.
+#'
+#' The return value is a list of components
+#'   \itemize{
+#'   \item{\code{A}}{ the adjacency matrix.}
+#'   \item{\code{L}}{ estimate for the low-rank component.}
+#'   \item{\code{S}}{ estimate for the column-sparse component.}
+#'   \item{\code{objective}}{ the value of the objective function.}
+#'   \item{\code{R}}{ a bound on the nuclear norm of the low-rank component.}
+#'   \item{\code{iter}}{ number of iterations between convergence of the objective function.}
+#'  }
+#'
+#' @importFrom stats aggregate
+#' @importFrom RSpectra eigs_sym
+#' @importFrom utils globalVariables
+#' @import parallel 
+#' @import foreach
+#' @import doParallel 
+#' @import Matrix
+#' @export
+#'
+
+gsbm_mcgd_parallel <- function(A, lambda1, lambda2, epsilon=0.1, maxit = 100, 
+                               step_L = 1e-2, step_S=0.1, trace.it = FALSE, 
+                               n_cores=detectCores(), save = FALSE, file=NULL){
+  n <- nrow(A)
+  S <- Matrix(0, n, n, sparse=TRUE)
+  L <- Matrix(0, n, n, sparse=TRUE)
+  R <- 0
+  U <- (1/2)*sum((A)^2, na.rm = T)/lambda1
+  iter <- 0
+  Omega <- mcmapply(function(i){!is.na(A[,i])}, 1:n, mc.cores=n_cores)
+  A[is.na(A)] <- 0 #mcmapply(function(i){A[i,][is.na(A[i,])] <- 0; return(A[i,])}, 1:n, mc.cores=n_cores)
+  cl <- makeForkCluster(nnodes = n_cores)
+  registerDoParallel(cl)
+  obj <- foreach(i = 1:n,.combine=c) %dopar% {sum(0.5*Omega[i,]*(A[i,])^2, na.rm=T)}
+  objective <- sum(obj)+lambda1*R
+  while(iter < maxit){
+    iter <- iter + 1
+    G_S <- mcmapply(function(i){-2*Omega[,i]*(A[,i] - L[,i]-S[i,]-S[,i])+epsilon*S[,i]}, 1:n, mc.cores=n_cores)
+    S <- mcmapply(function(i){
+      max(0,1-step_S*lambda2/norm(S[,i]-step_S*G_S[,i], type="2"))*(S[,i]-step_S*G_S[,i])
+    }, 1:n, mc.cores = n_cores)
+    G_L <- mcmapply(function(i){-Omega[,i]*(A[,i] - L[,i]-S[,i]-S[i,])+epsilon*L[,i]}, 1:n, mc.cores=n_cores)
+    svdL <- eigs_sym(G_L, 1)
+    D_t <- - sign(svdL$value)*svdL$vectors%*%t(svdL$vectors)
+    if(lambda1 >= - sum(D_t*G_L)){
+        R_tilde <- 0
+        L_tilde <- Matrix(0,n,n, sparse=TRUE)
+        L <- L + step_L*(L_tilde - L)
+        R <- R + step_L*(R_tilde - R)
+      } else{
+        R_tilde <- U
+        L_tilde <- U*D_t
+        L <- L + step_L*(L_tilde - L)
+        R <- R + step_L*(R_tilde - R)
+      }
+    obj <- foreach(i = 1:n,.combine=c) %dopar% {sum(0.5*Omega[i,]*(A[i,]-S[i,]-S[,i]-L[i,])^2, na.rm=T)+lambda2*sqrt(sum(S[,i]^2))+epsilon*sum(L[i,]^2)+epsilon*sum(S[,i]^2)}
+    obj <- sum(obj)+lambda1*R
+    objective <- c(objective, obj)
+    U <- obj/lambda1
+    if(trace.it && (iter%%10 == 0) ){
+      print(paste("iter ", iter))
+    }
+    if(save){
+      save(S, file=paste(file, "/S_", iter,".Rdata", sep=""))
+      save(L, file=paste(file, "/L_", iter,".Rdata", sep=""))
+      save(objective, file=paste(file, "/objective_", iter,".Rdata", sep=""))
+    }
+
+  }
+  stopCluster(cl)
+  return(list(A = A, L = L, S = S, objective = objective, R = R))
+}
+
+globalVariables(c("i"))
diff --git a/inst/doc/PrimarySchool.R b/inst/doc/PrimarySchool.R
@@ -9,7 +9,108 @@ library(Matrix)
 library(gsbm)
 library(igraph)
 library(RColorBrewer)
+library(combinat)
 
 data(PrimarySchool)
-summary(PrimarySchool)
+A <- PrimarySchool$A
+class <- PrimarySchool$class
+n <- dim(A)[1]
+class.names <- levels(class)
+class.effectifs <- table(class)
+
+## ----compute frequency of interaction-----------------------------------------
+# Compute frequency of interactions
+K = length(class.names)
+Q = matrix(rep(0, K*K), ncol = K, nrow = K)
+for (k1 in 1:K){
+  com1 = class.names[k1]
+  for (k2 in 1:K){
+    com2 = class.names[k2]
+    Q[k1, k2] <- mean(A[class == com1, class == com2], na.rm = TRUE)
+    Q[k2, k1] <- Q[k1, k2]
+  }
+}
+Q <- round(Q, 2)
+colnames(Q) <- class.names
+rownames(Q) <- class.names
+print(Q)
+
+## ----Plot network-------------------------------------------------------------
+# Vertex labels
+v.label <- rep(NA, n)
+
+# Graph without NA's
+A.noNA <- matrix(0, ncol = n, nrow = n)
+A.noNA[!is.na(A)] <- A[!is.na(A)]
+g <- graph_from_adjacency_matrix(A.noNA, mode = "undirected")
+
+# Layout
+W<- matrix(0, n, n)
+for (i in 1:n){
+  for (j in 1:n){
+      if ((!is.na(A[i,j])) && A[i,j]==1){
+        if (class[i]==class[j] && class[i]!= "teachers"){
+          W[i,j] <- 2
+        }
+        else{
+          W[i,j] <- 1
+        }
+      }
+  }
+}
+gw <- graph_from_adjacency_matrix(W, mode = "undirected")
+lay <- layout_with_fr(gw)
+pal <- brewer.pal(11, "Paired")
+
+# Plot network
+plot(g, vertex.label = v.label, vertex.size = (2 + colSums(A, na.rm =T)/5), vertex.color = class, layout = lay, palette = pal, edge.width = 0.3, edge.curved=0.1)
+
+legend("bottomleft", legend = class.names, col = pal, pch = 1, bty = 'n', cex = 0.5, ncol = 1)
+
+## ----run GSBM-----------------------------------------------------------------
+# Investigate these outliers
+T1<-Sys.time()
+res <- gsbm_mcgd(A, 8.5, 8, maxit = 100)
+T2<-Sys.time()
+Tdiff= difftime(T1, T2)
+print(paste0("running time : ", Tdiff))
+outliers <- which(colSums(res$S)>0)
+no <- length(outliers)
+outliers.class <- class[outliers]
+outliers.Q <- matrix(0,nrow = no, ncol = K)
+for (o in 1:no){
+  for (k in 1:K){
+    comk = class.names[k]
+    outliers.Q[o,k] <- mean(A[outliers[o],class == comk], na.rm = T)
+  }
+  print(paste0("node ", outliers[o], " is in class ", outliers.class[o]))
+  print(outliers.Q[o,], 2)
+  print("")
+}
+
+## ----recover the classes------------------------------------------------------
+# Compute svd and run k-means
+L.U <- svd(res$L[-outliers, -outliers], nu = 10)$u 
+est_class <- kmeans(L.U, 10, nstart = 100)$cluster
+length(est_class)
+
+# Recover the class among the permutations of labels
+equ_label = combinat::permn(1:10)
+error <- rep(0, length(equ_label))
+for (i in 1:length(equ_label)){
+  error[i] <- sum(class[-outliers] != (class.names[unlist(equ_label[i])])[est_class])
+}
+error[which.min(error)]
+class_est <- class.names[unlist(equ_label[which.min(error)])][est_class]
+class_est_out <- rep(NA, n)
+class_est_out[outliers] <- "outlier"
+class_est_out[-outliers] <- class_est
+class_est_out <- as.factor(class_est_out)
+
+# Plot network
+plot(g, vertex.label = v.label, vertex.size = (2 + colSums(A, na.rm =T)/5), vertex.color = class_est_out, layout = lay, palette = pal, edge.width = 0.3, edge.curved=0.1)
+
+class_est_out <- as.factor(class_est_out)
+
+