From 1e3096d64d4a0a2c323a11563e8222f091ef99ea Mon Sep 17 00:00:00 2001 From: Solenne Gaucher Date: Wed, 9 Dec 2020 09:30:02 +0000 Subject: [PATCH] version 0.2.1 --- DESCRIPTION | 11 +-- MD5 | 26 ++++--- NAMESPACE | 6 ++ NEWS.md | 4 +- R/gsbm_mcgd.R | 8 +- R/gsbm_mcgd_parallel.R | 101 +++++++++++++++++++++++++ inst/doc/PrimarySchool.R | 103 ++++++++++++++++++++++++- inst/doc/PrimarySchool.Rmd | 119 ++++++++++++++++++++++++++++- inst/doc/PrimarySchool.html | 143 +++++++++++++++++++++++++++++++++-- inst/doc/les_miserables.R | 5 +- inst/doc/les_miserables.Rmd | 5 +- inst/doc/les_miserables.html | 78 +++++++++---------- man/gsbm_mcgd_parallel.Rd | 68 +++++++++++++++++ vignettes/PrimarySchool.Rmd | 119 ++++++++++++++++++++++++++++- vignettes/les_miserables.Rmd | 5 +- 15 files changed, 721 insertions(+), 80 deletions(-) create mode 100644 R/gsbm_mcgd_parallel.R create mode 100644 man/gsbm_mcgd_parallel.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 2fc4f88..7c940d6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: gsbm Type: Package Title: Estimate Parameters in the Generalized SBM -Version: 0.1.1 +Version: 0.2.1 Authors@R: c(person("Genevieve", "Robin", role = c("aut"), email = "genevieve.robin@inria.fr"), person("Solenne", "Gaucher", role = c("aut", "cre"), email = "solenne.gaucher@math.u-psud.fr")) Maintainer: Solenne Gaucher @@ -11,12 +11,13 @@ Encoding: UTF-8 LazyData: true RoxygenNote: 7.1.1 Depends: R (>= 3.5.0) -Imports: softImpute, RSpectra -Suggests: knitr, rmarkdown, igraph, missSBM, RColorBrewer, Matrix +Imports: softImpute, RSpectra, doParallel, Matrix, foreach +Suggests: knitr, rmarkdown, igraph, missSBM, RColorBrewer, combinat, + testthat VignetteBuilder: knitr NeedsCompilation: no -Packaged: 2020-07-09 14:58:34 UTC; Mathilde +Packaged: 2020-12-08 11:27:58 UTC; Mathilde Author: Genevieve Robin [aut], Solenne Gaucher [aut, cre] Repository: CRAN -Date/Publication: 2020-07-09 15:30:03 UTC +Date/Publication: 2020-12-09 10:30:02 UTC diff --git a/MD5 b/MD5 index d756c77..99b82c3 100644 --- a/MD5 +++ b/MD5 @@ -1,30 +1,32 @@ -835ff16c981970368443155b54be336e *DESCRIPTION -95e4d40dcbac84615de626357dbc6025 *NAMESPACE -6662814dad71b280d85b7a269e2a3191 *NEWS.md +ad8f5978ec55d4c4b2f0ae1716a77c97 *DESCRIPTION +148eb1c6412f98c43cc1768d4756667e *NAMESPACE +eb8f1bd259e9d4c1018dc9cf587f362b *NEWS.md 8cdccfe9a53a8a5f74161a795652fdc2 *R/PrimarySchool.R 3320634fb68f02db006eabd5c2a181dc *R/blogosphere.R aea7baf9d052143808421a1f0019da37 *R/cross_validation.R -a2035fd85404e3262ce8b4529e400fde *R/gsbm_mcgd.R +b2175eb09ba8852b09ef7b25445c0628 *R/gsbm_mcgd.R +0f81378c38c49403bbb99d5550973494 *R/gsbm_mcgd_parallel.R e30e19116c3fbe577515de4126558fca *R/les_miserables.R 08237c350054a60addcb4cc151fd0c40 *README.md 707c46ee4a9f262a6749cd8555d8a9cd *build/vignette.rds 744ed95ada5aa4e5a67c7f42db396936 *data/PrimarySchool.RData ae50a8f2a07c919a73bb03be9793c5c8 *data/blogosphere.RData 7ee26a6f1cfe9a0bf05d1aa40157c96b *data/les_miserables.RData -8f92876665be71b05cc23f92f057a6de *inst/doc/PrimarySchool.R -3ea91d2031c50ab397dece245e77cc34 *inst/doc/PrimarySchool.Rmd -018895fe51c1d2848de14504046ed044 *inst/doc/PrimarySchool.html +ea91f5b567103d674ea4808600a93c3f *inst/doc/PrimarySchool.R +ee0b9cb0e76c0e2e7d1431f8e4eac063 *inst/doc/PrimarySchool.Rmd +75ddd567fa5b2b2269d932357391a15d *inst/doc/PrimarySchool.html d774debe9d53de46fb848857145028d1 *inst/doc/blogosphere.R 8a2a3d1db74e3b89111cde36777492fa *inst/doc/blogosphere.Rmd 9fef1ede66e885c5f1720aa767293c82 *inst/doc/blogosphere.html -9be011b77a215825543bc5ac850587e0 *inst/doc/les_miserables.R -2f967a4a35b04bfe06e82bf3447d17be *inst/doc/les_miserables.Rmd -5452fe681d95d93a435f620f3c3fe353 *inst/doc/les_miserables.html +caf92677d5e9e88cef4147020b6846ff *inst/doc/les_miserables.R +3eaf210648454500932b413b019771b8 *inst/doc/les_miserables.Rmd +7cdd9e75f730dec98ac0addb8703e1a5 *inst/doc/les_miserables.html 387d3146c267defe49f511cd5fb3f3f5 *man/PrimarySchool.Rd 834669cfcb12d36058568906af07b265 *man/blogosphere.Rd c1198af29f06d597b2ab152f8b3fd0f0 *man/crossval.Rd 21958ba85a99eb0570e20609164b6a17 *man/gsbm_mcgd.Rd +7299c273ba49ecfbaeb0f23840926357 *man/gsbm_mcgd_parallel.Rd 9d7704008efb5d4e32db59f35cb7a08e *man/les_miserables.Rd -3ea91d2031c50ab397dece245e77cc34 *vignettes/PrimarySchool.Rmd +ee0b9cb0e76c0e2e7d1431f8e4eac063 *vignettes/PrimarySchool.Rmd 8a2a3d1db74e3b89111cde36777492fa *vignettes/blogosphere.Rmd -2f967a4a35b04bfe06e82bf3447d17be *vignettes/les_miserables.Rmd +3eaf210648454500932b413b019771b8 *vignettes/les_miserables.Rmd diff --git a/NAMESPACE b/NAMESPACE index c876a7d..e096cdf 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,12 @@ export(crossval) export(gsbm_mcgd) +export(gsbm_mcgd_parallel) +import(Matrix) +import(doParallel) +import(foreach) +import(parallel) importFrom(RSpectra,eigs_sym) importFrom(softImpute,softImpute) importFrom(stats,aggregate) +importFrom(utils,globalVariables) diff --git a/NEWS.md b/NEWS.md index 40f8687..2c9a836 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,4 @@ ## Minor update -Computationally more efficient version of the function gsbm_mcgd - -New vignette on the application of GSBM to analyze interactions in a primary school \ No newline at end of file +Computationally more efficient version of the function gsbm_mcgd. diff --git a/R/gsbm_mcgd.R b/R/gsbm_mcgd.R index 8beb01d..58e3adf 100644 --- a/R/gsbm_mcgd.R +++ b/R/gsbm_mcgd.R @@ -70,9 +70,10 @@ gsbm_mcgd <- function(A, lambda1, lambda2, epsilon=0.1, U = NULL, maxit = 100, t Omega <- !is.na(A) if(is.null(S0)) { S0 <- matrix(0,n,n) + #S0 <- as(Matrix(0,n,n, sparse=T), "dgCMatrix") } - if(is.null(L0)) L0 <- matrix(0,n,n) - if(is.null(R0)) R0 <- sqrt((eigs_sym(L0, 1)$values)^2) + if(is.null(L0)) L0 <- matrix(0,n,n)#L0 <- as(Matrix(0,n,n, sparse=T), "dgCMatrix") + if(is.null(R0)) R0 <- 0 if(is.null(U)) { U <- (1/2)*sum((A - S0)^2, na.rm = T)/lambda1 } @@ -95,6 +96,7 @@ gsbm_mcgd <- function(A, lambda1, lambda2, epsilon=0.1, U = NULL, maxit = 100, t flag <- TRUE while(flag){ step <- 0.5*step + #mat <- t(pmax(1-step*lambda2/sqrt(colSums((S-step*G_S)^2, na.rm=T)),0)*t(S-G_S)) mat <- sapply(1:n, function(j){ max(1-step*lambda2/sqrt(sum((S[,j]-step*G_S[,j])^2, na.rm=T)),0)*(S[,j]-step*G_S[,j]) }) @@ -103,7 +105,7 @@ gsbm_mcgd <- function(A, lambda1, lambda2, epsilon=0.1, U = NULL, maxit = 100, t flag <- obj > obj0 } S <- mat - G_L <- -Omega*(A - S - t(S) - L) + epsilon*L + G_L <- -1*Omega*(A - S - t(S) - L) + epsilon*L obj0 <- (1/2)*sum((A0-S-t(S)-L)^2, na.rm = T)+lambda1*R+lambda2*sum(sqrt(colSums(S^2)))+epsilon*(norm(L, type="F")^2+norm(S, type="F")^2) step <- 2 flag <- TRUE diff --git a/R/gsbm_mcgd_parallel.R b/R/gsbm_mcgd_parallel.R new file mode 100644 index 0000000..c884c90 --- /dev/null +++ b/R/gsbm_mcgd_parallel.R @@ -0,0 +1,101 @@ +#' Fit a Generalized Stochastic Block Model +#' +#' @description Given an adjacency matrix with missing observations, the function \code{gsbm_mgcd} +#' robustly estimates the probabilities of connections between nodes. +#' +#' @param A nxn adjacency matrix +#' @param lambda1 regularization parameter for nuclear norm penalty (positive number) +#' @param lambda2 regularization parameter for 2,1-norm penalty (positive number) +#' @param epsilon regularization parameter for the L2-norm penalty (positive number, if NULL, default method is applied) +#' @param maxit maximum number of iterations (positive integer, if NULL, default method is applied) +#' @param trace.it whether messages about convergence should be printed (boolean, if NULL, default is FALSE) +#' @param step_S step size for the gradient step of S parameter (positive number) +#' @param step_L step size for the gradient step of L parameter (positive number) +#' @param n_cores number of cores to parallellize on (integer number, default is set with detectCores()) +#' @param save whether or not value of current estimates should be saved at each iteration (boolean) +#' @param file if save is set to TRUE, name of the folder where current estimates should be saved (character string, file saved in file/L_iter.txt at iteration iter) +#' +#' @return The estimate for the nxn matrix of probabilities of connections between nodes. It is +#' given as the sum of a low-rank nxn matrix L, corresponding to connections between inlier +#' nodes, and a column sparse nxn matrix S, corresponding to connections between outlier +#' nodes and the rest of the network. The matrices L and S are such that +#' +#' E(A) = L - diag(L) + S + S' +#' +#' where E(A) is the expectation of the adjacency matrix, diag(L) is a nxn diagonal +#' matrix with diagonal entries equal to those of L, and S' means S transposed. +#' +#' The return value is a list of components +#' \itemize{ +#' \item{\code{A}}{ the adjacency matrix.} +#' \item{\code{L}}{ estimate for the low-rank component.} +#' \item{\code{S}}{ estimate for the column-sparse component.} +#' \item{\code{objective}}{ the value of the objective function.} +#' \item{\code{R}}{ a bound on the nuclear norm of the low-rank component.} +#' \item{\code{iter}}{ number of iterations between convergence of the objective function.} +#' } +#' +#' @importFrom stats aggregate +#' @importFrom RSpectra eigs_sym +#' @importFrom utils globalVariables +#' @import parallel +#' @import foreach +#' @import doParallel +#' @import Matrix +#' @export +#' + +gsbm_mcgd_parallel <- function(A, lambda1, lambda2, epsilon=0.1, maxit = 100, + step_L = 1e-2, step_S=0.1, trace.it = FALSE, + n_cores=detectCores(), save = FALSE, file=NULL){ + n <- nrow(A) + S <- Matrix(0, n, n, sparse=TRUE) + L <- Matrix(0, n, n, sparse=TRUE) + R <- 0 + U <- (1/2)*sum((A)^2, na.rm = T)/lambda1 + iter <- 0 + Omega <- mcmapply(function(i){!is.na(A[,i])}, 1:n, mc.cores=n_cores) + A[is.na(A)] <- 0 #mcmapply(function(i){A[i,][is.na(A[i,])] <- 0; return(A[i,])}, 1:n, mc.cores=n_cores) + cl <- makeForkCluster(nnodes = n_cores) + registerDoParallel(cl) + obj <- foreach(i = 1:n,.combine=c) %dopar% {sum(0.5*Omega[i,]*(A[i,])^2, na.rm=T)} + objective <- sum(obj)+lambda1*R + while(iter < maxit){ + iter <- iter + 1 + G_S <- mcmapply(function(i){-2*Omega[,i]*(A[,i] - L[,i]-S[i,]-S[,i])+epsilon*S[,i]}, 1:n, mc.cores=n_cores) + S <- mcmapply(function(i){ + max(0,1-step_S*lambda2/norm(S[,i]-step_S*G_S[,i], type="2"))*(S[,i]-step_S*G_S[,i]) + }, 1:n, mc.cores = n_cores) + G_L <- mcmapply(function(i){-Omega[,i]*(A[,i] - L[,i]-S[,i]-S[i,])+epsilon*L[,i]}, 1:n, mc.cores=n_cores) + svdL <- eigs_sym(G_L, 1) + D_t <- - sign(svdL$value)*svdL$vectors%*%t(svdL$vectors) + if(lambda1 >= - sum(D_t*G_L)){ + R_tilde <- 0 + L_tilde <- Matrix(0,n,n, sparse=TRUE) + L <- L + step_L*(L_tilde - L) + R <- R + step_L*(R_tilde - R) + } else{ + R_tilde <- U + L_tilde <- U*D_t + L <- L + step_L*(L_tilde - L) + R <- R + step_L*(R_tilde - R) + } + obj <- foreach(i = 1:n,.combine=c) %dopar% {sum(0.5*Omega[i,]*(A[i,]-S[i,]-S[,i]-L[i,])^2, na.rm=T)+lambda2*sqrt(sum(S[,i]^2))+epsilon*sum(L[i,]^2)+epsilon*sum(S[,i]^2)} + obj <- sum(obj)+lambda1*R + objective <- c(objective, obj) + U <- obj/lambda1 + if(trace.it && (iter%%10 == 0) ){ + print(paste("iter ", iter)) + } + if(save){ + save(S, file=paste(file, "/S_", iter,".Rdata", sep="")) + save(L, file=paste(file, "/L_", iter,".Rdata", sep="")) + save(objective, file=paste(file, "/objective_", iter,".Rdata", sep="")) + } + + } + stopCluster(cl) + return(list(A = A, L = L, S = S, objective = objective, R = R)) +} + +globalVariables(c("i")) \ No newline at end of file diff --git a/inst/doc/PrimarySchool.R b/inst/doc/PrimarySchool.R index b48a578..f08c35f 100644 --- a/inst/doc/PrimarySchool.R +++ b/inst/doc/PrimarySchool.R @@ -9,7 +9,108 @@ library(Matrix) library(gsbm) library(igraph) library(RColorBrewer) +library(combinat) data(PrimarySchool) -summary(PrimarySchool) +A <- PrimarySchool$A +class <- PrimarySchool$class +n <- dim(A)[1] +class.names <- levels(class) +class.effectifs <- table(class) + +## ----compute frequency of interaction----------------------------------------- +# Compute frequency of interactions +K = length(class.names) +Q = matrix(rep(0, K*K), ncol = K, nrow = K) +for (k1 in 1:K){ + com1 = class.names[k1] + for (k2 in 1:K){ + com2 = class.names[k2] + Q[k1, k2] <- mean(A[class == com1, class == com2], na.rm = TRUE) + Q[k2, k1] <- Q[k1, k2] + } +} +Q <- round(Q, 2) +colnames(Q) <- class.names +rownames(Q) <- class.names +print(Q) + +## ----Plot network------------------------------------------------------------- +# Vertex labels +v.label <- rep(NA, n) + +# Graph without NA's +A.noNA <- matrix(0, ncol = n, nrow = n) +A.noNA[!is.na(A)] <- A[!is.na(A)] +g <- graph_from_adjacency_matrix(A.noNA, mode = "undirected") + +# Layout +W<- matrix(0, n, n) +for (i in 1:n){ + for (j in 1:n){ + if ((!is.na(A[i,j])) && A[i,j]==1){ + if (class[i]==class[j] && class[i]!= "teachers"){ + W[i,j] <- 2 + } + else{ + W[i,j] <- 1 + } + } + } +} +gw <- graph_from_adjacency_matrix(W, mode = "undirected") +lay <- layout_with_fr(gw) +pal <- brewer.pal(11, "Paired") + +# Plot network +plot(g, vertex.label = v.label, vertex.size = (2 + colSums(A, na.rm =T)/5), vertex.color = class, layout = lay, palette = pal, edge.width = 0.3, edge.curved=0.1) + +legend("bottomleft", legend = class.names, col = pal, pch = 1, bty = 'n', cex = 0.5, ncol = 1) + +## ----run GSBM----------------------------------------------------------------- +# Investigate these outliers +T1<-Sys.time() +res <- gsbm_mcgd(A, 8.5, 8, maxit = 100) +T2<-Sys.time() +Tdiff= difftime(T1, T2) +print(paste0("running time : ", Tdiff)) +outliers <- which(colSums(res$S)>0) +no <- length(outliers) +outliers.class <- class[outliers] +outliers.Q <- matrix(0,nrow = no, ncol = K) +for (o in 1:no){ + for (k in 1:K){ + comk = class.names[k] + outliers.Q[o,k] <- mean(A[outliers[o],class == comk], na.rm = T) + } + print(paste0("node ", outliers[o], " is in class ", outliers.class[o])) + print(outliers.Q[o,], 2) + print("") +} + +## ----recover the classes------------------------------------------------------ +# Compute svd and run k-means +L.U <- svd(res$L[-outliers, -outliers], nu = 10)$u +est_class <- kmeans(L.U, 10, nstart = 100)$cluster +length(est_class) + +# Recover the class among the permutations of labels +equ_label = combinat::permn(1:10) +error <- rep(0, length(equ_label)) +for (i in 1:length(equ_label)){ + error[i] <- sum(class[-outliers] != (class.names[unlist(equ_label[i])])[est_class]) +} +error[which.min(error)] +class_est <- class.names[unlist(equ_label[which.min(error)])][est_class] +class_est_out <- rep(NA, n) +class_est_out[outliers] <- "outlier" +class_est_out[-outliers] <- class_est +class_est_out <- as.factor(class_est_out) + +# Plot network +plot(g, vertex.label = v.label, vertex.size = (2 + colSums(A, na.rm =T)/5), vertex.color = class_est_out, layout = lay, palette = pal, edge.width = 0.3, edge.curved=0.1) + +class_est_out <- as.factor(class_est_out) + + diff --git a/inst/doc/PrimarySchool.Rmd b/inst/doc/PrimarySchool.Rmd index 77e2bbf..b82e505 100644 --- a/inst/doc/PrimarySchool.Rmd +++ b/inst/doc/PrimarySchool.Rmd @@ -22,7 +22,124 @@ library(Matrix) library(gsbm) library(igraph) library(RColorBrewer) +library(combinat) data(PrimarySchool) -summary(PrimarySchool) +A <- PrimarySchool$A +class <- PrimarySchool$class +n <- dim(A)[1] +class.names <- levels(class) +class.effectifs <- table(class) ``` + +The duration of these interactions varies between $20$ seconds and two and a half hours. We consider that an physical interaction has actually if the corresponding contact time is greater than one minute. If an interaction of less than one minute is observed, we assume that this observation may be erroneous, and treat the corresponding data as missing. We thus obtain an $236 \times 236$ adjacency matrix with $7054$ missing entries (including $236$ diagonal entries), and $4980$ entries equal to $1$ (corresponding to $2490$ undirected edges). This graph presents strong communities structures, as pupils essentially interact with other pupils of the same class. This phenomenon is exemplified in the following table, which presents the frequency of interaction between individuals according to their respective class. + +```{r compute frequency of interaction} +# Compute frequency of interactions +K = length(class.names) +Q = matrix(rep(0, K*K), ncol = K, nrow = K) +for (k1 in 1:K){ + com1 = class.names[k1] + for (k2 in 1:K){ + com2 = class.names[k2] + Q[k1, k2] <- mean(A[class == com1, class == com2], na.rm = TRUE) + Q[k2, k1] <- Q[k1, k2] + } +} +Q <- round(Q, 2) +colnames(Q) <- class.names +rownames(Q) <- class.names +print(Q) +``` + +Then, we visualize the network. + +```{r Plot network} +# Vertex labels +v.label <- rep(NA, n) + +# Graph without NA's +A.noNA <- matrix(0, ncol = n, nrow = n) +A.noNA[!is.na(A)] <- A[!is.na(A)] +g <- graph_from_adjacency_matrix(A.noNA, mode = "undirected") + +# Layout +W<- matrix(0, n, n) +for (i in 1:n){ + for (j in 1:n){ + if ((!is.na(A[i,j])) && A[i,j]==1){ + if (class[i]==class[j] && class[i]!= "teachers"){ + W[i,j] <- 2 + } + else{ + W[i,j] <- 1 + } + } + } +} +gw <- graph_from_adjacency_matrix(W, mode = "undirected") +lay <- layout_with_fr(gw) +pal <- brewer.pal(11, "Paired") + +# Plot network +plot(g, vertex.label = v.label, vertex.size = (2 + colSums(A, na.rm =T)/5), vertex.color = class, layout = lay, palette = pal, edge.width = 0.3, edge.curved=0.1) + +legend("bottomleft", legend = class.names, col = pal, pch = 1, bty = 'n', cex = 0.5, ncol = 1) +``` + +The MCGD algorithm allows us to detect individuals with abnormal connectivities. When increasing $\lambda_2$, or when decreasing $\lambda_1$, we decrease the number of nodes that are identified as outliers. However, we note that the set of nodes detected as outliers is stable for parameters values around $(8.5,8)$. Moreover, those five nodes are identified as outliers significantly more often than the other nodes when the parameters vary. In the folloing, we therefore use the penalisation $(\lambda_1, \lambda_2) = (8.5,8)$. + +```{r run GSBM} +# Investigate these outliers +T1<-Sys.time() +res <- gsbm_mcgd(A, 8.5, 8, maxit = 100) +T2<-Sys.time() +Tdiff= difftime(T1, T2) +print(paste0("running time : ", Tdiff)) +outliers <- which(colSums(res$S)>0) +no <- length(outliers) +outliers.class <- class[outliers] +outliers.Q <- matrix(0,nrow = no, ncol = K) +for (o in 1:no){ + for (k in 1:K){ + comk = class.names[k] + outliers.Q[o,k] <- mean(A[outliers[o],class == comk], na.rm = T) + } + print(paste0("node ", outliers[o], " is in class ", outliers.class[o])) + print(outliers.Q[o,], 2) + print("") +} +``` + +We investigate the connectivity pattern of these nodes by comparing their connections with different groups to that of other children from their class. We note that children classified as outliers are overall well connected with children from other classes. In a epidemiologic context, they can be considered as super-propagators, who may spread a desease from one group to others. + +Finally, we show that one can recover the class structure from our estimator $\widehat{\mathbf{L}}$. Indeed, it is approximately of rank $10$, corresponding to the structure of $10$ classes present in the network (the teachers are densely connected with pupils from their class, and scarsely connected with one another, so their community cannot be recovered from the graph). We recover the classes by using a k-means algorithm on the rows of the matrix $\mathbf{U}$, where $\mathbf{U}$ is a matrix whose columns contain the $10$ left singular vectors of $\widehat{\mathbf{L}}$. + +```{r recover the classes} +# Compute svd and run k-means +L.U <- svd(res$L[-outliers, -outliers], nu = 10)$u +est_class <- kmeans(L.U, 10, nstart = 100)$cluster +length(est_class) + +# Recover the class among the permutations of labels +equ_label = combinat::permn(1:10) +error <- rep(0, length(equ_label)) +for (i in 1:length(equ_label)){ + error[i] <- sum(class[-outliers] != (class.names[unlist(equ_label[i])])[est_class]) +} +error[which.min(error)] +class_est <- class.names[unlist(equ_label[which.min(error)])][est_class] +class_est_out <- rep(NA, n) +class_est_out[outliers] <- "outlier" +class_est_out[-outliers] <- class_est +class_est_out <- as.factor(class_est_out) + +# Plot network +plot(g, vertex.label = v.label, vertex.size = (2 + colSums(A, na.rm =T)/5), vertex.color = class_est_out, layout = lay, palette = pal, edge.width = 0.3, edge.curved=0.1) + +class_est_out <- as.factor(class_est_out) + + +``` + +We observe that our method recovers perfectly the class of the children (but for the outliers, who are not classified). Although it cannot classify the teachers as such, we note that it provides a one-to-one mapping between teachers an classes, indicating that it is able to attribute each teacher to her class. \ No newline at end of file diff --git a/inst/doc/PrimarySchool.html b/inst/doc/PrimarySchool.html index de5ceb5..9845426 100644 --- a/inst/doc/PrimarySchool.html +++ b/inst/doc/PrimarySchool.html @@ -330,12 +330,143 @@

The network of contacts in a primary school

#> #> union library(RColorBrewer) - -data(PrimarySchool) -summary(PrimarySchool) -#> Length Class Mode -#> A 55696 -none- numeric -#> class 236 factor numeric +library(combinat) +#> +#> Attaching package: 'combinat' +#> The following object is masked from 'package:utils': +#> +#> combn + +data(PrimarySchool) +A <- PrimarySchool$A +class <- PrimarySchool$class +n <- dim(A)[1] +class.names <- levels(class) +class.effectifs <- table(class) +

The duration of these interactions varies between \(20\) seconds and two and a half hours. We consider that an physical interaction has actually if the corresponding contact time is greater than one minute. If an interaction of less than one minute is observed, we assume that this observation may be erroneous, and treat the corresponding data as missing. We thus obtain an \(236 \times 236\) adjacency matrix with \(7054\) missing entries (including \(236\) diagonal entries), and \(4980\) entries equal to \(1\) (corresponding to \(2490\) undirected edges). This graph presents strong communities structures, as pupils essentially interact with other pupils of the same class. This phenomenon is exemplified in the following table, which presents the frequency of interaction between individuals according to their respective class.

+
# Compute frequency of interactions
+K = length(class.names)
+Q = matrix(rep(0, K*K), ncol = K, nrow = K)
+for (k1 in 1:K){
+  com1 = class.names[k1]
+  for (k2 in 1:K){
+    com2 = class.names[k2]
+    Q[k1, k2] <- mean(A[class == com1, class == com2], na.rm = TRUE)
+    Q[k2, k1] <- Q[k1, k2]
+  }
+}
+Q <- round(Q, 2)
+colnames(Q) <- class.names
+rownames(Q) <- class.names
+print(Q)
+#>          ce1a ce1b ce2a ce2b cm1a cm1b cm2a cm2b  cpa  cpb teachers
+#> ce1a     0.68 0.07 0.07 0.01 0.00 0.00 0.00 0.00 0.03 0.03     0.10
+#> ce1b     0.07 0.83 0.02 0.00 0.01 0.00 0.00 0.00 0.05 0.08     0.11
+#> ce2a     0.07 0.02 0.90 0.21 0.00 0.02 0.00 0.01 0.03 0.04     0.10
+#> ce2b     0.01 0.00 0.21 0.94 0.05 0.03 0.00 0.03 0.02 0.00     0.13
+#> cm1a     0.00 0.01 0.00 0.05 0.97 0.08 0.02 0.07 0.00 0.01     0.09
+#> cm1b     0.00 0.00 0.02 0.03 0.08 0.88 0.01 0.03 0.02 0.01     0.05
+#> cm2a     0.00 0.00 0.00 0.00 0.02 0.01 0.88 0.29 0.00 0.00     0.08
+#> cm2b     0.00 0.00 0.01 0.03 0.07 0.03 0.29 0.93 0.00 0.00     0.08
+#> cpa      0.03 0.05 0.03 0.02 0.00 0.02 0.00 0.00 0.97 0.15     0.04
+#> cpb      0.03 0.08 0.04 0.00 0.01 0.01 0.00 0.00 0.15 0.97     0.09
+#> teachers 0.10 0.11 0.10 0.13 0.09 0.05 0.08 0.08 0.04 0.09     0.29
+

Then, we visualize the network.

+
# Vertex labels
+v.label <- rep(NA, n)
+
+# Graph without NA's
+A.noNA <- matrix(0, ncol = n, nrow = n)
+A.noNA[!is.na(A)] <- A[!is.na(A)]
+g <- graph_from_adjacency_matrix(A.noNA, mode = "undirected")
+
+# Layout
+W<- matrix(0, n, n)
+for (i in 1:n){
+  for (j in 1:n){
+      if ((!is.na(A[i,j])) && A[i,j]==1){
+        if (class[i]==class[j] && class[i]!= "teachers"){
+          W[i,j] <- 2
+        }
+        else{
+          W[i,j] <- 1
+        }
+      }
+  }
+}
+gw <- graph_from_adjacency_matrix(W, mode = "undirected")
+lay <- layout_with_fr(gw)
+pal <- brewer.pal(11, "Paired")
+
+# Plot network
+plot(g, vertex.label = v.label, vertex.size = (2 + colSums(A, na.rm =T)/5), vertex.color = class, layout = lay, palette = pal, edge.width = 0.3, edge.curved=0.1)
+
+legend("bottomleft", legend = class.names, col = pal, pch = 1, bty = 'n', cex = 0.5, ncol = 1)
+

+

The MCGD algorithm allows us to detect individuals with abnormal connectivities. When increasing \(\lambda_2\), or when decreasing \(\lambda_1\), we decrease the number of nodes that are identified as outliers. However, we note that the set of nodes detected as outliers is stable for parameters values around \((8.5,8)\). Moreover, those five nodes are identified as outliers significantly more often than the other nodes when the parameters vary. In the folloing, we therefore use the penalisation \((\lambda_1, \lambda_2) = (8.5,8)\).

+
# Investigate these outliers
+T1<-Sys.time()
+res <- gsbm_mcgd(A, 8.5, 8, maxit = 100)
+T2<-Sys.time()
+Tdiff= difftime(T1, T2)
+print(paste0("running time : ", Tdiff))
+#> [1] "running time : -1.65716505050659"
+outliers <- which(colSums(res$S)>0)
+no <- length(outliers)
+outliers.class <- class[outliers]
+outliers.Q <- matrix(0,nrow = no, ncol = K)
+for (o in 1:no){
+  for (k in 1:K){
+    comk = class.names[k]
+    outliers.Q[o,k] <- mean(A[outliers[o],class == comk], na.rm = T)
+  }
+  print(paste0("node ", outliers[o], " is in class ", outliers.class[o]))
+  print(outliers.Q[o,], 2)
+  print("")
+}
+#> [1] "node 15 is in class ce2b"
+#>  [1] 0.000 0.000 0.556 1.000 0.333 0.091 0.000 0.118 0.000 0.000 0.100
+#> [1] ""
+#> [1] "node 30 is in class ce1a"
+#>  [1] 0.929 0.300 0.227 0.077 0.000 0.000 0.048 0.000 0.125 0.150 0.111
+#> [1] ""
+#> [1] "node 35 is in class ce1a"
+#>  [1] 0.625 0.238 0.350 0.000 0.059 0.000 0.000 0.050 0.133 0.222 0.111
+#> [1] ""
+#> [1] "node 94 is in class ce1b"
+#>  [1] 0.105 0.955 0.056 0.000 0.095 0.000 0.000 0.000 0.222 0.375 0.100
+#> [1] ""
+#> [1] "node 180 is in class ce1a"
+#>  [1] 1.00 0.43 0.28 0.00 0.00 0.00 0.00 0.00 0.17 0.12 0.12
+#> [1] ""
+

We investigate the connectivity pattern of these nodes by comparing their connections with different groups to that of other children from their class. We note that children classified as outliers are overall well connected with children from other classes. In a epidemiologic context, they can be considered as super-propagators, who may spread a desease from one group to others.

+

Finally, we show that one can recover the class structure from our estimator \(\widehat{\mathbf{L}}\). Indeed, it is approximately of rank \(10\), corresponding to the structure of \(10\) classes present in the network (the teachers are densely connected with pupils from their class, and scarsely connected with one another, so their community cannot be recovered from the graph). We recover the classes by using a k-means algorithm on the rows of the matrix \(\mathbf{U}\), where \(\mathbf{U}\) is a matrix whose columns contain the \(10\) left singular vectors of \(\widehat{\mathbf{L}}\).

+
# Compute svd and run k-means
+L.U <- svd(res$L[-outliers, -outliers], nu = 10)$u 
+est_class <- kmeans(L.U, 10, nstart = 100)$cluster
+length(est_class)
+#> [1] 231
+
+# Recover the class among the permutations of labels
+equ_label = combinat::permn(1:10)
+error <- rep(0, length(equ_label))
+for (i in 1:length(equ_label)){
+  error[i] <- sum(class[-outliers] != (class.names[unlist(equ_label[i])])[est_class])
+}
+error[which.min(error)]
+#> [1] 10
+class_est <- class.names[unlist(equ_label[which.min(error)])][est_class]
+class_est_out <- rep(NA, n)
+class_est_out[outliers] <- "outlier"
+class_est_out[-outliers] <- class_est
+class_est_out <- as.factor(class_est_out)
+
+# Plot network
+plot(g, vertex.label = v.label, vertex.size = (2 + colSums(A, na.rm =T)/5), vertex.color = class_est_out, layout = lay, palette = pal, edge.width = 0.3, edge.curved=0.1)
+

+

+class_est_out <- as.factor(class_est_out)
+

We observe that our method recovers perfectly the class of the children (but for the outliers, who are not classified). Although it cannot classify the teachers as such, we note that it provides a one-to-one mapping between teachers an classes, indicating that it is able to attribute each teacher to her class.

diff --git a/inst/doc/les_miserables.R b/inst/doc/les_miserables.R index 21cbada..ddbe130 100644 --- a/inst/doc/les_miserables.R +++ b/inst/doc/les_miserables.R @@ -23,9 +23,8 @@ plot(net, vertex.label.cex = 0.4) ## ----classical_SBM------------------------------------------------------------ vBlocks <- 1:10 -collection_sbm <- missSBM::estimate(prepare_data(A), vBlocks = vBlocks, sampling = "node") -L_missSBM <- collection_sbm$bestModel$fittedSBM$connectProb -colo <- round(collection_sbm$bestModel$fittedSBM$blocks) +collection_sbm <- missSBM::estimateMissSBM(A, vBlocks, "node") +colo <- round(collection_sbm$bestModel$fittedSBM$probMemberships) colo <- sapply(1:nrow(A), function(i) which.max(colo[i,])) pal3 <- brewer.pal(10, "Set3") V(net)$color <- pal3[colo] diff --git a/inst/doc/les_miserables.Rmd b/inst/doc/les_miserables.Rmd index 719d6f1..cfcbfe4 100644 --- a/inst/doc/les_miserables.Rmd +++ b/inst/doc/les_miserables.Rmd @@ -41,9 +41,8 @@ We fit a classical SBM to the graph and represent the graph with nodes proportio ```{r classical_SBM} vBlocks <- 1:10 -collection_sbm <- missSBM::estimate(prepare_data(A), vBlocks = vBlocks, sampling = "node") -L_missSBM <- collection_sbm$bestModel$fittedSBM$connectProb -colo <- round(collection_sbm$bestModel$fittedSBM$blocks) +collection_sbm <- missSBM::estimateMissSBM(A, vBlocks, "node") +colo <- round(collection_sbm$bestModel$fittedSBM$probMemberships) colo <- sapply(1:nrow(A), function(i) which.max(colo[i,])) pal3 <- brewer.pal(10, "Set3") V(net)$color <- pal3[colo] diff --git a/inst/doc/les_miserables.html b/inst/doc/les_miserables.html index 6d6849b..36fd6ca 100644 --- a/inst/doc/les_miserables.html +++ b/inst/doc/les_miserables.html @@ -318,15 +318,16 @@

Les miserables

library(igraph)
 library(gsbm)
 library(missSBM)
-#> 
-#> Attaching package: 'missSBM'
-#> The following objects are masked from 'package:stats':
-#> 
-#>     simulate, smooth
-#> The following object is masked from 'package:base':
-#> 
-#>     sample
-library(RColorBrewer)
+#> Warning: package 'missSBM' was built under R version 3.6.2 +#> +#> Attaching package: 'missSBM' +#> The following objects are masked from 'package:stats': +#> +#> simulate, smooth +#> The following object is masked from 'package:base': +#> +#> sample +library(RColorBrewer)

Les Misérables character network

Les Misérables characters network, encoding interactions between characters of Victor Hugo’s novel, was first created by Donald Knuth as part of the Stanford Graph Base (https://people.sc.fsu.edu/~jburkardt/datasets/sgb/sgb.html). It contains 77 nodes corresponding to characters of the novel, and 254 vertices connecting two characters whenever they appear in the same chapter.

@@ -339,10 +340,10 @@

Les Misérables character network

deg <- degree(net, mode="all") V(net)$size <- deg plot(net, vertex.label.cex = 0.4)
-

+

We fit a classical SBM to the graph and represent the graph with nodes proportional to their degrees and colored by community assignment. The number of communities has been selected so as to minimize the ICL criterion.

vBlocks <- 1:10
-collection_sbm <- missSBM::estimate(prepare_data(A), vBlocks = vBlocks, sampling = "node")
+collection_sbm <- missSBM::estimateMissSBM(A, vBlocks, "node")
 #> 
 #> 
 #>  Adjusting Variational EM for Stochastic Block Model
@@ -369,15 +370,14 @@ 

Les Misérables character network

Performing VEM inference for model with 8 blocks. Performing VEM inference for model with 9 blocks. Performing VEM inference for model with 10 blocks. -L_missSBM <- collection_sbm$bestModel$fittedSBM$connectProb -colo <- round(collection_sbm$bestModel$fittedSBM$blocks) -colo <- sapply(1:nrow(A), function(i) which.max(colo[i,])) -pal3 <- brewer.pal(10, "Set3") -V(net)$color <- pal3[colo] -V(net)$label <- NA -V(net)$size <- deg -plot(net)
-

+colo <- round(collection_sbm$bestModel$fittedSBM$probMemberships) +colo <- sapply(1:nrow(A), function(i) which.max(colo[i,])) +pal3 <- brewer.pal(10, "Set3") +V(net)$color <- pal3[colo] +V(net)$label <- NA +V(net)$size <- deg +plot(net) +

We observe that the main character Jean Valjean is alone in his community, and one of the clusters groups important characters (Thénardier, Éponine, Javert).

The Generalized stochastic Block Model accounts for outlier profiles (hubs, mixed memberships). In this model, nodes are divided into two sets: the inliers which follow a classical SBM, and the outliers, for which we make no assumptions on the connectivity model. These two sets are unknown a priori and are learned automatically by our procedure. Below we represent the result of the clustering, with the detected outliers indicated in red. They correspond to hubs (large center node, Jean Valjean) and nodes with mixed memberships (e.g. smaller central nodes with connections to several clusters).

lambda1 <- 4
@@ -391,41 +391,41 @@ 

Les Misérables character network

com <- kmeans(pc, centers=4, nstart=50) com$cluster #> Napoleon MlleBaptistine MmeMagloire CountessDeLo -#> 4 4 4 4 +#> 3 3 3 3 #> Geborand Champtercier Cravatte Count -#> 4 4 4 4 +#> 3 3 3 3 #> OldMan Labarre Marguerite MmeDeR -#> 4 4 4 4 +#> 3 3 3 3 #> Isabeau Gervais Tholomyes Listolier -#> 4 4 1 1 +#> 3 3 4 4 #> Fameuil Blacheville Favourite Dahlia -#> 1 1 1 1 +#> 4 4 4 4 #> Zephine MmeThenardier Fauchelevent Bamatabois -#> 1 4 4 3 +#> 4 3 3 2 #> Perpetue Simplice Scaufflaire Woman1 -#> 4 4 4 4 +#> 3 3 3 3 #> Judge Champmathieu Brevet Chenildieu -#> 3 3 3 3 +#> 2 2 2 2 #> Cochepaille Pontmercy Boulatruelle Eponine -#> 3 4 4 4 +#> 2 3 3 3 #> Anzelma Woman2 MotherInnocent Gribier -#> 4 4 4 4 +#> 3 3 3 3 #> Jondrette MmeBurgon Gillenormand Magnon -#> 4 4 4 4 +#> 3 3 3 3 #> MlleGillenormand MmePontmercy MlleVaubois LtGillenormand -#> 4 4 4 4 +#> 3 3 3 3 #> BaronessT Mabeuf Enjolras Combeferre -#> 4 2 2 2 +#> 3 1 1 1 #> Prouvaire Feuilly Courfeyrac Bahorel -#> 2 2 2 2 +#> 1 1 1 1 #> Bossuet Joly Grantaire MotherPlutarch -#> 2 2 2 4 +#> 1 1 1 3 #> Gueulemer Babet Claquesous Montparnasse -#> 4 4 4 4 +#> 3 3 3 3 #> Toussaint Child1 Child2 Brujon -#> 4 4 4 4 +#> 3 3 3 3 #> MmeHucheloup -#> 2 +#> 1 colo2 <- 1:nrow(A) names(colo2) <- names comu <- com$cluster @@ -440,7 +440,7 @@

Les Misérables character network

V(net)$size <- deg E(net)$arrow.size <- 5 plot(net, vertex.label.dist=20)
-

+

diff --git a/man/gsbm_mcgd_parallel.Rd b/man/gsbm_mcgd_parallel.Rd new file mode 100644 index 0000000..10d9343 --- /dev/null +++ b/man/gsbm_mcgd_parallel.Rd @@ -0,0 +1,68 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gsbm_mcgd_parallel.R +\name{gsbm_mcgd_parallel} +\alias{gsbm_mcgd_parallel} +\title{Fit a Generalized Stochastic Block Model} +\usage{ +gsbm_mcgd_parallel( + A, + lambda1, + lambda2, + epsilon = 0.1, + maxit = 100, + step_L = 0.01, + step_S = 0.1, + trace.it = FALSE, + n_cores = detectCores(), + save = FALSE, + file = NULL +) +} +\arguments{ +\item{A}{nxn adjacency matrix} + +\item{lambda1}{regularization parameter for nuclear norm penalty (positive number)} + +\item{lambda2}{regularization parameter for 2,1-norm penalty (positive number)} + +\item{epsilon}{regularization parameter for the L2-norm penalty (positive number, if NULL, default method is applied)} + +\item{maxit}{maximum number of iterations (positive integer, if NULL, default method is applied)} + +\item{step_L}{step size for the gradient step of L parameter (positive number)} + +\item{step_S}{step size for the gradient step of S parameter (positive number)} + +\item{trace.it}{whether messages about convergence should be printed (boolean, if NULL, default is FALSE)} + +\item{n_cores}{number of cores to parallellize on (integer number, default is set with detectCores())} + +\item{save}{whether or not value of current estimates should be saved at each iteration (boolean)} + +\item{file}{if save is set to TRUE, name of the folder where current estimates should be saved (character string, file saved in file/L_iter.txt at iteration iter)} +} +\value{ +The estimate for the nxn matrix of probabilities of connections between nodes. It is +given as the sum of a low-rank nxn matrix L, corresponding to connections between inlier +nodes, and a column sparse nxn matrix S, corresponding to connections between outlier +nodes and the rest of the network. The matrices L and S are such that + +E(A) = L - diag(L) + S + S' + +where E(A) is the expectation of the adjacency matrix, diag(L) is a nxn diagonal +matrix with diagonal entries equal to those of L, and S' means S transposed. + +The return value is a list of components + \itemize{ + \item{\code{A}}{ the adjacency matrix.} + \item{\code{L}}{ estimate for the low-rank component.} + \item{\code{S}}{ estimate for the column-sparse component.} + \item{\code{objective}}{ the value of the objective function.} + \item{\code{R}}{ a bound on the nuclear norm of the low-rank component.} + \item{\code{iter}}{ number of iterations between convergence of the objective function.} + } +} +\description{ +Given an adjacency matrix with missing observations, the function \code{gsbm_mgcd} +robustly estimates the probabilities of connections between nodes. +} diff --git a/vignettes/PrimarySchool.Rmd b/vignettes/PrimarySchool.Rmd index 77e2bbf..b82e505 100644 --- a/vignettes/PrimarySchool.Rmd +++ b/vignettes/PrimarySchool.Rmd @@ -22,7 +22,124 @@ library(Matrix) library(gsbm) library(igraph) library(RColorBrewer) +library(combinat) data(PrimarySchool) -summary(PrimarySchool) +A <- PrimarySchool$A +class <- PrimarySchool$class +n <- dim(A)[1] +class.names <- levels(class) +class.effectifs <- table(class) ``` + +The duration of these interactions varies between $20$ seconds and two and a half hours. We consider that an physical interaction has actually if the corresponding contact time is greater than one minute. If an interaction of less than one minute is observed, we assume that this observation may be erroneous, and treat the corresponding data as missing. We thus obtain an $236 \times 236$ adjacency matrix with $7054$ missing entries (including $236$ diagonal entries), and $4980$ entries equal to $1$ (corresponding to $2490$ undirected edges). This graph presents strong communities structures, as pupils essentially interact with other pupils of the same class. This phenomenon is exemplified in the following table, which presents the frequency of interaction between individuals according to their respective class. + +```{r compute frequency of interaction} +# Compute frequency of interactions +K = length(class.names) +Q = matrix(rep(0, K*K), ncol = K, nrow = K) +for (k1 in 1:K){ + com1 = class.names[k1] + for (k2 in 1:K){ + com2 = class.names[k2] + Q[k1, k2] <- mean(A[class == com1, class == com2], na.rm = TRUE) + Q[k2, k1] <- Q[k1, k2] + } +} +Q <- round(Q, 2) +colnames(Q) <- class.names +rownames(Q) <- class.names +print(Q) +``` + +Then, we visualize the network. + +```{r Plot network} +# Vertex labels +v.label <- rep(NA, n) + +# Graph without NA's +A.noNA <- matrix(0, ncol = n, nrow = n) +A.noNA[!is.na(A)] <- A[!is.na(A)] +g <- graph_from_adjacency_matrix(A.noNA, mode = "undirected") + +# Layout +W<- matrix(0, n, n) +for (i in 1:n){ + for (j in 1:n){ + if ((!is.na(A[i,j])) && A[i,j]==1){ + if (class[i]==class[j] && class[i]!= "teachers"){ + W[i,j] <- 2 + } + else{ + W[i,j] <- 1 + } + } + } +} +gw <- graph_from_adjacency_matrix(W, mode = "undirected") +lay <- layout_with_fr(gw) +pal <- brewer.pal(11, "Paired") + +# Plot network +plot(g, vertex.label = v.label, vertex.size = (2 + colSums(A, na.rm =T)/5), vertex.color = class, layout = lay, palette = pal, edge.width = 0.3, edge.curved=0.1) + +legend("bottomleft", legend = class.names, col = pal, pch = 1, bty = 'n', cex = 0.5, ncol = 1) +``` + +The MCGD algorithm allows us to detect individuals with abnormal connectivities. When increasing $\lambda_2$, or when decreasing $\lambda_1$, we decrease the number of nodes that are identified as outliers. However, we note that the set of nodes detected as outliers is stable for parameters values around $(8.5,8)$. Moreover, those five nodes are identified as outliers significantly more often than the other nodes when the parameters vary. In the folloing, we therefore use the penalisation $(\lambda_1, \lambda_2) = (8.5,8)$. + +```{r run GSBM} +# Investigate these outliers +T1<-Sys.time() +res <- gsbm_mcgd(A, 8.5, 8, maxit = 100) +T2<-Sys.time() +Tdiff= difftime(T1, T2) +print(paste0("running time : ", Tdiff)) +outliers <- which(colSums(res$S)>0) +no <- length(outliers) +outliers.class <- class[outliers] +outliers.Q <- matrix(0,nrow = no, ncol = K) +for (o in 1:no){ + for (k in 1:K){ + comk = class.names[k] + outliers.Q[o,k] <- mean(A[outliers[o],class == comk], na.rm = T) + } + print(paste0("node ", outliers[o], " is in class ", outliers.class[o])) + print(outliers.Q[o,], 2) + print("") +} +``` + +We investigate the connectivity pattern of these nodes by comparing their connections with different groups to that of other children from their class. We note that children classified as outliers are overall well connected with children from other classes. In a epidemiologic context, they can be considered as super-propagators, who may spread a desease from one group to others. + +Finally, we show that one can recover the class structure from our estimator $\widehat{\mathbf{L}}$. Indeed, it is approximately of rank $10$, corresponding to the structure of $10$ classes present in the network (the teachers are densely connected with pupils from their class, and scarsely connected with one another, so their community cannot be recovered from the graph). We recover the classes by using a k-means algorithm on the rows of the matrix $\mathbf{U}$, where $\mathbf{U}$ is a matrix whose columns contain the $10$ left singular vectors of $\widehat{\mathbf{L}}$. + +```{r recover the classes} +# Compute svd and run k-means +L.U <- svd(res$L[-outliers, -outliers], nu = 10)$u +est_class <- kmeans(L.U, 10, nstart = 100)$cluster +length(est_class) + +# Recover the class among the permutations of labels +equ_label = combinat::permn(1:10) +error <- rep(0, length(equ_label)) +for (i in 1:length(equ_label)){ + error[i] <- sum(class[-outliers] != (class.names[unlist(equ_label[i])])[est_class]) +} +error[which.min(error)] +class_est <- class.names[unlist(equ_label[which.min(error)])][est_class] +class_est_out <- rep(NA, n) +class_est_out[outliers] <- "outlier" +class_est_out[-outliers] <- class_est +class_est_out <- as.factor(class_est_out) + +# Plot network +plot(g, vertex.label = v.label, vertex.size = (2 + colSums(A, na.rm =T)/5), vertex.color = class_est_out, layout = lay, palette = pal, edge.width = 0.3, edge.curved=0.1) + +class_est_out <- as.factor(class_est_out) + + +``` + +We observe that our method recovers perfectly the class of the children (but for the outliers, who are not classified). Although it cannot classify the teachers as such, we note that it provides a one-to-one mapping between teachers an classes, indicating that it is able to attribute each teacher to her class. \ No newline at end of file diff --git a/vignettes/les_miserables.Rmd b/vignettes/les_miserables.Rmd index 719d6f1..cfcbfe4 100644 --- a/vignettes/les_miserables.Rmd +++ b/vignettes/les_miserables.Rmd @@ -41,9 +41,8 @@ We fit a classical SBM to the graph and represent the graph with nodes proportio ```{r classical_SBM} vBlocks <- 1:10 -collection_sbm <- missSBM::estimate(prepare_data(A), vBlocks = vBlocks, sampling = "node") -L_missSBM <- collection_sbm$bestModel$fittedSBM$connectProb -colo <- round(collection_sbm$bestModel$fittedSBM$blocks) +collection_sbm <- missSBM::estimateMissSBM(A, vBlocks, "node") +colo <- round(collection_sbm$bestModel$fittedSBM$probMemberships) colo <- sapply(1:nrow(A), function(i) which.max(colo[i,])) pal3 <- brewer.pal(10, "Set3") V(net)$color <- pal3[colo]