version 1.0

cran · Nov 23, 2020 · 3d54f69 · 3d54f69
commit 3d54f69
Show file tree

Hide file tree

Showing 36 changed files with 1,984 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,23 @@
+Package: M2SMJF
+Title: Multi-Modal Similarity Matrix Joint Factorization
+Version: 1.0
+Authors@R: person("Xiaoyao", "Yin", email = "yinxy1992@sina.com", role = c("aut", "cre"))
+Description: A new method to implement clustering from multiple modality data of certain samples, 
+    the function M2SMjF() jointly factorizes multiple similarity matrices into a shared sub-matrix 
+    and several modality private sub-matrices, which is further used for clustering. Along with 
+    this method, we also provide function to calculate the similarity matrix and function to 
+    evaluate the best cluster number from the original data.
+Imports: dplyr, MASS, stats
+Depends: R (>= 3.4.0)
+License: GPL (>= 2)
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 7.1.1
+Suggests: knitr, rmarkdown
+VignetteBuilder: knitr
+NeedsCompilation: no
+Packaged: 2020-11-20 00:49:57 UTC; DELL
+Author: Xiaoyao Yin [aut, cre]
+Maintainer: Xiaoyao Yin <yinxy1992@sina.com>
+Repository: CRAN
+Date/Publication: 2020-11-23 08:40:06 UTC
diff --git a/MD5 b/MD5
@@ -0,0 +1,35 @@
+0c144947c22cb86d28a3e8019dfad1a8 *DESCRIPTION
+321dda5c58fc307aa9c82250e286269f *NAMESPACE
+8d8239d90fee1f0fdeff0d9dd7f3dd9a *R/Cal_NMI.R
+0f4d1e0f64648cf3560b5548e7a1fa8d *R/M2SMJF.R
+460ebd9b20101bdf93eddbfad7abb0d4 *R/Standard_Normalization.R
+48e45a970f0497430edd675c2ee7b3c5 *R/affinityMatrix.R
+cd0867a38b2f9591c471fd90dc7099b8 *R/cost.R
+2d6f15baa103c9f74d0f5dad0132f967 *R/dist2bin.R
+6a67c51711542a9ad6847739ccce0dab *R/dist2chi.R
+b10b4def4b5d466c0d356e4d3d4da943 *R/dist2eu.R
+ff51d39ce5eb9db7291961e72f2cc854 *R/initialization.R
+897a30e8b26b5dbb79236da735986176 *R/initialize_WL.R
+89664a65d91574f54eb15457e2c04144 *R/new_modularity.R
+1dbd5ddc2361ea82f336e34843564ddd *R/simu_data_gen.R
+1da7d8ca2b21337c375db9cdc932b358 *R/update_L.R
+8bb0a4aa64915d9d273812b7beb1c9c9 *R/update_alpha.R
+575a3975ffd3213ca4cbfc8c95e28c32 *build/vignette.rds
+3238ab2475e4eda0c7f0d9e3cd406daf *inst/doc/introduction.R
+8caf5d73bd739b0bbea1dd5da88421ac *inst/doc/introduction.Rmd
+ed87cb375c2a87afcd0e0ed198269e7f *inst/doc/introduction.html
+0ea9ff08b4dba6aaf12d555005691c19 *man/Cal_NMI.Rd
+c3eeb7120f0ba916c83f3232d65cf204 *man/M2SMJF.Rd
+7b3b5e7c8539b89951c63b8b58f2330f *man/Standard_Normalization.Rd
+114dc3134be0eef025fda4254db983bd *man/affinityMatrix.Rd
+edd535b7b028ad42e3cbfccd713d6d49 *man/cost.Rd
+081abdcd3ca4869da5565779db1fc0ca *man/dist2bin.Rd
+88910a2ab9c51553d55b3904c23e8b86 *man/dist2chi.Rd
+aa52dc00dd69beb490748c1d70a96ecf *man/dist2eu.Rd
+9b23641ab445a83d3749fc768cf26877 *man/initialization.Rd
+717c946279064b8237116e216d39bd0d *man/initialize_WL.Rd
+cb900189b8823ac0c286f2c4ea920b95 *man/new_modularity.Rd
+961dbfb0b6b602c1d1a23467de548843 *man/simu_data_gen.Rd
+4ddf67926edb14c885c072bc14b4787e *man/update_L.Rd
+9a9f0feb15b5c93d790bf6e07dd1ec68 *man/update_alpha.Rd
+8caf5d73bd739b0bbea1dd5da88421ac *vignettes/introduction.Rmd
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,20 @@
+# Generated by roxygen2: do not edit by hand
+
+export(Cal_NMI)
+export(M2SMJF)
+export(Standard_Normalization)
+export(affinityMatrix)
+export(cost)
+export(dist2bin)
+export(dist2chi)
+export(dist2eu)
+export(initialization)
+export(initialize_WL)
+export(new_modularity)
+export(simu_data_gen)
+export(update_L)
+export(update_alpha)
+import(dplyr)
+importFrom(MASS,ginv)
+importFrom(stats,kmeans)
+importFrom(stats,rnorm)
diff --git a/R/Cal_NMI.R b/R/Cal_NMI.R
@@ -0,0 +1,58 @@
+#' calculate the normalized mutual information.
+#' 
+#' calculate the normalized mutual information of two vectors x and y.
+#' 
+#' @param x A vector 
+#' 
+#' @param y A vector as long as x
+#' 
+#' @return A number between 0 and 1 indicating the normalized mutual information
+#' @author Xiaoyao Yin
+#' @examples 
+#' 
+#' x <- c(0.1,0.2,0.3,0.4)
+#' y <- c(0.1,0.2,0.3,0.4)
+#' NMI <- Cal_NMI(x,y)
+#' @rdname Cal_NMI
+#' @export
+Cal_NMI <- function(x, y) {
+	x = as.vector(x)
+	y = as.vector(y)
+    return(max(0, mutualInformation(x, y)/sqrt(entropy(x) * entropy(y)), na.rm=TRUE))
+}
+
+# Calculate the mutual information between vectors x and y.
+mutualInformation <- function(x, y) {
+  classx <- unique(x)
+  classy <- unique(y)
+  nx <- length(x)
+  ncx <- length(classx)
+  ncy <- length(classy)
+
+  probxy <- matrix(NA, ncx, ncy)
+  for (i in 1:ncx) {
+    for (j in 1:ncy) {
+      probxy[i, j] <- sum((x == classx[i]) & (y == classy[j])) / nx
+    }
+  }
+
+  probx <- matrix(rowSums(probxy), ncx, ncy)
+  proby <- matrix(colSums(probxy), ncx, ncy, byrow=TRUE)
+  result <- sum(probxy * log(probxy / (probx * proby), 2), na.rm=TRUE)
+  return(result)
+}
+
+# Calculate the entropy of vector x.
+entropy <- function(x) {
+  class <- unique(x)
+  nx <- length(x)
+  nc <- length(class)
+
+  prob <- rep.int(NA, nc)
+  for (i in 1:nc) {
+    prob[i] <- sum(x == class[i])/nx
+  }
+
+  result <- -sum(prob * log(prob, 2))
+  return(result)
+}
diff --git a/R/M2SMJF.R b/R/M2SMJF.R
@@ -0,0 +1,62 @@
+#' the main part for M2SMJF and clustering result
+#'
+#' jointly factorize multiple matrices into a shared sub-matrix and multiple private sub-matrices
+#'
+#' @param WL A list of similarity matrices
+#'
+#' @param lambda A parameter to set the relative weight of the group sparsity constraints
+#'
+#' @param theta A parameter to determine the convergence
+#'
+#' @param k A parameter to specify the cluster number
+#' @return A list containing the clustering result
+#' \item{sub_matrices}{a list containing all the sub-matrices}
+#' \item{cluster_res}{the clustering result which is as long as the number of samples}
+#' @author Xiaoyao Yin
+#' @examples
+#'
+#' WL <- simu_data_gen()
+#' res <- M2SMJF(WL,0.25,10^-4,5)
+#' @rdname M2SMJF
+#' @export
+#' @importFrom MASS ginv
+#' @import dplyr
+#' @importFrom stats kmeans rnorm
+M2SMJF <- function(WL,lambda=0.25,theta=10^-4,k){
+  if (!is.list(WL))
+  {
+    stop('Error:please provide a data list by WL')
+  }
+  if (is.na(k))
+  {
+    stop('Error:please provide a cluster number by k')
+  }
+  N <- length(WL)  ##length of the similarity matrices list
+  new_WL_list <- initialize_WL(WL)  ## factorize the each of the similairty matrix Si into Ci*t(Ci)
+  init_list <- initialization(new_WL_list,k)  ## initialize the sub-matrix Ci into alpha*Li
+  ### minimize the objective function until convergence
+  divergence <- 1 ##initialize the divergence
+  iter_num <- 1  ##initialize the iteration number
+  old_cost <- 0   ##initialize the cost
+  while (divergence>theta)  ##while not convergent
+  {
+    update_L_list <- update_L(new_WL_list,init_list)  ##update all the L with update_L
+    update_alpha_list <- update_alpha(new_WL_list,update_L_list,lambda)  ##update all the alpha with update_alpha
+    init_list <- update_alpha_list
+    new_cost <- cost(new_WL_list,init_list,lambda)  ## calculate the cost of objective function
+    divergence <- abs((new_cost-old_cost)/new_cost)  ## calculate the relative variation of cost
+    old_cost <- new_cost
+    iter_num <- iter_num+1
+  }
+  alpha <- init_list[[(N+1)]]  ## sub_matrix alpha is the last one in init_list
+  clu_res <- kmeans(alpha,k,1000,20)  ## clustering the samples by kmeans on the sub_matrix alpha, which is a conventional method in matrix factorization based clustering.
+  cluster_res <- clu_res$cluster  ## get the clustering result, the ith element indicates the cluster of the ith sample
+  result <- vector("list",2)
+  names(result) <- c("sub_matrices","cluster_res")
+  sub_result <- init_list
+  names(sub_result) <- c(paste("sub_matrix_L",c(1:N),sep='_'),'alpha')
+  result[[1]] <- sub_result
+  result[[2]] <- cluster_res
+  return(result)
+}
+
diff --git a/R/Standard_Normalization.R b/R/Standard_Normalization.R
@@ -0,0 +1,23 @@
+#' Normalize the input matrix by column
+#' 
+#' Normalize each column of x to have mean 0 and standard deviation 1.
+#' 
+#' @param x A sample-feature matrix with rows as samples and columns as features
+#' 
+#' @return A sample-feature matrix with rows as samples and columns as features,each column of the matrix have mean 0 and standard deviation 1
+#' @author Xiaoyao Yin
+#' @examples 
+#' 
+#' data_list <- simu_data_gen()
+#' x <- data_list[[1]]
+#' data_matrix <- Standard_Normalization(x)
+#' @rdname Standard_Normalization
+#' @export
+Standard_Normalization = function(x) {
+  x = as.matrix(x);
+  mean = apply(x, 2, mean)
+  sd = apply(x, 2, sd)
+  sd[sd==0] = 1
+  xNorm = t((t(x) - mean) / sd)
+  return(xNorm)
+}
diff --git a/R/affinityMatrix.R b/R/affinityMatrix.R
@@ -0,0 +1,46 @@
+#' To calculate the similarity matrix
+#' 
+#' calculate the affinity matrix from the diff matrix with 20 neighbors
+#' 
+#' @param Diff A diff matrix
+#' 
+#' @param K The number of neighbors in consideration
+#' 
+#' @param sigma A parameter to determine the scale
+#' 
+#' @return W The similarity matrix
+#' @author Xiaoyao Yin
+#' @examples 
+#' 
+#' data_list <- simu_data_gen()
+#' Diff <- dist2eu(Standard_Normalization(data_list[[1]]),Standard_Normalization(data_list[[1]]))
+#' simi <- affinityMatrix(Diff,20,0.5)
+#' @rdname affinityMatrix
+#' @export
+affinityMatrix <- function(Diff,K=20,sigma=0.5) {
+  N = nrow(Diff)
+
+  Diff = (Diff + t(Diff)) / 2
+  diag(Diff) = 0;
+  sortedColumns = as.matrix(t(apply(Diff,2,sort)))
+  finiteMean <- function(x) { mean(x[is.finite(x)]) }
+  means = apply(sortedColumns[,1:K+1],1,finiteMean)+.Machine$double.eps;
+
+  avg <- function(x,y) (x+y)
+  Sig = outer(means,means,avg)/3 + Diff/3 + .Machine$double.eps;
+  Sig[Sig <= .Machine$double.eps] = .Machine$double.eps
+  #densities = dnorm(Diff,0,sigma*Sig,log = FALSE)
+  densities <- exp(-Diff/(sigma*Sig))
+
+  normalize <- function(X)
+  {
+    D <- apply(X,1,sum)
+    Y <- diag(1/sqrt(D))
+    Z <- Y%*%X%*%Y
+    return (Z)
+  }
+  W = normalize(densities)
+  return(W)
+  # 
+   # return(densities)
+}
diff --git a/R/cost.R b/R/cost.R
@@ -0,0 +1,40 @@
+#' Calculate the cost
+#' 
+#' A function to calculate the cost of the objective function
+#' 
+#' @param new_WL_list A list of matrices factorized from the similarity matrices list WL 
+#' 
+#' @param init_list A list containing the updated result in this iteration
+#' 
+#' @param lambda A parameter to set the relative weight of the group sparsity constraints
+#' 
+#' @return A number indicating the total cost of the objective function
+#' @author Xiaoyao Yin
+#' @examples 
+#' 
+#' WL <- simu_data_gen()
+#' WL[[1]] <- affinityMatrix(dist2eu(Standard_Normalization(WL[[1]]),Standard_Normalization(WL[[1]])))
+#' WL[[2]] <- affinityMatrix(dist2eu(Standard_Normalization(WL[[2]]),Standard_Normalization(WL[[2]])))
+#' new_WL_list <- initialize_WL(WL)
+#' k <- 5
+#' lambda <- 0.25
+#' init_list <- initialization(new_WL_list,k)
+#' update_L_list <- update_L(new_WL_list,init_list)
+#' update_alpha_list <- update_alpha(new_WL_list,update_L_list,lambda)
+#' init_list <- update_alpha_list
+#' new_loss <- cost(new_WL_list,init_list,lambda)
+#' @rdname cost
+#' @export
+cost <- function(new_WL_list,init_list,lambda)
+{
+  res <- 0
+  NN <- length(new_WL_list)
+  alpha <- init_list[[(NN+1)]]
+  for (i in 1:NN)
+  {
+    matr_loss <- new_WL_list[[i]]-alpha%*%init_list[[i]]
+    res <- res+(norm(matr_loss,"F")^2)/2
+  }
+  res <- res+lambda*init_list[[(NN+2)]]
+  return(res)
+}
diff --git a/R/dist2bin.R b/R/dist2bin.R
@@ -0,0 +1,32 @@
+#' Calculate the agreement-based measurement
+#' 
+#' Calculate the agreement-based measurement of two any pair-wise samples x_i and x_j for binary variables
+#' 
+#' @param X A sample-feature matrix with rows as samples and columns as features
+#' 
+#' @param C The same as X
+#' 
+#' @return A matrix whose elements at (i,j) is the agreement-based measurement of two any pair-wise samples x_i and x_j
+#' @author Xiaoyao Yin
+#' @examples 
+#' 
+#' data_list <- simu_data_gen()
+#' X <- data_list[[1]]
+#' C <- X
+#' Diff <- dist2bin(X,C)
+#' @rdname dist2bin
+#' @export
+dist2bin <- function(X,C) {
+  ndata = nrow(X)
+  ncentres = nrow(C)
+
+  res = matrix(0,ndata,ncentres)
+  for (i in 1:ndata) 
+  {
+  	for (j in 1:ncentres) 
+  	{
+  		res[i,j] = sum(X[i,]!=C[j,])
+  	}
+  }
+  return(res)
+}
diff --git a/R/dist2chi.R b/R/dist2chi.R
@@ -0,0 +1,34 @@
+#' Calculate the chi-squared distance
+#' 
+#' Calculate the chi-squared distance of two any pair-wise samples x_i and x_j for discrete variables
+#' 
+#' @param X A sample-feature matrix with rows as samples and columns as features
+#' 
+#' @param C The same as X
+#' 
+#' @return A matrix whose elements at (i,j) is the chi-squared distance of two any pair-wise samples x_i and x_j
+#' @author Xiaoyao Yin
+#' @examples 
+#' 
+#' data_list <- simu_data_gen()
+#' X <- data_list[[1]]
+#' C <- X
+#' Diff <- dist2chi(X,C)
+#' @rdname dist2chi
+#' @export
+dist2chi <- function(X,C) {
+  ndata = nrow(X)
+  ncentres = nrow(C)
+  meanX = apply(X,2,mean)
+  meanC = apply(C,2,mean)
+
+  res = matrix(0,ndata,ncentres)
+  for (i in 1:ndata) 
+  {
+  	for (j in 1:ncentres) 
+  	{
+      res[i,j] = sum((X[i,]-C[j,])^2/(X[i,]+C[j,]))/2
+  	}
+  }
+  return(res)
+}