Skip to content

Commit

Permalink
version 1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
yinxy1992 authored and cran-robot committed Nov 23, 2020
0 parents commit 3d54f69
Show file tree
Hide file tree
Showing 36 changed files with 1,984 additions and 0 deletions.
23 changes: 23 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
Package: M2SMJF
Title: Multi-Modal Similarity Matrix Joint Factorization
Version: 1.0
Authors@R: person("Xiaoyao", "Yin", email = "yinxy1992@sina.com", role = c("aut", "cre"))
Description: A new method to implement clustering from multiple modality data of certain samples,
the function M2SMjF() jointly factorizes multiple similarity matrices into a shared sub-matrix
and several modality private sub-matrices, which is further used for clustering. Along with
this method, we also provide function to calculate the similarity matrix and function to
evaluate the best cluster number from the original data.
Imports: dplyr, MASS, stats
Depends: R (>= 3.4.0)
License: GPL (>= 2)
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.1.1
Suggests: knitr, rmarkdown
VignetteBuilder: knitr
NeedsCompilation: no
Packaged: 2020-11-20 00:49:57 UTC; DELL
Author: Xiaoyao Yin [aut, cre]
Maintainer: Xiaoyao Yin <yinxy1992@sina.com>
Repository: CRAN
Date/Publication: 2020-11-23 08:40:06 UTC
35 changes: 35 additions & 0 deletions MD5
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
0c144947c22cb86d28a3e8019dfad1a8 *DESCRIPTION
321dda5c58fc307aa9c82250e286269f *NAMESPACE
8d8239d90fee1f0fdeff0d9dd7f3dd9a *R/Cal_NMI.R
0f4d1e0f64648cf3560b5548e7a1fa8d *R/M2SMJF.R
460ebd9b20101bdf93eddbfad7abb0d4 *R/Standard_Normalization.R
48e45a970f0497430edd675c2ee7b3c5 *R/affinityMatrix.R
cd0867a38b2f9591c471fd90dc7099b8 *R/cost.R
2d6f15baa103c9f74d0f5dad0132f967 *R/dist2bin.R
6a67c51711542a9ad6847739ccce0dab *R/dist2chi.R
b10b4def4b5d466c0d356e4d3d4da943 *R/dist2eu.R
ff51d39ce5eb9db7291961e72f2cc854 *R/initialization.R
897a30e8b26b5dbb79236da735986176 *R/initialize_WL.R
89664a65d91574f54eb15457e2c04144 *R/new_modularity.R
1dbd5ddc2361ea82f336e34843564ddd *R/simu_data_gen.R
1da7d8ca2b21337c375db9cdc932b358 *R/update_L.R
8bb0a4aa64915d9d273812b7beb1c9c9 *R/update_alpha.R
575a3975ffd3213ca4cbfc8c95e28c32 *build/vignette.rds
3238ab2475e4eda0c7f0d9e3cd406daf *inst/doc/introduction.R
8caf5d73bd739b0bbea1dd5da88421ac *inst/doc/introduction.Rmd
ed87cb375c2a87afcd0e0ed198269e7f *inst/doc/introduction.html
0ea9ff08b4dba6aaf12d555005691c19 *man/Cal_NMI.Rd
c3eeb7120f0ba916c83f3232d65cf204 *man/M2SMJF.Rd
7b3b5e7c8539b89951c63b8b58f2330f *man/Standard_Normalization.Rd
114dc3134be0eef025fda4254db983bd *man/affinityMatrix.Rd
edd535b7b028ad42e3cbfccd713d6d49 *man/cost.Rd
081abdcd3ca4869da5565779db1fc0ca *man/dist2bin.Rd
88910a2ab9c51553d55b3904c23e8b86 *man/dist2chi.Rd
aa52dc00dd69beb490748c1d70a96ecf *man/dist2eu.Rd
9b23641ab445a83d3749fc768cf26877 *man/initialization.Rd
717c946279064b8237116e216d39bd0d *man/initialize_WL.Rd
cb900189b8823ac0c286f2c4ea920b95 *man/new_modularity.Rd
961dbfb0b6b602c1d1a23467de548843 *man/simu_data_gen.Rd
4ddf67926edb14c885c072bc14b4787e *man/update_L.Rd
9a9f0feb15b5c93d790bf6e07dd1ec68 *man/update_alpha.Rd
8caf5d73bd739b0bbea1dd5da88421ac *vignettes/introduction.Rmd
20 changes: 20 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Generated by roxygen2: do not edit by hand

export(Cal_NMI)
export(M2SMJF)
export(Standard_Normalization)
export(affinityMatrix)
export(cost)
export(dist2bin)
export(dist2chi)
export(dist2eu)
export(initialization)
export(initialize_WL)
export(new_modularity)
export(simu_data_gen)
export(update_L)
export(update_alpha)
import(dplyr)
importFrom(MASS,ginv)
importFrom(stats,kmeans)
importFrom(stats,rnorm)
58 changes: 58 additions & 0 deletions R/Cal_NMI.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#' calculate the normalized mutual information.
#'
#' calculate the normalized mutual information of two vectors x and y.
#'
#' @param x A vector
#'
#' @param y A vector as long as x
#'
#' @return A number between 0 and 1 indicating the normalized mutual information
#' @author Xiaoyao Yin
#' @examples
#'
#' x <- c(0.1,0.2,0.3,0.4)
#' y <- c(0.1,0.2,0.3,0.4)
#' NMI <- Cal_NMI(x,y)
#' @rdname Cal_NMI
#' @export
Cal_NMI <- function(x, y) {
x = as.vector(x)
y = as.vector(y)
return(max(0, mutualInformation(x, y)/sqrt(entropy(x) * entropy(y)), na.rm=TRUE))
}

# Calculate the mutual information between vectors x and y.
mutualInformation <- function(x, y) {
classx <- unique(x)
classy <- unique(y)
nx <- length(x)
ncx <- length(classx)
ncy <- length(classy)

probxy <- matrix(NA, ncx, ncy)
for (i in 1:ncx) {
for (j in 1:ncy) {
probxy[i, j] <- sum((x == classx[i]) & (y == classy[j])) / nx
}
}

probx <- matrix(rowSums(probxy), ncx, ncy)
proby <- matrix(colSums(probxy), ncx, ncy, byrow=TRUE)
result <- sum(probxy * log(probxy / (probx * proby), 2), na.rm=TRUE)
return(result)
}

# Calculate the entropy of vector x.
entropy <- function(x) {
class <- unique(x)
nx <- length(x)
nc <- length(class)

prob <- rep.int(NA, nc)
for (i in 1:nc) {
prob[i] <- sum(x == class[i])/nx
}

result <- -sum(prob * log(prob, 2))
return(result)
}
62 changes: 62 additions & 0 deletions R/M2SMJF.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#' the main part for M2SMJF and clustering result
#'
#' jointly factorize multiple matrices into a shared sub-matrix and multiple private sub-matrices
#'
#' @param WL A list of similarity matrices
#'
#' @param lambda A parameter to set the relative weight of the group sparsity constraints
#'
#' @param theta A parameter to determine the convergence
#'
#' @param k A parameter to specify the cluster number
#' @return A list containing the clustering result
#' \item{sub_matrices}{a list containing all the sub-matrices}
#' \item{cluster_res}{the clustering result which is as long as the number of samples}
#' @author Xiaoyao Yin
#' @examples
#'
#' WL <- simu_data_gen()
#' res <- M2SMJF(WL,0.25,10^-4,5)
#' @rdname M2SMJF
#' @export
#' @importFrom MASS ginv
#' @import dplyr
#' @importFrom stats kmeans rnorm
M2SMJF <- function(WL,lambda=0.25,theta=10^-4,k){
if (!is.list(WL))
{
stop('Error:please provide a data list by WL')
}
if (is.na(k))
{
stop('Error:please provide a cluster number by k')
}
N <- length(WL) ##length of the similarity matrices list
new_WL_list <- initialize_WL(WL) ## factorize the each of the similairty matrix Si into Ci*t(Ci)
init_list <- initialization(new_WL_list,k) ## initialize the sub-matrix Ci into alpha*Li
### minimize the objective function until convergence
divergence <- 1 ##initialize the divergence
iter_num <- 1 ##initialize the iteration number
old_cost <- 0 ##initialize the cost
while (divergence>theta) ##while not convergent
{
update_L_list <- update_L(new_WL_list,init_list) ##update all the L with update_L
update_alpha_list <- update_alpha(new_WL_list,update_L_list,lambda) ##update all the alpha with update_alpha
init_list <- update_alpha_list
new_cost <- cost(new_WL_list,init_list,lambda) ## calculate the cost of objective function
divergence <- abs((new_cost-old_cost)/new_cost) ## calculate the relative variation of cost
old_cost <- new_cost
iter_num <- iter_num+1
}
alpha <- init_list[[(N+1)]] ## sub_matrix alpha is the last one in init_list
clu_res <- kmeans(alpha,k,1000,20) ## clustering the samples by kmeans on the sub_matrix alpha, which is a conventional method in matrix factorization based clustering.
cluster_res <- clu_res$cluster ## get the clustering result, the ith element indicates the cluster of the ith sample
result <- vector("list",2)
names(result) <- c("sub_matrices","cluster_res")
sub_result <- init_list
names(sub_result) <- c(paste("sub_matrix_L",c(1:N),sep='_'),'alpha')
result[[1]] <- sub_result
result[[2]] <- cluster_res
return(result)
}

23 changes: 23 additions & 0 deletions R/Standard_Normalization.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#' Normalize the input matrix by column
#'
#' Normalize each column of x to have mean 0 and standard deviation 1.
#'
#' @param x A sample-feature matrix with rows as samples and columns as features
#'
#' @return A sample-feature matrix with rows as samples and columns as features,each column of the matrix have mean 0 and standard deviation 1
#' @author Xiaoyao Yin
#' @examples
#'
#' data_list <- simu_data_gen()
#' x <- data_list[[1]]
#' data_matrix <- Standard_Normalization(x)
#' @rdname Standard_Normalization
#' @export
Standard_Normalization = function(x) {
x = as.matrix(x);
mean = apply(x, 2, mean)
sd = apply(x, 2, sd)
sd[sd==0] = 1
xNorm = t((t(x) - mean) / sd)
return(xNorm)
}
46 changes: 46 additions & 0 deletions R/affinityMatrix.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#' To calculate the similarity matrix
#'
#' calculate the affinity matrix from the diff matrix with 20 neighbors
#'
#' @param Diff A diff matrix
#'
#' @param K The number of neighbors in consideration
#'
#' @param sigma A parameter to determine the scale
#'
#' @return W The similarity matrix
#' @author Xiaoyao Yin
#' @examples
#'
#' data_list <- simu_data_gen()
#' Diff <- dist2eu(Standard_Normalization(data_list[[1]]),Standard_Normalization(data_list[[1]]))
#' simi <- affinityMatrix(Diff,20,0.5)
#' @rdname affinityMatrix
#' @export
affinityMatrix <- function(Diff,K=20,sigma=0.5) {
N = nrow(Diff)

Diff = (Diff + t(Diff)) / 2
diag(Diff) = 0;
sortedColumns = as.matrix(t(apply(Diff,2,sort)))
finiteMean <- function(x) { mean(x[is.finite(x)]) }
means = apply(sortedColumns[,1:K+1],1,finiteMean)+.Machine$double.eps;

avg <- function(x,y) (x+y)
Sig = outer(means,means,avg)/3 + Diff/3 + .Machine$double.eps;
Sig[Sig <= .Machine$double.eps] = .Machine$double.eps
#densities = dnorm(Diff,0,sigma*Sig,log = FALSE)
densities <- exp(-Diff/(sigma*Sig))

normalize <- function(X)
{
D <- apply(X,1,sum)
Y <- diag(1/sqrt(D))
Z <- Y%*%X%*%Y
return (Z)
}
W = normalize(densities)
return(W)
#
# return(densities)
}
40 changes: 40 additions & 0 deletions R/cost.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#' Calculate the cost
#'
#' A function to calculate the cost of the objective function
#'
#' @param new_WL_list A list of matrices factorized from the similarity matrices list WL
#'
#' @param init_list A list containing the updated result in this iteration
#'
#' @param lambda A parameter to set the relative weight of the group sparsity constraints
#'
#' @return A number indicating the total cost of the objective function
#' @author Xiaoyao Yin
#' @examples
#'
#' WL <- simu_data_gen()
#' WL[[1]] <- affinityMatrix(dist2eu(Standard_Normalization(WL[[1]]),Standard_Normalization(WL[[1]])))
#' WL[[2]] <- affinityMatrix(dist2eu(Standard_Normalization(WL[[2]]),Standard_Normalization(WL[[2]])))
#' new_WL_list <- initialize_WL(WL)
#' k <- 5
#' lambda <- 0.25
#' init_list <- initialization(new_WL_list,k)
#' update_L_list <- update_L(new_WL_list,init_list)
#' update_alpha_list <- update_alpha(new_WL_list,update_L_list,lambda)
#' init_list <- update_alpha_list
#' new_loss <- cost(new_WL_list,init_list,lambda)
#' @rdname cost
#' @export
cost <- function(new_WL_list,init_list,lambda)
{
res <- 0
NN <- length(new_WL_list)
alpha <- init_list[[(NN+1)]]
for (i in 1:NN)
{
matr_loss <- new_WL_list[[i]]-alpha%*%init_list[[i]]
res <- res+(norm(matr_loss,"F")^2)/2
}
res <- res+lambda*init_list[[(NN+2)]]
return(res)
}
32 changes: 32 additions & 0 deletions R/dist2bin.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#' Calculate the agreement-based measurement
#'
#' Calculate the agreement-based measurement of two any pair-wise samples x_i and x_j for binary variables
#'
#' @param X A sample-feature matrix with rows as samples and columns as features
#'
#' @param C The same as X
#'
#' @return A matrix whose elements at (i,j) is the agreement-based measurement of two any pair-wise samples x_i and x_j
#' @author Xiaoyao Yin
#' @examples
#'
#' data_list <- simu_data_gen()
#' X <- data_list[[1]]
#' C <- X
#' Diff <- dist2bin(X,C)
#' @rdname dist2bin
#' @export
dist2bin <- function(X,C) {
ndata = nrow(X)
ncentres = nrow(C)

res = matrix(0,ndata,ncentres)
for (i in 1:ndata)
{
for (j in 1:ncentres)
{
res[i,j] = sum(X[i,]!=C[j,])
}
}
return(res)
}
34 changes: 34 additions & 0 deletions R/dist2chi.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#' Calculate the chi-squared distance
#'
#' Calculate the chi-squared distance of two any pair-wise samples x_i and x_j for discrete variables
#'
#' @param X A sample-feature matrix with rows as samples and columns as features
#'
#' @param C The same as X
#'
#' @return A matrix whose elements at (i,j) is the chi-squared distance of two any pair-wise samples x_i and x_j
#' @author Xiaoyao Yin
#' @examples
#'
#' data_list <- simu_data_gen()
#' X <- data_list[[1]]
#' C <- X
#' Diff <- dist2chi(X,C)
#' @rdname dist2chi
#' @export
dist2chi <- function(X,C) {
ndata = nrow(X)
ncentres = nrow(C)
meanX = apply(X,2,mean)
meanC = apply(C,2,mean)

res = matrix(0,ndata,ncentres)
for (i in 1:ndata)
{
for (j in 1:ncentres)
{
res[i,j] = sum((X[i,]-C[j,])^2/(X[i,]+C[j,]))/2
}
}
return(res)
}

0 comments on commit 3d54f69

Please sign in to comment.