In [1]:
library("MASS")
library("pracma")
library("data.table") #copy

“package ‘MASS’ was built under R version 4.1.3”


# simulation related 

In [2]:
#sigmas: an array of m features by k sources by k sources, indicating the k by k covariance matrix for each feature
#return the corresponding correlation matrices for all features: an array of m features by k sources by k sources
calc_corrs_from_sigmas = function(sigmas){
    
    m = dim(sigmas)[1]
    k = dim(sigmas)[2]
    
    corrs = array(0, c(m, k, k))
    for (j in 1:m){
        corrs[j,,]  = sigmas[j,,]/sqrt(diag(sigmas[j,,]) %*% t(diag(sigmas[j,,])))
    }
    dimnames(corrs)[[1]] = dimnames(sigmas)[[1]]
    dimnames(corrs)[[2]] = dimnames(sigmas)[[2]]
    dimnames(corrs)[[3]] = dimnames(sigmas)[[3]]
    
    #if there are no variation in one celltype. corresponding sigmas will be 0, related cor will be NA
    #here set those to 0
    corrs[is.na(corrs)] = 0
    return(corrs)
}

#ltri_val: numbers in the lower triangle of a matrix
#k: the dimension (number of rows or cols) of the full symmetric matrix 
#return a k by k symmetric matrix constructed from the ltri_val
fill_lower_tri = function(ltri_val, k){
    full_mat <- matrix(0,k,k)
    full_mat[lower.tri(full_mat,diag = T)] <- ltri_val
    full_mat <- t(full_mat)
    full_mat[lower.tri(full_mat,diag = T)] <- ltri_val
    return (full_mat)
}

# corrs: an array of shape m features by k sources by k sources
# return a matrix of m features by 1
# per feature, calculate entropy given correlations (k by k) matrix
calc_entropy <- function(corrs){
    m <- dim(corrs)[1]
    k <- dim(corrs)[2]
    entropies <- numeric(m)
    for (j in 1:m){
        sigma_j <- corrs[j,,]
        # von Neumann entropy for Pearson correlation matrix (https://arxiv.org/pdf/2106.05379.pdf)
        # make sure it is a valid full rank correlation matrix
        diag(sigma_j) = 1
        sigma_j_eig <- tryCatch({
            eig(sigma_j/k)
        },error=function(cond){
            print(j)
        }) 
    
    #this is trying to catch the case when the covariance matrix is not PSD
    #which might happen because of the way we are constructing the covariance matrix (by exluding different outlier samples)
    sigma_j_eig[sigma_j_eig<=0] <- 1e-5
    entropies[j] <- -sum(sigma_j_eig*log(sigma_j_eig))  
    } 
    
    entropies = as.matrix(entropies)
    rownames(entropies) = dimnames(corrs)[[1]]
    colnames(entropies) = "entropy"
    return(entropies)
}

#Z: an array of k sources by m features by n samples
#max_sds: numeric. Per (source,feature), samples whose value fall outside of max_sds will be ignored
#returns a list with the following entires: mus (m by k), sigmas (m by k by k), corrs (m by k by k), entropies (m by 1), 
#for features that are not expressed in a source. The varaince term on the diagnol is set to 0
#the correlation term is undefined, also set to 0
calc_params_from_Z <- function(Z, max_sds){
    
    k <- dim(Z)[1]
    m <- dim(Z)[2]
    n <- dim(Z)[3] 
    source.ids  = dimnames(Z)[[1]]
    feature.ids = dimnames(Z)[[2]]
    
    means  <- matrix(0,m,k)
    rownames(means) = feature.ids 
    colnames(means) = source.ids
    
    sigmas <- matrix(0,m,(choose(k,2) + k))
    for (j in 1:m){
        counter <- 1
        for (h in 1:k){    
            sd_jh <- sd(Z[h,j,])
            indices_jh <- abs(Z[h,j,] - mean(Z[h,j,])) <= max_sds*sd_jh
            #no such sample left
            if (sum(indices_jh) < 1){
                means[j,h] <- 0
            }else{
                means[j,h] <- mean(Z[h,j,indices_jh])
            }            
            for (l in h:k){
                sd_jl <- sd(Z[l,j,])
                indices_jl <- abs(Z[l,j,]-mean(Z[l,j,])) <= max_sds*sd_jl
                indices_jhl <- indices_jh & indices_jl
     
                if ((sum(indices_jhl) <=1)|(sd(Z[h,j,indices_jhl]) == 0) | (sd(Z[l,j,indices_jhl]) == 0)){
                    sigmas[j, counter] = 0
                }else{
                    sigmas[j, counter] <- cov(Z[h,j,indices_jhl],Z[l,j,indices_jhl])
                }
                counter <- counter + 1
            }
        }  
    } 
    
    sigmas.tensor = array(0,c(m,k,k))
    dimnames(sigmas.tensor)[[1]] = feature.ids 
    dimnames(sigmas.tensor)[[2]] = source.ids
    dimnames(sigmas.tensor)[[3]] = source.ids
   
    for (j in 1:m){
        sigmas.tensor[j,,] = fill_lower_tri(sigmas[j,], k)
    }
    corrs.tensor  = calc_corrs_from_sigmas(sigmas.tensor)
    
    
    params = list()
    params$sigmas    = sigmas.tensor
    params$corrs     = corrs.tensor
    params$mus       = means
    params$entropies = calc_entropy(params$corrs)
    return(params)
}

### checking  calc_corrs_from_sigmas, get_cor_mat, calc_params_from_Z, 

In [3]:
# simulate 4 sources, 1 feature, 1000 samples
sim.Z = array(0, c(4,4,1000))

# a MVN with real covar 
sim.Z[,1,] = t(mvrnorm(n = 1000, mu = c(10,20,30,40), 
                      Sigma = matrix(c(10, 0,  3,  8,
                                        0,  5,  2,  0,
                                        3,  2, 10,  0,
                                        8,  0,  0,  20), 
                                     4,4, byrow =T), 
                      tol = 1e-6, empirical = FALSE, EISPACK = FALSE))

# a MVN with diag aka independent 
sim.Z[,2,] = t(mvrnorm(n = 1000, mu = c(10,20,30,40), 
                      Sigma = diag(4), 
                      tol = 1e-6, empirical = FALSE, EISPACK = FALSE))

# a MVN with fully dependent structure 
sim.Z[,3,] = t(mvrnorm(n = 1000, mu = c(10,20,30,40), 
                      Sigma = matrix(1, 4, 4),
                      tol = 1e-6, empirical = FALSE, EISPACK = FALSE))

# a MVN with certain source having no variantion, not expressed 
sim.Z[,4,] = t(mvrnorm(n = 1000, mu = c(0,1,1,1), 
                      Sigma = matrix(c( 0, 0,  0,  0,
                                        0,  5,  2,  0,
                                        0,  2, 10,  0,
                                        0,  0,  0,  20), 
                                     4,4, byrow =T), 
                      tol = 1e-6, empirical = FALSE, EISPACK = FALSE))

In [4]:
params = calc_params_from_Z(sim.Z, max_sds = 5)

In [5]:
params$mus

0,1,2,3
10.155978,20.0345687,30.0402418,40.076474
10.013753,19.981871,30.069827,39.94747
9.972139,19.9721388,29.9721388,39.972139
0.0,0.9756215,0.9174506,0.905303


In [6]:
params$sigmas[1,,]

0,1,2,3
10.1562957,0.1554315,3.20212533,8.46605558
0.1554315,5.2394507,1.92598608,0.21365163
3.2021253,1.9259861,9.61577473,0.09983893
8.4660556,0.2136516,0.09983893,20.92974165


In [7]:
params$sigmas[2,,]

0,1,2,3
0.92327085,0.04059547,0.04379319,0.0262497
0.04059547,1.00307523,0.01085054,-0.02759156
0.04379319,0.01085054,0.99966492,-0.01909739
0.0262497,-0.02759156,-0.01909739,1.04161007


In [8]:
params$sigmas[3,,]

0,1,2,3
1.04589,1.04589,1.04589,1.04589
1.04589,1.04589,1.04589,1.04589
1.04589,1.04589,1.04589,1.04589
1.04589,1.04589,1.04589,1.04589


In [9]:
params$sigmas[4,,]

0,1,2,3
0,0.0,0.0,0.0
0,4.95481057,2.2675655,0.05765383
0,2.26756553,10.4649995,-0.70272869
0,0.05765383,-0.7027287,20.2226572


In [10]:
params$corrs[1,,]

0,1,2,3
1.0,0.0213073,0.324025,0.58067284
0.0213073,1.0,0.27134269,0.02040241
0.324025,0.27134269,1.0,0.00703762
0.5806728,0.02040241,0.00703762,1.0


In [11]:
params$corrs[2,,]

0,1,2,3
1.0,0.04218386,0.04558424,0.02676746
0.04218386,1.0,0.01083571,-0.02699336
0.04558424,0.01083571,1.0,-0.01871519
0.02676746,-0.02699336,-0.01871519,1.0


In [12]:
params$corrs[3,,]

0,1,2,3
1,1,1,1
1,1,1,1
1,1,1,1
1,1,1,1


In [13]:
params$corrs[4,,]

0,1,2,3
0,0.0,0.0,0.0
0,1.0,0.31490294,0.005759642
0,0.314902937,1.0,-0.048305765
0,0.005759642,-0.04830577,1.0


### check calc_entropy

In [14]:
dim(params$corrs)

In [15]:
calc_entropy(params$corrs)

entropy
1.2467631851
1.3848437683
0.0001151293
1.3604406683


### testing when there is no sample left like when max_sds is set to super tiny number

In [16]:
# testing when there is no sample left like when max_sds is set to super tiny number
sim.Z = array(0, c(4,1,10))
sim.Z[,1,] = t(mvrnorm(n = 10, mu = c(0,1,1,1), 
                      Sigma = matrix(c( 10, 0,  0,  0,
                                        0,  5,  2,  0,
                                        0,  2, 10,  0,
                                        0,  0,  0,  20), 
                                     4,4, byrow =T), 
                      tol = 1e-6, empirical = FALSE, EISPACK = FALSE))

In [17]:
params = calc_params_from_Z(sim.Z, max_sds = 0.01)

In [18]:
params$mus

0,1,2,3
0,0,0,0


In [19]:
params$sigmas[1,,]

0,1,2,3
0,0,0,0
0,0,0,0
0,0,0,0
0,0,0,0


In [20]:
params$corrs[1,,]

0,1,2,3
0,0,0,0
0,0,0,0
0,0,0,0
0,0,0,0


In [21]:
params$entropies

entropy
1.386294


# Tensor numerical stability

In [22]:
# data: any data structure, could be 2d matrix or 3d array 
# min.val: numeric. minimal absolute value to be capped 
# max.val: numeric. maximal absolute value to be capped 
# NA in the data structure is turned into + min.val
# absolute values larger than max.val or smaller than min.val (including Inf, -Inf)are replaced by max.val and min.val
# sign information is also preserved
cap_values = function(data, min.val = 10**(-4), max.val = 10**(4)){
    if(sum(is.na(data)| is.infinite(data)) != 0){
        print(paste0("there are NAN: ", sum(is.na(data))))
        data[is.na(data)] <- min.val
    }
    if(sum(abs(data) < min.val) != 0){
        print(paste0("there are extrmemely close to 0 values: ", sum(abs(data) < min.val)))
        sign.info = sign(data[which(abs(data) < min.val)])
        data[which(abs(data) < min.val)] <- sign.info * min.val
    }
    if(sum(abs(data) > max.val) != 0){
        print(paste0("there are extrmemely large values: " , sum(abs(data) < max.val)))
        sign.info = sign(data[which(abs(data) > max.val)])
        data[which(abs(data) > max.val)] <- sign.info * max.val
    }
    return(data)
} 

In [23]:
cap_values(matrix(c(-1,  1,   -2,    2,
                    NA,  NA,  Inf, -Inf,
                    -10, 10,  -100, 100), nrow = 3, byrow = T),  min.val = 1, max.val = 5)

[1] "there are NAN: 2"
[1] "there are extrmemely large values: 6"


0,1,2,3
-1,1,-2,2
1,1,5,-5
-5,5,-5,5


In [24]:
cap_values(matrix(c(-1,  1,   -2,    2,
                    NA,  NA,  Inf, -Inf,
                    -10, 10,  -100, 100), nrow = 3, byrow = T),  min.val = 0.5, max.val = 20)

[1] "there are NAN: 2"
[1] "there are extrmemely large values: 8"


0,1,2,3
-1.0,1.0,-2,2
0.5,0.5,20,-20
-10.0,10.0,-20,20


In [25]:
sign(matrix(c(-1,  1,   -2,    2,
                    NA,  NA,  Inf, -Inf,
                    -10, 10,  -100, 100), nrow = 3, byrow = T))

0,1,2,3
-1.0,1.0,-1,1
,,1,-1
-1.0,1.0,-1,1


# TCA result format 

In [26]:
# Z: TCA's tensor in the form of a list of matrices. each list element is a celltype. each matrix is features by samples
# source.ids: a list of charactors, indicating the sources/celltypes
# turn TCA's tensor Z to an 3d array format with all the dimnames 
list_2_array = function(Z, source.ids){
    
    k = length(Z)
    m = dim(Z[[1]])[1]
    n = dim(Z[[1]])[2]
    Z.array = array(0, c(k, m, n))
    
    dimnames(Z.array)[[1]] = source.ids
    dimnames(Z.array)[[2]] = rownames(Z[[1]])
    dimnames(Z.array)[[3]] = colnames(Z[[1]])
    
    for(h in 1:k){
        Z.array[h,,] = Z[[h]]
    }
    return(Z.array)
}

In [27]:
s1 = matrix(1, 20, 10)
rownames(s1) = paste("feature.",1:20)
colnames(s1) = paste("sample.",1:10)
s2 = copy(s1)

Z.hat = list_2_array(list(s1, s2), source.ids = c("source.1", "source.2"))

In [28]:
str(Z.hat)

 num [1:2, 1:20, 1:10] 1 1 1 1 1 1 1 1 1 1 ...
 - attr(*, "dimnames")=List of 3
  ..$ : chr [1:2] "source.1" "source.2"
  ..$ : chr [1:20] "feature. 1" "feature. 2" "feature. 3" "feature. 4" ...
  ..$ : chr [1:10] "sample. 1" "sample. 2" "sample. 3" "sample. 4" ...


In [29]:
Z.hat[,1,]

Unnamed: 0,sample. 1,sample. 2,sample. 3,sample. 4,sample. 5,sample. 6,sample. 7,sample. 8,sample. 9,sample. 10
source.1,1,1,1,1,1,1,1,1,1,1
source.2,1,1,1,1,1,1,1,1,1,1


In [30]:
Z.hat[1,,]

Unnamed: 0,sample. 1,sample. 2,sample. 3,sample. 4,sample. 5,sample. 6,sample. 7,sample. 8,sample. 9,sample. 10
feature. 1,1,1,1,1,1,1,1,1,1,1
feature. 2,1,1,1,1,1,1,1,1,1,1
feature. 3,1,1,1,1,1,1,1,1,1,1
feature. 4,1,1,1,1,1,1,1,1,1,1
feature. 5,1,1,1,1,1,1,1,1,1,1
feature. 6,1,1,1,1,1,1,1,1,1,1
feature. 7,1,1,1,1,1,1,1,1,1,1
feature. 8,1,1,1,1,1,1,1,1,1,1
feature. 9,1,1,1,1,1,1,1,1,1,1
feature. 10,1,1,1,1,1,1,1,1,1,1


In [19]:
# mat: a matrix that is number of features by number of sources
# return a tensor that is number of features by number of sources by number of sources
# per feature, it turns the original vector (of length number of sources) to entries in a diagnol matrix
matrix_to_diag_tensor = function(mat){
    m = nrow(mat)
    k = ncol(mat)
    feature.ids = rownames(mat)
    source.ids  = colnames(mat)
    
    dtensor = array(0, c(m,k,k))
    dimnames(dtensor)[[1]] = feature.ids
    dimnames(dtensor)[[2]] = source.ids
    dimnames(dtensor)[[3]] = source.ids
    
    for (j in 1:m){
        dtensor[j,,] = diag(mat[j,])
    }
    return(dtensor) 
}

In [20]:
sigmas_hat = matrix(c(1, 2, 3, 4, 5, 6),ncol = 3, byrow = T)
rownames(sigmas_hat) = paste0("feature.", 1:2)
colnames(sigmas_hat) = paste0("source.", 1:3)

In [21]:
sigmas_hat

Unnamed: 0,source.1,source.2,source.3
feature.1,1,2,3
feature.2,4,5,6


In [22]:
matrix_to_diag_tensor(sigmas_hat)[1,,]

Unnamed: 0,source.1,source.2,source.3
source.1,1,0,0
source.2,0,2,0
source.3,0,0,3


In [23]:
matrix_to_diag_tensor(sigmas_hat)[2,,]

Unnamed: 0,source.1,source.2,source.3
source.1,4,0,0
source.2,0,5,0
source.3,0,0,6


# Evaluation related 

In [2]:
# x: list of numbers  
# y: list of numbers 
# robust: boolean. indicate if using robust correlation
# qtl: numeric between (0,1), only meaningful when robust is turned on. indicate the fraction of the data that is considered to be "good portion" and will participate in the correlation calculation
# check the correlation between x and y after excluding outliers defined by the qtl
# note that if in IQR is 0 in either x or y, OR x and y are completely colinear, robust correlation will be NA and thus filled by 0
safe_cor = function(x, y, robust = FALSE, qtl = 0.95){
    res = tryCatch({
        if(robust){
            cov.rob(cbind(x,y), cor = TRUE, quantile.used = round(qtl*length(x)) ,method = "mve")$cor[1,2] 
        }else{
            cor(x,y)
        }
        
    }, error = function(cond){
        0
    })
    
    # in regular correlation if all constant, then it is going to be NA
    if (is.na(res)){
        return(0)
    }else{
        return(res)
    }
    
}

In [3]:
# if use all observations, not robust
safe_cor(c(1,2,3,4,5,6,7,8,9,1000), c(1,2,3,4,5,6,7,8,9, -1000), F, 1)

In [4]:
# if use all observations, robust, expect to have an error and return 0, need to exlude at least 1 observation
safe_cor(c(1,2,3,4,5,6,7,8,9,1000), c(1,2,3,4,5,6,7,8,9, -1000), T, 1)

In [5]:
# if use good portion of observations
safe_cor(c(1,2,3,4,5,6,7,8,9,1000), c(1,2,3,4,5,6,7,8,9, -1000), T, 0.9)

In [6]:
# completely colinear raw x and y will trigger this to be NA and filled by 0
safe_cor(c(1,2,3,4,5,6,7,8,9,1000), c(1,2,3,4,5,6,7,8,9, 1000), T, 0.9)

In [13]:
#slightly not colinear will do 
safe_cor(c(1,2,3,4,5,6,7,8,9,1000), c(2,3,4,5,6,7,8,9,10,1000), T, 0.9)

In [9]:
# if constant, return 0
safe_cor(rep(1, 10), rep(2, 10), T, 0.9)

In [10]:
# if constant, return 0
safe_cor(rep(1, 10), rep(2, 10), F, 0.9)

“the standard deviation is zero”


In [14]:
# Z.true is an array of k by m by n, the true tensor 
# Z.hat is an array of k by m by n, the estimated tensor 
# eval.feature.source is a matrix of logical values that is number of features by number of sources. only calculate correlation on feature-source that has TRUE in this matrix 
# robust: boolean. indicate if using robust correlation
# qtl: numeric between (0,1), only meaningful when robust is turned on. indicate the fraction of the data that is considered to be "good portion" and will participate in the correlation calculation
# return a m by k correltion matrix, each column is a different source and each row is a different feature, 
# each entry is a correlation score, indicating that feautre, that source's tensor estimated across samples
# those entry with FALSE in the eval.feature.source is going to be set to NA 
calc_Z_corrs <- function(Z.true, Z.hat, eval.feature.source = NULL, robust = TRUE, qtl = 0.95){
    
    m = dim(Z.true)[2]
    k = dim(Z.true)[1]
    Z.corrs = matrix(NA, m, k)
    rownames(Z.corrs) = dimnames(Z.true)[[2]]
    colnames(Z.corrs) = dimnames(Z.true)[[1]]
    
    # if eval.feature.source is not present, return all results
    if(is.null(eval.feature.source)){
        eval.feature.source = matrix(TRUE, m, k)
    }
    
    for (h in 1:k){
        for (j in 1:m){
            if(eval.feature.source[j,h]){
                Z.corrs[j,h] <- safe_cor(Z.true[h,j,], Z.hat[h,j,], robust, qtl)
            }else{
                Z.corrs[j,h] <- NA
            }
                
        }
    }
    return(Z.corrs)
}

In [18]:
Z.true = array(c(1, 2.1, 3.2, 4.1, 5.2, 6.3, 7.4, 8.2, 9.1, 10.9,
                 100, 2, 3,     4, 5,     6, 7,     8,   9, 10), 
               c(1,2,10))
dimnames(Z.true)[[1]] = c("source.1")
dimnames(Z.true)[[2]] = c("feature.1", "feature.2")
dimnames(Z.true)[[3]] = c(paste0("sample", 1:10))

Z.hat = array(c(  -1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                -100, 2, 3, 4, 5, 6, 7, 8, 9, 10), c(1,2,10))

In [19]:
calc_Z_corrs(Z.true, Z.hat, eval.feature.source = NULL, robust = T, qtl = 0.9)

Unnamed: 0,source.1
feature.1,0.9869641
feature.2,0.9992373


In [20]:
# if eval.feature.source is set to FALSE, that entry will not be evaled and set to NA
calc_Z_corrs(Z.true, Z.hat, eval.feature.source = matrix(c(T, F), 2, 1), robust = T, qtl = 0.9)

Unnamed: 0,source.1
feature.1,0.9869641
feature.2,


# moments correlation

In [21]:
#source.ids: list of sources
#generate all the unique combination of 2 difference sources  
get_covar_ids = function(source.ids){
    k = length(source.ids)
    covar.ids = list(choose( k,2))
    counter = 1
    for (h1 in 1:( k -1)){
        for (h2 in (h1 + 1): k){
            covar.ids[counter] = paste(source.ids[h1], source.ids[h2], sep = "-")
            counter = counter +1
        }
    }
    return(unlist(covar.ids))
}

In [22]:
get_covar_ids(c("a", "b", "c"))

In [23]:
for (covar.id in get_covar_ids(c("a", "b", "c"))){
    covar.pair = strsplit(covar.id,"-")[[1]]
    print(covar.pair[1])
    print(covar.pair[2])
}

[1] "a"
[1] "b"
[1] "a"
[1] "c"
[1] "b"
[1] "c"


In [24]:
# params.true: a list of 2 keys: mus, sigmas. mus is number of features by number of sources. sigmas is number of features by source by source. these are true parameters
# params.hat: similar to params.true but are the estimated 
# robust: boolean if to use robust correlation. default is TRUE
# qtl: numeric, the quantile of robust correlation, default is 0.95
get_moment_corrs = function (params.true, params.hat, robust = TRUE, qtl = 0.95){
            
    feature.ids = rownames(params.true$mus)
    source.ids  = colnames(params.true$mus)
    covar.ids   = get_covar_ids(source.ids)
    
    mus.rob.corrs   = matrix(0, 1, length(source.ids))
    colnames(mus.rob.corrs)   = source.ids
    rownames(mus.rob.corrs)   = c("mus.rob.corrs")
    
    var.rob.corrs   = matrix(0, 1, length(source.ids))
    colnames(var.rob.corrs)   = source.ids
    rownames(var.rob.corrs)   = c("var.rob.corrs")
    
    covar.rob.corrs = matrix(0, 1, choose(length(source.ids), 2))
    colnames(covar.rob.corrs) = covar.ids
    rownames(covar.rob.corrs) = c("covar.rob.corrs")

    for (h in 1:length(source.ids)){
        mus.rob.corrs[1,h]  = safe_cor(params.true$mus[,h], 
                                       params.hat$mus[,h], robust, qtl)
        
        var.rob.corrs[1,h]  = safe_cor(params.true$sigmas[,h,h], 
                                       params.hat$sigmas[,h,h], robust, qtl)
    }
    for (covar.id in covar.ids){
        covar.pair = strsplit(covar.id,"-")[[1]]
        covar.rob.corrs[1,covar.id] = safe_cor(params.true$sigmas[,covar.pair[1],covar.pair[2]], 
                                               params.hat$sigmas[,covar.pair[1],covar.pair[2]], robust, qtl)
    }
                      
    return(list(mus.rob.corrs   = mus.rob.corrs,         
                var.rob.corrs   = var.rob.corrs,      
                covar.rob.corrs = covar.rob.corrs))                    
}

In [25]:
k = 5
m = 600
n = 500

source.ids  = paste0("source.", 1:k)
feature.ids = paste0("feature.", 1:m)
sample.ids  = paste0("sample.", 1:n)

mus.true  = matrix (runif(m*k, 0, 1), m, k)
rownames(mus.true) = feature.ids
colnames(mus.true) = source.ids

mus.hat = mus.true + matrix (runif(m*k, 0, 0.1), m, k)

sigmas.true = array(runif(m*k*k, 0, 1), c(m,k,k))
dimnames(sigmas.true)[[1]] = feature.ids
dimnames(sigmas.true)[[2]] = source.ids
dimnames(sigmas.true)[[3]] = source.ids

# add noise and deliberately set a few entries to nonsense 
sigmas.hat = sigmas.true + array(runif(m*k*k, 0, 0.1), c(m,k,k))
sigmas.hat[,1,1] = runif(m, 0, 1)

sigmas.hat[,1,2] = runif(m, 0, 1)
sigmas.hat[,2,1] = runif(m, 0, 1)

sigmas.hat[,2,3] = runif(m, 0, 1)
sigmas.hat[,3,2] = runif(m, 0, 1)

sigmas.hat[,2,4] = runif(m, 0, 1)
sigmas.hat[,4,2] = runif(m, 0, 1)

sigmas.hat[,3,5] = runif(m, 0, 1)
sigmas.hat[,5,3] = runif(m, 0, 1)

params.true = list(mus = mus.true, sigmas = sigmas.true)
params.hat  = list(mus = mus.hat, sigmas = sigmas.hat)

In [26]:
moment.corrs = get_moment_corrs(params.true, params.hat, robust = T)

In [27]:
# estimated correlation matches the set up. only those that are deliberatly set to nonsense gets super low score
moment.corrs

Unnamed: 0,source.1,source.2,source.3,source.4,source.5
mus.rob.corrs,0.9951468,0.995078,0.9952077,0.9955865,0.995173

Unnamed: 0,source.1,source.2,source.3,source.4,source.5
var.rob.corrs,-0.02043646,0.9948213,0.9946894,0.9952005,0.9947666

Unnamed: 0,source.1-source.2,source.1-source.3,source.1-source.4,source.1-source.5,source.2-source.3,source.2-source.4,source.2-source.5,source.3-source.4,source.3-source.5,source.4-source.5
covar.rob.corrs,-0.078437,0.9950331,0.9948563,0.9953406,0.09902901,-0.01512013,0.9953173,0.9949377,-0.05831088,0.9953077


In [28]:
# mdl.list: a list of models, each represent one run
# key: character, the name of the variable to extract. 
# return concated version of that varibale, stacked vertically. columns remain the same
concat_key = function(mdl.list, key){
    res = list(length(mdl.list))
    for (t in 1:length(mdl.list)){
        res[[t]] = mdl.list[[t]][[key]]
    }
    res = Reduce(rbind, res)
    return(res)
}

# mdl.list: a list of models, each represent one run
# key1: character, the first name of the variable to extract. 
# key2: character, the second name of the variable to extract. 
# return concated version of that varibale, stacked vertically. columns remain the same
concat_2_keys = function(mdl.list, key1, key2){
    res = list(length(mdl.list))
    for (t in 1:length(mdl.list)){
        res[[t]] = mdl.list[[t]][[key1]][[key2]]
    }
    res = Reduce(rbind, res)
    return(res)
}

In [29]:
mdl.list = list()
mdl.list[[1]] = list(mus = matrix (runif(10*5, 0, 1), 10, 5),
                     moment.recon.corrs = list(mus = matrix (runif(5, 0, 1), 1, 5),
                                         var = matrix (runif(5, 0, 1), 1, 5)))

mdl.list[[2]] = list(mus = matrix (runif(10*5, 0, 1), 10, 5),
                     moment.recon.corrs = list(mus = matrix (runif(5, 0, 1), 1, 5),
                                         var = matrix (runif(5, 0, 1), 1, 5)))

In [30]:
str(mdl.list)

List of 2
 $ :List of 2
  ..$ mus               : num [1:10, 1:5] 0.9151 0.0284 0.8564 0.5953 0.6996 ...
  ..$ moment.recon.corrs:List of 2
  .. ..$ mus: num [1, 1:5] 0.359 0.214 0.664 0.406 0.763
  .. ..$ var: num [1, 1:5] 0.617 0.406 0.482 0.237 0.24
 $ :List of 2
  ..$ mus               : num [1:10, 1:5] 0.212 0.982 0.98 0.172 0.542 ...
  ..$ moment.recon.corrs:List of 2
  .. ..$ mus: num [1, 1:5] 0.254 0.0571 0.2576 0.0705 0.596
  .. ..$ var: num [1, 1:5] 0.925 0.752 0.384 0.158 0.135


In [31]:
concat_key(mdl.list,"mus")

0,1,2,3,4
0.915120249,0.847544433,0.94301872,0.702532472,0.83307632
0.028412883,0.102523704,0.95303982,0.457286095,0.67995812
0.856423965,0.357401809,0.70727609,0.332869947,0.48777865
0.595315177,0.057135828,0.18137668,0.090360845,0.93684311
0.699557907,0.502508165,0.42942553,0.939525913,0.45776004
0.118900904,0.685385701,0.65910546,0.323143884,0.93571168
0.507680171,0.363697683,0.82428247,0.357678541,0.71698808
0.785444007,0.327729658,0.3719597,0.513151643,0.98487607
0.252443637,0.215316348,0.91011644,0.504627376,0.06649987
0.007464877,0.088174807,0.51932122,0.563663266,0.56725766


In [32]:
concat_2_keys(mdl.list,"moment.recon.corrs","mus")

0,1,2,3,4
0.3591791,0.21429001,0.6644137,0.40622711,0.7625677
0.2540414,0.05712932,0.2575532,0.07052129,0.5959777


# negative values in Z

In [41]:
#Z: a 3D array of sources by features by samples
#return a new version with no negative values, per source, feature, if the minimal value is below zero, 
#shift the entire distribution across all samples by the - of the minimal value  
none_neg_Z = function(Z){
    k = dim(Z)[1]
    m = dim(Z)[2]
    n = dim(Z)[3]
    counter = 0
    
    for (l in 1:k){
        for (j in 1:m){
            sample.min = min(Z[l,j,])
            if(sample.min < 0){
                counter = counter + 1
                #move entire distribution to non-neg range 
                Z[l,j,] = Z[l,j,] + (-sample.min)
            }
        }
    }
    
    message(paste0(round((counter/(m * k)) * 100, 2), " percent of the feature-source are shifted to be non negative"))
    return(Z)
}

In [42]:
Z.hat = array(-1, c(5, 100, 20))

In [43]:
Z.hat[1,1,1] = -100

In [44]:
Z.hat[1,,]

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
-100,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [45]:
# make sure shifted by that feature-source's min sample 
none_neg_Z(Z.hat)[1,,]

100 percent of the feature-source are shifted to be non negative



0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
