In [1]:
library(data.table)
source("analysis.utils.r")

set.seed(2023)

“package ‘MASS’ was built under R version 4.1.3”


In [2]:
# Name of the feature set; it is used to build the data/result directories
# and the names of the input .rds files.
feature.set = "hvf.10k"
# Alternative feature set (disabled):
#feature.set = "random.10k"

# NOTE(review): `robust` is not read by any cell visible in this notebook
# (calc_center_Z_hat_metrics hard-codes its own robust flag) - confirm it is still needed.
robust = T
# Passed as `qtl` to calc_center_Z_hat_metrics_CI in the evaluation cell.
qtl = 0.95

In [3]:
# Input (data) and output (result) directories for the selected feature set.
data.dir = paste0("../Data//Methylation/Purified-Reinius/", feature.set)
res.dir  = paste0("../Result//Methylation/Purified-Reinius/", feature.set)

# Only reports a missing result directory; execution continues either way.
if (!file.exists(res.dir)){print("no result yet")}
print(data.dir)
print(res.dir)

[1] "../Data//Methylation/Purified-Reinius/hvf.10k"
[1] "../Result//Methylation/Purified-Reinius/hvf.10k"


# Load data

In [4]:
# Read the two datasets for the selected feature set from data.dir.
# NOTE(review): `hannum` is not referenced again in the cells visible here - confirm it is needed.
hannum  = readRDS(file.path(data.dir, paste0("hannum.", feature.set, ".rds")))
reinius = readRDS(file.path(data.dir, paste0("reinius.", feature.set, ".rds")))

In [5]:
# Algorithm settings.
# The three penalties and max_stds are folded into pen.config below, which is
# part of the tcax model file name that is loaded and saved in this notebook.
mean_penalty  = 0
var_penalty   = 0.01
covar_penalty = 0.01

max_stds = 2
# Alternative setting (disabled):
#max_stds = 3

# NOTE(review): the iteration limits and the optimizer name are not read by any
# cell visible here - presumably kept to mirror the fitting run; confirm.
mean_max_iterations = 2 
var_max_iterations = 3
nloptr_opts_algorithm = "NLOPT_LN_COBYLA"

# Underscore-separated tag identifying the penalty configuration.
pen.config = paste("mp", mean_penalty, "vp", var_penalty, "cp", covar_penalty, "maxStds", max_stds, sep =  "_")
print(pen.config)

[1] "mp_0_vp_0.01_cp_0.01_maxStds_2"


In [6]:
# Load the fitted model objects from res.dir. Only the tcax file name depends
# on the penalty configuration tag (pen.config).
base.mdl       = readRDS(file.path(res.dir, "base.mdl.rds"))
cibersortx.mdl = readRDS(file.path(res.dir, "cibersortx.mdl.rds"))
tca.mdl        = readRDS(file.path(res.dir, "tca.mdl.rds"))
tcax.mdl       = readRDS(file.path(res.dir, paste0("tcax.mdl.", pen.config, ".rds")))
bMIND.mdl      = readRDS(file.path(res.dir, "bMIND.mdl.rds"))

# Eval helper

In [7]:
# Center one source's values per feature and collapse all selected features
# into a single long "meta feature" vector.
#
# Args:
#   Z.ref: reference array indexed as [source, feature, sample]
#   Z.hat: estimated array indexed as [source, feature, sample]
#   h:     index of the source to extract
#   mask:  feature selector (e.g. a logical vector over features); it is
#          stripped of names/dims with as.vector before use
#
# Returns:
#   data.frame with columns `ref` and `hat`. Each column is the selected
#   feature-by-sample matrix of source h with its per-feature mean (across
#   samples) subtracted, flattened in column-major order.
calc_centered_Z_h = function(Z.ref, Z.hat, h, mask){
    mask = as.vector(mask)

    # A differing sample count could otherwise be silently recycled by data.frame().
    if (!identical(dim(Z.ref)[3], dim(Z.hat)[3])){
        stop("Z.ref and Z.hat must have the same number of samples (3rd dimension)")
    }

    center_source = function(Z){
        # drop = FALSE keeps all three dimensions even when the mask selects a
        # single feature (or there is a single sample); the slice is then
        # reshaped explicitly to features x samples.
        Z.h = Z[h, mask, , drop = FALSE]
        Z.h = matrix(Z.h, nrow = dim(Z.h)[2], ncol = dim(Z.h)[3])

        # Column-major recycling of the length-nrow mean vector subtracts each
        # feature's mean from every sample of that feature.
        as.vector(Z.h - rowMeans(Z.h))
    }

    return(data.frame(ref = center_source(Z.ref), hat = center_source(Z.hat)))
}

In [8]:
# Compare centered reference and centered estimated values, one "meta feature"
# per source, under each feature stratification.
#
# Args:
#   Z.ref:     ground truth tensor with proper dim names: #sources by #features by #samples
#   Z.hat:     estimated tensor with proper dim names: #sources by #features by #samples
#   group.mat: matrix, number of features by number of stratifications. Each
#              column holds booleans saying whether a feature is kept (TRUE) or
#              excluded (FALSE) under that column name. A plain vector is
#              treated as a single stratification.
#   qtl:       a number between 0 and 1 indicating the good portion of the data;
#              forwarded unchanged to safe_cor
#
# Returns:
#   A list with 5 matrices, each #sources by #stratifications:
#   center.Z.corrs (safe_cor with robust = TRUE), center.Z.MAE (mean absolute
#   error), center.Z.MedAE (median absolute error), center.Z.RMS (root mean
#   square error) and center.Z.RMedS (root median square error).
#
# Relies on calc_centered_Z_h and safe_cor, which are defined outside this block.
calc_center_Z_hat_metrics = function(Z.ref, Z.hat, group.mat, qtl){

    # Accept a single stratification given as a vector.
    if (is.null(dim(group.mat))) group.mat = as.matrix(group.mat)

    n.sources = dim(Z.ref)[1]
    n.groups  = ncol(group.mat)

    # Template result matrix; R copies on modification, so plain assignment
    # gives every metric its own independent matrix.
    blank = matrix(0, n.sources, n.groups,
                   dimnames = list(dimnames(Z.ref)[[1]], colnames(group.mat)))

    center.Z.corrs = blank
    center.Z.MAE   = blank
    center.Z.MedAE = blank
    center.Z.RMS   = blank
    center.Z.RMedS = blank

    # seq_len() makes both loops no-ops for zero sources / zero stratifications.
    for (h in seq_len(n.sources)){
        for (mask.i in seq_len(n.groups)){

            res = calc_centered_Z_h(Z.ref, Z.hat, h, mask = group.mat[,mask.i])
            err = res$ref - res$hat

            center.Z.corrs[h,mask.i] = safe_cor(res$ref, res$hat, robust = TRUE, qtl = qtl)

            center.Z.MAE[h,mask.i]   = mean(abs(err))
            center.Z.MedAE[h,mask.i] = median(abs(err))

            center.Z.RMS[h,mask.i]   = sqrt(mean(err^2))
            center.Z.RMedS[h,mask.i] = sqrt(median(err^2))
        }
    }

    metrics = list(center.Z.corrs = center.Z.corrs,
                   center.Z.MAE  = center.Z.MAE, center.Z.MedAE = center.Z.MedAE,
                   center.Z.RMS  = center.Z.RMS, center.Z.RMedS = center.Z.RMedS)

    return(metrics)
}

In [9]:
# Repeat calc_center_Z_hat_metrics on random feature subsets.
#
# Args:
#   Z.ref:       ground truth tensor with proper dim names: #sources by #features by #samples
#   Z.hat:       estimated tensor with proper dim names: #sources by #features by #samples
#                (it is subset by feature name, so it needs the same feature names as Z.ref)
#   group.mat:   matrix, number of features by number of stratifications, with
#                feature names as row names. Each column holds booleans saying
#                whether a feature is kept (TRUE) or excluded (FALSE). A named
#                vector is treated as a single stratification.
#   qtl:         a number between 0 and 1 indicating the good portion of the data
#   sample.itr:  number of random subsets to evaluate
#   sample.size: number of features drawn for each subset
#   seed.number: RNG seed, defaults to 2023
#
# Returns:
#   A list with `features.list` (the feature ids of every subset) and
#   `metrics.list` (for every subset, the 5 metrics calculated by
#   calc_center_Z_hat_metrics).
#
# Side effect: calls set.seed(seed.number), i.e. resets the global RNG state.
calc_center_Z_hat_metrics_CI = function(Z.ref, Z.hat, group.mat, qtl, sample.itr, sample.size, seed.number = 2023){
    set.seed(seed.number)

    feature.ids = dimnames(Z.ref)[[2]]
    if (is.null(feature.ids)){
        stop("Z.ref must carry feature names in dimnames(Z.ref)[[2]]")
    }

    # Accept a single stratification given as a named vector.
    if (is.null(dim(group.mat))) group.mat = as.matrix(group.mat)

    # All subsets are drawn in one go; replacement is only used when the
    # requested total exceeds the number of available features.
    n.draws = sample.itr * sample.size
    with.replacement = n.draws > length(feature.ids)
    if (with.replacement){
        message("sample with replacement")
    }else{
        message("sample without replacement")
    }
    feature.ids = sample(feature.ids, n.draws, replace = with.replacement)

    features.list = vector("list", sample.itr)
    metrics.list  = vector("list", sample.itr)
    for (t in seq_len(sample.itr)){
        # Consecutive, non-overlapping slice of the drawn ids for subset t.
        ids = feature.ids[(t - 1) * sample.size + seq_len(sample.size)]

        features.list[[t]] = ids
        # drop = FALSE keeps the array/matrix shape even for a single source,
        # a single sampled feature or a single stratification column.
        metrics.list[[t]]  = calc_center_Z_hat_metrics(Z.ref[, ids, , drop = FALSE],
                                                       Z.hat[, ids, , drop = FALSE],
                                                       group.mat[ids, , drop = FALSE], qtl)
    }
    res = list(features.list = features.list,
               metrics.list  = metrics.list)

    return(res)
}

# Eval

In [10]:
# Inspect the structure of the reinius dataset; its Z array is used as the reference tensor below.
str(reinius)

List of 5
 $ X     : num [1:10000, 1:6] 0.9149 0.0436 0.8372 0.8827 0.1134 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:10000] "cg12426467" "cg02113055" "cg00079898" "cg04131969" ...
  .. ..$ : chr [1:6] "sample.1" "sample.2" "sample.3" "sample.4" ...
 $ cov   : chr [1:6, 1] "Male" "Male" "Male" "Male" ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:6] "GSM861635" "GSM861636" "GSM861637" "GSM861638" ...
  .. ..$ : chr "gender"
 $ W     : num [1:6, 1:6] 0.532 0.59 0.665 0.617 0.775 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:6] "sample.1" "sample.2" "sample.3" "sample.4" ...
  .. ..$ : chr [1:6] "Gran" "CD8T" "CD4T" "Mono" ...
 $ Z     : num [1:6, 1:10000, 1:6] 0.912 0.922 0.919 0.923 0.919 ...
  ..- attr(*, "dimnames")=List of 3
  .. ..$ : chr [1:6] "Gran" "CD8T" "CD4T" "Mono" ...
  .. ..$ : chr [1:10000] "cg12426467" "cg02113055" "cg00079898" "cg04131969" ...
  .. ..$ : chr [1:6] "sample.1" "sample.2" "sample.3" "sample.4" ...
 $ params:List of 4
 

In [11]:
# Feature and source identifiers taken from the dimnames of the reference tensor.
feature.ids = dimnames(reinius$Z)[[2]]
source.ids  = dimnames(reinius$Z)[[1]]

# One entropy value per feature is required; without this check a shorter
# vector would be silently recycled into the matrix columns below.
# Assumes the entropies are stored in the same order as feature.ids - TODO confirm.
stopifnot(length(reinius$params$entropies) == length(feature.ids))

# Feature stratifications: one logical column per group, rows named by feature.
group.mat = matrix(FALSE, length(feature.ids), 3,
                   dimnames = list(feature.ids, c("low entropy", "high entropy", "all")))
# Split at the median entropy; "high entropy" is exactly the complement of "low entropy".
group.mat[,"low entropy"]  = reinius$params$entropies <= quantile(reinius$params$entropies, 0.5)
group.mat[,"high entropy"] = !group.mat[,"low entropy"]
group.mat[,"all"] = TRUE

In [12]:
# Preview the first rows only instead of dumping one row per feature.
head(group.mat, 10)

Unnamed: 0,low entropy,high entropy,all
cg12426467,TRUE,FALSE,TRUE
cg02113055,TRUE,FALSE,TRUE
cg00079898,TRUE,FALSE,TRUE
cg04131969,TRUE,FALSE,TRUE
cg06193597,TRUE,FALSE,TRUE
cg00944631,TRUE,FALSE,TRUE
cg08052546,FALSE,TRUE,TRUE
cg02872767,TRUE,FALSE,TRUE
cg06064954,TRUE,FALSE,TRUE
cg04245305,TRUE,FALSE,TRUE


In [13]:
# Number of features flagged TRUE in each stratification.
colSums(group.mat)

In [14]:
# Sanity check of the helper on the first source with the "all" stratification
# (column 3 of group.mat); preview the first rows only, since the result has
# one row per selected feature and sample.
head(calc_centered_Z_h(Z.ref = reinius$Z, Z.hat = base.mdl$Z.hat, h = 1, mask = group.mat[,3]), 10)

ref,hat
<dbl>,<dbl>
0.271177938,0.026226772
-0.508662647,-0.314427153
0.227830783,-0.013904993
0.414142792,0.193850603
-0.307367267,-0.243692806
0.255153543,0.085797822
0.191962010,0.193972850
-0.356465917,-0.361270155
0.207939917,-0.051927227
0.136694933,-0.025745972


In [15]:
# Score one model's estimated tensor against the reinius reference tensor on
# random feature subsets.
#   mdl:         model object carrying the estimated tensor in mdl$Z.hat
#   sample.itr:  number of random feature subsets
#   sample.size: number of features per subset
# calc_center_Z_hat_metrics_CI is called with its default seed and reseeds on
# entry, so every model is evaluated on the same feature subsets.
evaluate_mdl = function(mdl, sample.itr = 20, sample.size = 1000){
    calc_center_Z_hat_metrics_CI(Z.ref = reinius$Z,
                                 Z.hat = mdl$Z.hat,
                                 group.mat = group.mat, qtl = qtl,
                                 sample.itr = sample.itr, sample.size = sample.size)
}

# Attach the evaluation results to every model object.
base.mdl$evals       = evaluate_mdl(base.mdl)
cibersortx.mdl$evals = evaluate_mdl(cibersortx.mdl)
tca.mdl$evals        = evaluate_mdl(tca.mdl)
tcax.mdl$evals       = evaluate_mdl(tcax.mdl)
bMIND.mdl$evals      = evaluate_mdl(bMIND.mdl)


sample with replacement

sample with replacement

sample with replacement

sample with replacement

sample with replacement



### Robust correlation

In [16]:
# Base model: robust correlations (sources x stratifications) for the first feature subset.
base.mdl$evals$metrics.list[[1]]$center.Z.corrs

Unnamed: 0,low entropy,high entropy,all
Gran,0.76019858,0.23500075,0.516094
CD8T,0.20717848,-0.41158796,-0.1428788
CD4T,0.41196755,-0.1047564,0.1614813
Mono,0.4455769,0.03593704,0.2481545
NK,0.08059443,-0.7506502,-0.4152279
B,0.32684804,0.62888552,0.5008448


In [17]:
# CIBERSORTx model: robust correlations (sources x stratifications) for the first feature subset.
cibersortx.mdl$evals$metrics.list[[1]]$center.Z.corrs

Unnamed: 0,low entropy,high entropy,all
Gran,0.9038642,0.0,0.8359301
CD8T,0.0,0.2803589,0.0
CD4T,0.0,0.0,0.0
Mono,0.0,0.0,0.0
NK,0.0,0.2305064,0.0
B,0.0,0.0,0.0


In [18]:
# TCA model: robust correlations (sources x stratifications) for the first feature subset.
tca.mdl$evals$metrics.list[[1]]$center.Z.corrs

Unnamed: 0,low entropy,high entropy,all
Gran,0.878991,0.4228022,0.8236695
CD8T,0.3843924,0.1583904,0.2643615
CD4T,0.6544522,0.175974,0.522266
Mono,0.7288146,0.3199096,0.657943
NK,0.3487941,0.1986003,0.2702522
B,0.4550442,0.1438142,0.2884919


In [19]:
# TCAx model: robust correlations (sources x stratifications) for the first feature subset.
tcax.mdl$evals$metrics.list[[1]]$center.Z.corrs

Unnamed: 0,low entropy,high entropy,all
Gran,0.9374126,0.4192938,0.861867
CD8T,0.713099,0.3554526,0.5139875
CD4T,0.8285181,0.313515,0.67773
Mono,0.8468365,0.3520747,0.7533118
NK,0.7296215,0.3411895,0.5111247
B,0.6286313,0.1533932,0.4117834


In [20]:
# bMIND model: robust correlations (sources x stratifications) for the first feature subset.
bMIND.mdl$evals$metrics.list[[1]]$center.Z.corrs

Unnamed: 0,low entropy,high entropy,all
Gran,0.9379698,0.42983863,0.8636908
CD8T,0.6835047,0.25142148,0.4541241
CD4T,0.8085409,0.21996693,0.6185613
Mono,0.8013404,0.3201776,0.6845437
NK,0.5800532,0.33416108,0.4533417
B,0.67885,0.06778337,0.3905664


### Root Median Square Error

In [21]:
# Base model: root median square error (sources x stratifications) for the first feature subset.
base.mdl$evals$metrics.list[[1]]$center.Z.RMedS

Unnamed: 0,low entropy,high entropy,all
Gran,0.03441556,0.03355952,0.03415231
CD8T,0.05430856,0.06051313,0.05707592
CD4T,0.04746417,0.03592654,0.041513
Mono,0.05158839,0.0200702,0.03032501
NK,0.06128353,0.0900206,0.07408015
B,0.06124018,0.05733437,0.05927788


In [22]:
# CIBERSORTx model: root median square error (sources x stratifications) for the first feature subset.
cibersortx.mdl$evals$metrics.list[[1]]$center.Z.RMedS

Unnamed: 0,low entropy,high entropy,all
Gran,0.02526324,0.01746002,0.02082772
CD8T,0.05637272,0.05277937,0.05493801
CD4T,0.0491014,0.03278678,0.03976024
Mono,0.05404574,0.01718198,0.02943462
NK,0.06393517,0.07554857,0.06969093
B,0.06216924,0.06592536,0.06371962


In [23]:
# TCA model: root median square error (sources x stratifications) for the first feature subset.
tca.mdl$evals$metrics.list[[1]]$center.Z.RMedS

Unnamed: 0,low entropy,high entropy,all
Gran,0.02593704,0.01691101,0.02110621
CD8T,0.05047761,0.05540494,0.0527173
CD4T,0.03625992,0.03352123,0.03486043
Mono,0.04875578,0.02057682,0.03059692
NK,0.05897169,0.07698768,0.06694547
B,0.05446998,0.0657897,0.05962963


In [24]:
# TCAx model: root median square error (sources x stratifications) for the first feature subset.
tcax.mdl$evals$metrics.list[[1]]$center.Z.RMedS

Unnamed: 0,low entropy,high entropy,all
Gran,0.0236591,0.02353431,0.02362255
CD8T,0.03554823,0.05303885,0.04271923
CD4T,0.02855008,0.03181594,0.03006162
Mono,0.03203767,0.01775542,0.02363156
NK,0.04091652,0.07491251,0.05640449
B,0.04716521,0.06333654,0.05422811


In [25]:
# bMIND model: root median square error (sources x stratifications) for the first feature subset.
bMIND.mdl$evals$metrics.list[[1]]$center.Z.RMedS

Unnamed: 0,low entropy,high entropy,all
Gran,0.02101778,0.02342611,0.02207205
CD8T,0.04528549,0.05350715,0.04881377
CD4T,0.03631643,0.03203074,0.03427631
Mono,0.04406761,0.01678675,0.02599103
NK,0.05629538,0.07618024,0.06546247
B,0.05491623,0.06402816,0.05944935


# Save

In [26]:
# Write the model objects (now carrying $evals) back to res.dir.
# These are the same file names the models were loaded from, so the input
# files are overwritten.
saveRDS(base.mdl,       file.path(res.dir, "base.mdl.rds"))
saveRDS(cibersortx.mdl, file.path(res.dir, "cibersortx.mdl.rds"))
saveRDS(tca.mdl,        file.path(res.dir, "tca.mdl.rds"))
saveRDS(tcax.mdl,       file.path(res.dir, paste0("tcax.mdl.", pen.config, ".rds")))
saveRDS(bMIND.mdl,      file.path(res.dir, "bMIND.mdl.rds"))