In [1]:
# Packages attached directly by this notebook.
# data.table supplies fread() and copy(), both used when loading the CIBERSORTx output.
library("data.table")
# NOTE(review): the earlier inline note here said "colMeans", but colMeans() is base R.
# No matrixStats function is called directly in the cells visible here, so this is
# presumably needed by the sourced helper scripts -- confirm before removing.
library("matrixStats")

# Project helper scripts. Functions called later that are not base R or data.table
# are expected to come from these two files.
# NOTE(review): the startup messages printed below show that 'compositions' gets attached
# and masks base/stats functions such as %*%, scale, cor, cov and var -- presumably via
# these scripts; keep that in mind when calling those functions in this notebook.
source("analysis.utils.r")
source("simulate.expression.utils.r")

# Fix the RNG seed so that any stochastic step is reproducible.
set.seed(2023)

“package ‘MASS’ was built under R version 4.1.3”
Welcome to compositions, a package for compositional data analysis.
Find an intro with "? compositions"



Attaching package: ‘compositions’


The following objects are masked from ‘package:stats’:

    anova, cor, cov, dist, var


The following objects are masked from ‘package:base’:

    %*%, norm, scale, scale.default




In [2]:
# Simulation configuration evaluated by this run.
#   source.col : selects the simulation directory ("decon.L1" or "decon.L2")
#   n          : the n_<n> token of the simulation directory name
#                (matches the 100 sample.ids shown in the str() output below)
# Other configurations kept from earlier runs:
#   ("decon.L1", n = 500), ("decon.L2", n = 500), ("decon.L1", n = 250)
source.col = "decon.L1"
n = 100

In [3]:
# Value forwarded as `max_sds` to the helper functions below; it is also encoded in the
# simulation directory name (maxSds_<max_stds>).
max_stds = 2
project.dir = "/u/home/j/johnsonc/project-halperin/TCAx/TCAx2023/"

# k_<k> token of the simulation directory name for each supported source.col
# (4 matches the four source.ids listed for decon.L1 in the str() output below).
k.by.source.col = c("decon.L1" = 4, "decon.L2" = 6)
if (!(source.col %in% names(k.by.source.col))){
    # Previously any unrecognised value silently fell through to the decon.L2 directory.
    stop("Unsupported source.col: ", source.col,
         " (expected one of: ", paste(names(k.by.source.col), collapse = ", "), ")")
}

# Path of the data dir relative to project.dir.
data.dir = paste0("Data/RNA/Simulation-Lung/sc-HLCA_all.W_", source.col,
                  "_HEF.10k_k_", k.by.source.col[[source.col]],
                  "_m_600_n_", n,
                  "_dirichlet_F_noiseZ_T_varThr_1e-04_filThr_1e-04_expQtl_0_enrich_F_etpRat_0_enrichRat_0_maxSds_", max_stds,
                  "_scale.maxSds_Inf_scale.factor.thr_1e-04/")


# Indices into sim.data.list of the simulation replicates to process.
ts = 1:20
# Arguments forwarded to get_moment_corrs() and calc_Z_corrs().
# TRUE rather than T: T is an ordinary variable that can be reassigned.
robust = TRUE
qtl = 0.95

In [4]:
# Resolve data.dir against project.dir. The guard keeps this cell idempotent:
# without it, re-running the cell prepends project.dir a second time.
if (!startsWith(data.dir, project.dir)){
    data.dir = file.path(project.dir, data.dir)
}

# Mirror the last three path components of data.dir under <project.dir>/Result/.
data.name = strsplit(data.dir, "/")[[1]]
res.dir = file.path(project.dir,
                    paste(c("Result", tail(data.name, 3)), collapse = "/"))

# dir.exists() rather than file.exists(): only an existing *directory* should skip creation.
if (!dir.exists(res.dir)){dir.create(res.dir, recursive = TRUE)}
print(project.dir)
print(data.dir)
print(res.dir)

[1] "/u/home/j/johnsonc/project-halperin/TCAx/TCAx2023/"
[1] "/u/home/j/johnsonc/project-halperin/TCAx/TCAx2023//Data/RNA/Simulation-Lung/sc-HLCA_all.W_decon.L1_HEF.10k_k_4_m_600_n_100_dirichlet_F_noiseZ_T_varThr_1e-04_filThr_1e-04_expQtl_0_enrich_F_etpRat_0_enrichRat_0_maxSds_2_scale.maxSds_Inf_scale.factor.thr_1e-04/"
[1] "/u/home/j/johnsonc/project-halperin/TCAx/TCAx2023//Result/RNA/Simulation-Lung/sc-HLCA_all.W_decon.L1_HEF.10k_k_4_m_600_n_100_dirichlet_F_noiseZ_T_varThr_1e-04_filThr_1e-04_expQtl_0_enrich_F_etpRat_0_enrichRat_0_maxSds_2_scale.maxSds_Inf_scale.factor.thr_1e-04"


# Read simulation data

In [5]:
# Load the simulated datasets from data.dir; the str() output below shows a list
# with one element per replicate (20 in this run).
sim.data.list = readRDS(file.path(data.dir, "sim.data.list.rds"))

In [6]:
# Inspect the structure of the loaded simulations (fields and array dimensions).
str(sim.data.list)

List of 20
 $ :List of 13
  ..$ source.ids             : chr [1:4] "Immune" "Epithelial" "Endothelial" "Stroma"
  ..$ feature.ids            : chr [1:600] "ENSG00000184990" "ENSG00000148053" "ENSG00000204713" "ENSG00000157077" ...
  ..$ sample.ids             : chr [1:100] "sample.1" "sample.2" "sample.3" "sample.4" ...
  ..$ W                      : num [1:100, 1:4] 0.59568 0.23242 0.00899 0.67114 0.29463 ...
  .. ..- attr(*, "dimnames")=List of 2
  .. .. ..$ : chr [1:100] "sample.1" "sample.2" "sample.3" "sample.4" ...
  .. .. ..$ : chr [1:4] "Immune" "Epithelial" "Endothelial" "Stroma"
  ..$ Z                      : num [1:4, 1:600, 1:100] 41.3 24 56.8 0 0 ...
  .. ..- attr(*, "dimnames")=List of 3
  .. .. ..$ : chr [1:4] "Immune" "Epithelial" "Endothelial" "Stroma"
  .. .. ..$ : chr [1:600] "ENSG00000184990" "ENSG00000148053" "ENSG00000204713" "ENSG00000157077" ...
  .. .. ..$ : chr [1:100] "sample.1" "sample.2" "sample.3" "sample.4" ...
  ..$ X                      : num [1:600, 1

# Load CIBERSORTx

In [7]:
# For every replicate in ts: read the per-source CIBERSORTx high-resolution estimates
# from res.dir into one array shaped like the simulated Z, rescale it by
# 1 / feature.scale.factor, and derive the parameters used for evaluation.
# The resulting list is saved to <res.dir>/cibersortx.mdl.list.rds.
cibersortx.mdl.list = list()
for (t in ts){
    sim.data = sim.data.list[[t]]
    cibersortx.mdl = list()

    print("read in cibersortx's estimated Z")
    # Start from a zero-filled array with the same dims/dimnames as the simulated Z.
    cibersortx.mdl$Z.hat.orig = copy(sim.data$Z)
    cibersortx.mdl$Z.hat.orig[,,] = 0
    # Expected shape of one per-source slice (2nd and 3rd dimension of Z).
    slice.dim = dim(cibersortx.mdl$Z.hat.orig)[2:3]

    for (source.id in dimnames(cibersortx.mdl$Z.hat.orig)[[1]]){
        # File name: source id with spaces removed, window = 4 * ncol(W), replicate index as suffix.
        Z.hat.file = file.path(res.dir, paste0("CIBERSORTxHiRes_NA_", gsub("\\ ", "", source.id) , "_Window",round(4 * ncol(sim.data$W)), ".txt.", t))
        # First column of the file becomes the row names; the rest is the numeric matrix.
        Z.hat.source = as.matrix(data.frame(fread(Z.hat.file), row.names=1))

        # Array sub-assignment recycles values whose length divides the target length and
        # accepts a transposed matrix of equal length, so a malformed file would be
        # stored silently. Fail loudly instead.
        if (!identical(dim(Z.hat.source), slice.dim)){
            stop("Unexpected dimensions in ", Z.hat.file, ": got ",
                 paste(dim(Z.hat.source), collapse = " x "), ", expected ",
                 paste(slice.dim, collapse = " x "))
        }
        # NOTE(review): rows/columns are matched by position, not by name -- this assumes the
        # file lists features and samples in the same order as dimnames(Z); confirm.
        cibersortx.mdl$Z.hat.orig[source.id,,] = Z.hat.source
    }

    # recon evaluation scale
    cibersortx.mdl$Z.hat.eval = scale_source_feature_sample(cibersortx.mdl$Z.hat.orig, 1/sim.data$feature.scale.factor)
    cibersortx.mdl$params.recon.eval = calc_params_from_Z(cibersortx.mdl$Z.hat.eval, max_sds = max_stds)

    cibersortx.mdl.list[[t]] = cibersortx.mdl
}

saveRDS(cibersortx.mdl.list, file.path(res.dir, "cibersortx.mdl.list.rds"))

[1] "read in cibersortx's estimated Z"
[1] "read in cibersortx's estimated Z"
[1] "read in cibersortx's estimated Z"
[1] "read in cibersortx's estimated Z"
[1] "read in cibersortx's estimated Z"
[1] "read in cibersortx's estimated Z"
[1] "read in cibersortx's estimated Z"
[1] "read in cibersortx's estimated Z"
[1] "read in cibersortx's estimated Z"
[1] "read in cibersortx's estimated Z"
[1] "read in cibersortx's estimated Z"
[1] "read in cibersortx's estimated Z"
[1] "read in cibersortx's estimated Z"
[1] "read in cibersortx's estimated Z"
[1] "read in cibersortx's estimated Z"
[1] "read in cibersortx's estimated Z"
[1] "read in cibersortx's estimated Z"
[1] "read in cibersortx's estimated Z"
[1] "read in cibersortx's estimated Z"
[1] "read in cibersortx's estimated Z"


# Eval CIBERSORTx

In [8]:
# For each replicate, compare the parameters derived from the CIBERSORTx reconstruction
# with the simulation's params.scale and store the result as moment.recon.corrs.
for (t in ts){
    true.params = sim.data.list[[t]]$params.scale
    recon.params = cibersortx.mdl.list[[t]]$params.recon.eval
    cibersortx.mdl.list[[t]]$moment.recon.corrs = get_moment_corrs(params.true = true.params,
                                                                   params.hat = recon.params,
                                                                   robust, qtl)
}

In [9]:
# Spot-check the moment correlations of the first replicate.
cibersortx.mdl.list[[1]]$moment.recon.corrs

Unnamed: 0,Immune,Epithelial,Endothelial,Stroma
mus.rob.corrs,0.9665854,0.9217231,0.6897649,0.6651073

Unnamed: 0,Immune,Epithelial,Endothelial,Stroma
var.rob.corrs,0.8027022,0.6233212,0,0

Unnamed: 0,Immune-Epithelial,Immune-Endothelial,Immune-Stroma,Epithelial-Endothelial,Epithelial-Stroma,Endothelial-Stroma
covar.rob.corrs,0.4645894,0,0,0,0,0


In [10]:
start = Sys.time()

# Per replicate: decide which (feature, source) pairs are evaluated, then correlate the
# true (Z.scale) and estimated (Z.hat.eval) values over those pairs.
# We do not want to compare correlations for gene-celltype pairs that are not really
# active and only carry injected noise.
for (t in ts){
    Z.scale = sim.data.list[[t]]$Z.scale
    eval.feature.source = calc_variable_feature_source(Z = Z.scale, variable_thr = 0.1, max_sds = max_stds)

    # No gene-celltype pair with injected noise may enter the evaluation, so exclude them explicitly.
    # Edge case: the original gene is not expressed in any celltype (after outliers are removed),
    # so all celltypes are noise; after scaling, the injected noise can pass the
    # eval.feature.source criterion.
    #
    # The mask is applied to the local copy *before* it is stored and passed on.
    # Previously only the copy inside sim.data.list was masked, while calc_Z_corrs()
    # still received the unmasked matrix, so the exclusion never reached the evaluation.
    eval.feature.source[!sim.data.list[[t]]$variable.feature.source] = FALSE
    assert(!any(eval.feature.source[!sim.data.list[[t]]$variable.feature.source]))
    sim.data.list[[t]]$eval.feature.source = eval.feature.source

    cibersortx.mdl.list[[t]]$Z.corrs = calc_Z_corrs(Z.true = Z.scale,
                                                    Z.hat = cibersortx.mdl.list[[t]]$Z.hat.eval,
                                                    eval.feature.source = eval.feature.source,
                                                    robust = robust, qtl = qtl)

}

end = Sys.time()
print(end - start)


Time difference of 2.348415 mins


In [11]:
# Structure of the full result list (all replicates); the output is long.
str(cibersortx.mdl.list)

List of 20
 $ :List of 5
  ..$ Z.hat.orig        : num [1:4, 1:600, 1:100] 62.1 129.9 93.6 120.3 1 ...
  .. ..- attr(*, "dimnames")=List of 3
  .. .. ..$ : chr [1:4] "Immune" "Epithelial" "Endothelial" "Stroma"
  .. .. ..$ : chr [1:600] "ENSG00000184990" "ENSG00000148053" "ENSG00000204713" "ENSG00000157077" ...
  .. .. ..$ : chr [1:100] "sample.1" "sample.2" "sample.3" "sample.4" ...
  ..$ Z.hat.eval        : num [1:4, 1:600, 1:100] 1.72 3.598 2.593 3.333 0.102 ...
  .. ..- attr(*, "dimnames")=List of 3
  .. .. ..$ : chr [1:4] "Immune" "Epithelial" "Endothelial" "Stroma"
  .. .. ..$ : chr [1:600] "ENSG00000184990" "ENSG00000148053" "ENSG00000204713" "ENSG00000157077" ...
  .. .. ..$ : chr [1:100] "sample.1" "sample.2" "sample.3" "sample.4" ...
  ..$ params.recon.eval :List of 4
  .. ..$ sigmas   : num [1:600, 1:4, 1:4] 0.661 0 0.167 0 0.105 ...
  .. .. ..- attr(*, "dimnames")=List of 3
  .. .. .. ..$ : chr [1:600] "ENSG00000184990" "ENSG00000148053" "ENSG00000204713" "ENSG00000157077" 

In [12]:
# Structure of the first replicate's result only.
str(cibersortx.mdl.list[[1]])

List of 5
 $ Z.hat.orig        : num [1:4, 1:600, 1:100] 62.1 129.9 93.6 120.3 1 ...
  ..- attr(*, "dimnames")=List of 3
  .. ..$ : chr [1:4] "Immune" "Epithelial" "Endothelial" "Stroma"
  .. ..$ : chr [1:600] "ENSG00000184990" "ENSG00000148053" "ENSG00000204713" "ENSG00000157077" ...
  .. ..$ : chr [1:100] "sample.1" "sample.2" "sample.3" "sample.4" ...
 $ Z.hat.eval        : num [1:4, 1:600, 1:100] 1.72 3.598 2.593 3.333 0.102 ...
  ..- attr(*, "dimnames")=List of 3
  .. ..$ : chr [1:4] "Immune" "Epithelial" "Endothelial" "Stroma"
  .. ..$ : chr [1:600] "ENSG00000184990" "ENSG00000148053" "ENSG00000204713" "ENSG00000157077" ...
  .. ..$ : chr [1:100] "sample.1" "sample.2" "sample.3" "sample.4" ...
 $ params.recon.eval :List of 4
  ..$ sigmas   : num [1:600, 1:4, 1:4] 0.661 0 0.167 0 0.105 ...
  .. ..- attr(*, "dimnames")=List of 3
  .. .. ..$ : chr [1:600] "ENSG00000184990" "ENSG00000148053" "ENSG00000204713" "ENSG00000157077" ...
  .. .. ..$ : chr [1:4] "Immune" "Epithelial" "Endoth

In [13]:
# Spot-check the feature-by-source Z correlations of the first replicate.
cibersortx.mdl.list[[1]]$Z.corrs

Unnamed: 0,Immune,Epithelial,Endothelial,Stroma
ENSG00000184990,0.7692942,0.0000000,-0.3584006,0.000000000
ENSG00000148053,,,0.5543945,0.000000000
ENSG00000204713,0.8113671,0.5873351,0.2085022,0.000000000
ENSG00000157077,,0.7226809,0.3290675,-0.008554994
ENSG00000007341,0.8091364,0.5818500,0.0000000,0.307925382
ENSG00000178585,0.5214077,0.6492574,0.0000000,0.070237837
ENSG00000147383,,0.6843271,0.0000000,0.000000000
ENSG00000136930,0.8941779,0.0000000,0.0000000,0.000000000
ENSG00000139631,0.9203438,0.8759252,0.0000000,0.000000000
ENSG00000153446,,0.5480260,,0.000000000


# Save

In [14]:
# Overwrite the file written after loading so that the saved object also contains the
# moment.recon.corrs and Z.corrs entries added by the evaluation cells.
saveRDS(cibersortx.mdl.list, file.path(res.dir, "cibersortx.mdl.list.rds"))