In [1]:
# Load project helper scripts into the session. The functions used later in
# this notebook (e.g. simulate_expression_mixture, fwrite) are not defined in
# the notebook itself, so they are presumably provided by these scripts or the
# packages they attach -- verify against the helper files.
source("analysis.utils.r")
source("simulate.expression.utils.r")
# Fix the RNG seed for this session.
set.seed(2023)

“package ‘MASS’ was built under R version 4.1.3”
Welcome to compositions, a package for compositional data analysis.
Find an intro with "? compositions"



Attaching package: ‘compositions’


The following objects are masked from ‘package:stats’:

    anova, cor, cov, dist, var


The following objects are masked from ‘package:base’:

    %*%, norm, scale, scale.default




# Parameters

In [2]:
# Simulation configuration. Exactly one preset is active at a time; the
# commented-out presets are alternative configurations kept for convenience.
#
# partial.W   : selects the "partial.W" vs "all.W" pseudobulk input file.
# source.col  : which deconvolution column the pseudobulk file was built from.
# k, n        : forwarded as `k` and `n` to simulate_expression_mixture
#               (presumably number of sources and number of samples --
#               confirm against simulate.expression.utils.r).
# feature.set : which feature list of the pseudobulk object is used.

# partial.W = FALSE # TRUE or FALSE
# source.col = "decon.L1" # "decon.L1" or "decon.L2"
# k = 5
# feature.set = "HEF.10k" # "HVF.10k" or "HEF.10k"
# n = 500

# partial.W = FALSE # TRUE or FALSE
# source.col = "decon.L2" # "decon.L1" or "decon.L2"
# k = 7
# feature.set = "HEF.10k" # "HVF.10k" or "HEF.10k"
# n = 500

# partial.W = FALSE # TRUE or FALSE
# source.col = "decon.L1" # "decon.L1" or "decon.L2"
# k = 5
# feature.set = "HEF.10k" # "HVF.10k" or "HEF.10k"
# n = 250

# Active preset. TRUE/FALSE are spelled out because T and F are ordinary
# variables that can be reassigned.
partial.W = FALSE # TRUE or FALSE
source.col = "decon.L1" # "decon.L1" or "decon.L2"
k = 5
feature.set = "HEF.10k" # "HVF.10k" or "HEF.10k"
n = 100

In [3]:
# Secondary simulation options. All of these are forwarded unchanged to
# simulate_expression_mixture and are also encoded in the results folder name.
max_sds = 2
#max_sds = 3

# Forwarded as `m`; the run log reports "sampling 600 random genes".
m = 600
fit.dirichlet = FALSE

add.noise = TRUE
# don't do additional filtering
variable_thr = 10^(-4)
fill_thr = 10^(-4)
expression.qtl = 0

enrich.low = FALSE
entropy.thr.ratio = 0
enrich.ratio = 0

scale.max_sds = Inf
scale.factor.thr = 10^(-4)

In [4]:
# Root directory that holds the pseudobulk input and receives the results.
res.dir = "../Data/RNA/Simulation-PBMC/"
# How many simulated datasets to generate.
n.copies = 20

## Directory

In [5]:
# Encode every simulation setting into the results folder name so that each
# configuration writes to its own directory. Logical flags are shortened to
# the first character of their printed value ("T" or "F").
res.folder = paste(
    "sc-Stephenson",
    if (partial.W) "partial.W" else "all.W",
    source.col, feature.set,
    "k", k, "m", m, "n", n,
    "dirichlet", substr(as.character(fit.dirichlet), 1, 1),
    "noiseZ", substr(as.character(add.noise), 1, 1),
    "varThr", variable_thr, "filThr", fill_thr, "expQtl", expression.qtl,
    "enrich", substr(as.character(enrich.low), 1, 1),
    "etpRat", entropy.thr.ratio, "enrichRat", enrich.ratio,
    "maxSds", max_sds, "scale.maxSds", scale.max_sds,
    "scale.factor.thr", scale.factor.thr,
    sep = "_"
)
res.folder = file.path(res.dir, res.folder)

# dir.exists() instead of file.exists(): a regular file with the same name
# must not be mistaken for an existing output directory. Fail immediately if
# the directory cannot be created rather than at the first write.
if (!dir.exists(res.folder) && !dir.create(res.folder, recursive = TRUE)) {
    stop("could not create results folder: ", res.folder, call. = FALSE)
}

In [6]:
# Display the resolved results folder path.
res.folder

# Load Data

In [7]:
# Read the pseudobulk object that matches the chosen source column and W mode.
# The file name is assembled inside local() so no helper variables are left
# behind in the global environment.
pbmc.pb = local({
    w.label = if (partial.W) "partial.W." else "all.W."
    rds.name = paste0("pbmc.pseudobulk.", source.col, ".", w.label, "rds")
    readRDS(file.path(res.dir, rds.name))
})

# Pull out the two components used as simulation inputs.
W.src = pbmc.pb$W
Z.src = pbmc.pb$Z

In [8]:
# Count the feature ids that appear in both the HEF and HVF lists of pbmc.pb.
# NOTE(review): HEF/HVF presumably mean highly expressed / highly variable
# features -- confirm against the script that built the pseudobulk object.
length(intersect(pbmc.pb$HEF, pbmc.pb$HVF))

In [9]:
# Row means of params$mus for the first few (head) entries of the HVF list.
rowMeans(pbmc.pb$params$mus[head(pbmc.pb$HVF),])

In [10]:
# Row means of params$mus for the first few (head) entries of the HEF list.
rowMeans(pbmc.pb$params$mus[head(pbmc.pb$HEF),])

In [11]:
# Display W.src (taken from pbmc.pb$W). In the printed output, rows are sample
# ids and columns are cell types.
W.src

Unnamed: 0,CD8,CD4,Mono,B,NK
MH9179824,0.13006897,0.34496552,0.26606897,0.1342068966,0.12468966
newcastle65,0.25872300,0.49133634,0.07548066,0.0291953477,0.14526466
MH9143327,0.11556498,0.06833408,0.43501563,0.0576150067,0.32347030
MH9143326,0.10199488,0.05370844,0.67601023,0.0002046036,0.16808184
MH9143325,0.13439898,0.13273657,0.26700767,0.1448849105,0.32097187
MH9143320,0.05329670,0.05970696,0.21593407,0.1424908425,0.52857143
MH9143276,0.17005961,0.28646946,0.34349342,0.0678213924,0.13215611
MH9143274,0.17603834,0.47156550,0.26805112,0.0250798722,0.05926518
MH8919226,0.00000000,0.00000000,0.44941176,0.1364705882,0.41411765
MH9179821,0.07603041,0.42577031,0.07843137,0.0754301721,0.34433774


In [12]:
# Restrict the second dimension of Z.src to the feature list named by
# feature.set. An unrecognised value is an error: previously anything other
# than "HVF.10k" (including typos) silently selected the HEF list.
Z.src = Z.src[, switch(feature.set,
                       "HVF.10k" = pbmc.pb$HVF,
                       "HEF.10k" = pbmc.pb$HEF,
                       stop("unknown feature.set: '", feature.set,
                            "' (expected \"HVF.10k\" or \"HEF.10k\")",
                            call. = FALSE)), ]

In [13]:
# Peek at the first names along the second dimension of Z.src.
head(dimnames(Z.src)[[2]])

# Simulate Data

In [14]:
# Generate n.copies simulated datasets. The copy index is passed as seed.num
# (presumably used to seed each copy -- confirm in
# simulate.expression.utils.r). X and W of every copy are written as
# tab-separated text files, and all copies are saved together as one RDS file.
#
# The list is preallocated, and seq_len() is used so that n.copies == 0 runs
# zero iterations (1:0 would iterate over c(1, 0)).
sim.data.list = vector("list", n.copies)
for (t in seq_len(n.copies)) {
    sim.data = simulate_expression_mixture(
        Z.src = Z.src, W.src = W.src,
        k = k, m = m, n = n,
        fit.dirichlet = fit.dirichlet,
        add.noise = add.noise, variable_thr = variable_thr,
        fill_thr = fill_thr, expression.qtl = expression.qtl,
        enrich.low = enrich.low, entropy.thr.ratio = entropy.thr.ratio,
        enrich.ratio = enrich.ratio,
        max_sds = max_sds, scale.max_sds = scale.max_sds,
        scale.factor.thr = scale.factor.thr, seed.num = t
    )

    # save files for cibersortx to read
    fwrite(as.data.frame(sim.data$X),
           file = file.path(res.folder, paste0("X.txt.", t)),
           sep = "\t", quote = FALSE, row.names = TRUE, col.names = TRUE)

    fwrite(as.data.frame(sim.data$W),
           file = file.path(res.folder, paste0("W.txt.", t)),
           sep = "\t", quote = FALSE, row.names = TRUE, col.names = TRUE)

    # add to list
    sim.data.list[[t]] = sim.data
}
saveRDS(sim.data.list, file.path(res.folder, "sim.data.list.rds"))

[1] "rename feature.ids to replace - or / with . "
[1] "simulate W"
[1] "sampling in the more aboundant celltypes (>5%)"
[1] "n is smaller than the number of individuals in W"
[1] "draw without replacement"
[1] "extract sources selected in W, only kept those in Z"
[1] "simulate Z"
[1] "extracting top quantile :0 features, based on sum of expression in all sources, all samples"
[1] "sampling 600 random genes no enrichment on low entropy"
[1] "n is smaller than the number of individuals in Z"
[1] "draw without replacement"
[1] "generate X"
[1] "recording parameters used for reproducibility"
[1] "put everything together"
[1] "normalization"
[1] "rename feature.ids to replace - or / with . "
[1] "simulate W"
[1] "sampling in the more aboundant celltypes (>5%)"
[1] "n is smaller than the number of individuals in W"
[1] "draw without replacement"
[1] "extract sources selected in W, only kept those in Z"
[1] "simulate Z"
[1] "extracting top quantile :0 features, based on sum of expression in 

In [15]:
# Inspect variable.feature.source of the last simulated copy (sim.data still
# holds the result of the final loop iteration).
sim.data$variable.feature.source

Unnamed: 0,CD4,NK,CD8,Mono,B
SLC35A5,TRUE,TRUE,TRUE,TRUE,TRUE
NKTR,TRUE,TRUE,TRUE,TRUE,TRUE
CLPP,TRUE,TRUE,TRUE,TRUE,TRUE
PLSCR3,TRUE,TRUE,TRUE,TRUE,TRUE
EIF2B3,TRUE,TRUE,TRUE,TRUE,TRUE
CHFR,TRUE,TRUE,TRUE,TRUE,TRUE
RUVBL1,TRUE,TRUE,TRUE,TRUE,TRUE
GSKIP,TRUE,TRUE,TRUE,TRUE,TRUE
COX8A,TRUE,TRUE,TRUE,TRUE,TRUE
RPL29,TRUE,TRUE,TRUE,TRUE,TRUE
