In [34]:
# This notebook takes
# res.dir: the path to the directory that contains the cleaned PBMC single-cell Seurat object (stephenson.clean.seu)
# and constructs the raw cell-type-level pseudobulk material needed for simulation

# if partial.W is T
# it filters out samples with low cell counts or imbalanced cell-type proportions
# and constructs the raw cell-type-level pseudobulk material needed for simulation
# W: remove samples with 1000 cells or fewer (around 10 percent)
#    remove samples with any cell type below 1%

# if partial.W is F
# W: use all samples' W 
# Z: calculated as the AverageExpression on the CPM. (the average is taken in the un-logged space)
# params: $mus, $sigmas, $corrs: calculated based on Z with samples outside max_sds = 3 removed 
# entropy: calculated based on params$corrs

# save the final object with Z, W, params, entropy, HVF and HEF as list elements
# to pbmc.pseudobulk.<source.col>.<partial.W|all.W>.rds in the res.dir

In [45]:
library("Seurat")
library("pracma")
library("ggplot2")
library("ggpubr")
source("analysis.utils.r")
source("simulate.expression.utils.r")
set.seed(2023)

In [46]:
# Run configuration.
# partial.W: when TRUE, samples with few cells or a near-absent cell type are
#            dropped before W is finalized; when FALSE every sample is kept.
# Written as FALSE rather than F: F is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved constant.
partial.W = FALSE

# Metadata column holding the cell-type (source) label; "decon.L1" is the
# alternative labelling that can be swapped in.
#source.col  = "decon.L1"
source.col  = "decon.L2"
# Metadata column holding the sample identifier.
sample.col = "sample_id"

# Directory the input Seurat object is read from and the result is written to.
res.dir = "../Data/RNA/Simulation-PBMC"

# Load Data

In [37]:
# Read the cleaned Seurat object stored under res.dir.
stephenson = readRDS(file = file.path(res.dir, "stephenson.clean.seu"))

# Work on the "raw" assay and normalize with a scale factor of 1e6.
# LogNormalize stores log1p of the per-million-scaled counts, so the "data"
# slot holds log1p(CPM) rather than CPM itself.
DefaultAssay(stephenson) <- "raw"
stephenson = NormalizeData(object = stephenson,
                           normalization.method = "LogNormalize",
                           scale.factor = 1e6)

In [38]:
# Display the summary of the loaded Seurat object.
stephenson

An object of class Seurat 
45612 features across 499336 samples within 2 assays 
Active assay: raw (22806 features, 2000 variable features)
 1 other assay present: RNA
 5 dimensional reductions calculated: pca, pca_harmony, umap, decon_pca, decon_umap

# Generate W

In [47]:
# Distinct sample and source (cell-type) labels found in the cell metadata.
sample.ids = unique(stephenson@meta.data[[sample.col]])
source.ids = unique(stephenson@meta.data[[source.col]])

# Report how many of each were found.
print(paste0("total number of sample: ", length(sample.ids)))
print(paste0("total number of source: ", length(source.ids)))

[1] "total number of sample: 118"
[1] "total number of source: 7"


In [48]:
# Count the cells of every (sample, source) pair in one pass over the metadata
# instead of re-scanning both columns once per pair.
# Fixing the factor levels to sample.ids / source.ids keeps the row and column
# order identical to those vectors and keeps pairs that have zero cells.
W.counts = table(factor(stephenson@meta.data[, sample.col], levels = as.character(sample.ids)),
                 factor(stephenson@meta.data[, source.col], levels = as.character(source.ids)))

# Store the counts as a plain numeric matrix: samples in rows, sources in columns.
W.counts = matrix(as.numeric(W.counts),
                  nrow = length(sample.ids), ncol = length(source.ids),
                  dimnames = list(as.character(sample.ids), as.character(source.ids)))

# Per-sample proportions. Dividing a matrix by a vector of length nrow recycles
# down the columns, so each row is divided by its own total.
W.prop = W.counts / rowSums(W.counts)

# Filter W on sample (low cell counts and no proportion in one celltype)

In [49]:
if (partial.W){
    # keep samples with more than 1000 cells (will remove about 10% of the samples)
    keep.samples.counts = rowSums(W.counts) > 1000
    # keep samples in which no cell type falls below 1% of the cells
    keep.samples.prop = rowSums(W.prop < 0.01) == 0

    # apply both criteria; drop = FALSE keeps the matrix shape (and dimnames)
    # even when only a single sample survives the filter
    keep.samples = keep.samples.counts & keep.samples.prop
    W.counts = W.counts[keep.samples, , drop = FALSE]
    W.prop   = W.prop[keep.samples, , drop = FALSE]

    # order the sources by decreasing mean proportion, then refresh
    # sample.ids / source.ids so they match the filtered W.prop
    W.prop = W.prop[, order(-colMeans(W.prop)), drop = FALSE]
    sample.ids  = rownames(W.prop)
    source.ids  = colnames(W.prop)
}
# W is the proportion matrix used from here on (filtered only when partial.W is TRUE)
W = W.prop

In [50]:
# Show the cell counts per sample (rows) and source (columns).
W.counts

Unnamed: 0,CD8,CD4,Mono.CD14,B,NK,Mono.CD16,Plasma
MH9179824,943,2501,1730,916,904,199,57
newcastle65,1090,2070,284,119,612,34,4
MH9143327,1035,612,3510,457,2897,386,59
MH9143326,997,525,5384,0,1643,1224,2
MH9143325,1051,1038,1957,1039,2510,131,94
MH9143320,291,326,1135,646,2886,44,132
MH9143276,1512,2547,2578,542,1175,476,61
MH9143274,1102,2952,1544,142,371,134,15
MH8919226,0,0,162,58,176,29,0
MH9179821,380,2128,361,351,1721,31,26


In [51]:
# Show the source proportions per sample; each row is W.counts divided by its row total.
W

Unnamed: 0,CD8,CD4,Mono.CD14,B,NK,Mono.CD16,Plasma
MH9179824,0.13006897,0.34496552,0.23862069,0.12634483,0.12468966,0.027448276,0.0078620690
newcastle65,0.25872300,0.49133634,0.06741040,0.02824591,0.14526466,0.008070259,0.0009494422
MH9143327,0.11556498,0.06833408,0.39191603,0.05102724,0.32347030,0.043099598,0.0065877624
MH9143326,0.10199488,0.05370844,0.55079284,0.00000000,0.16808184,0.125217391,0.0002046036
MH9143325,0.13439898,0.13273657,0.25025575,0.13286445,0.32097187,0.016751918,0.0120204604
MH9143320,0.05329670,0.05970696,0.20787546,0.11831502,0.52857143,0.008058608,0.0241758242
MH9143276,0.17005961,0.28646946,0.28995614,0.06096052,0.13215611,0.053537285,0.0068608705
MH9143274,0.17603834,0.47156550,0.24664537,0.02268371,0.05926518,0.021405751,0.0023961661
MH8919226,0.00000000,0.00000000,0.38117647,0.13647059,0.41411765,0.068235294,0.0000000000
MH9179821,0.07603041,0.42577031,0.07222889,0.07022809,0.34433774,0.006202481,0.0052020808


# Generate Z (verified it is consistent with old result)

In [None]:
# Average expression of the "raw" assay for every (source, sample) group.
# With slot = "data" the average is computed in the original "un-log" space.
avg_counts = AverageExpression(
    object   = stephenson,
    assays   = "raw",
    slot     = "data",
    group.by = c(source.col, sample.col)
)

In [None]:
# Z: source x feature x sample array of average expression, initialized to 0.
feature.ids = rownames(stephenson)
Z = array(0,c(length(source.ids), length(feature.ids), length(sample.ids)))
dimnames(Z)[[1]] = source.ids
dimnames(Z)[[2]] = feature.ids
dimnames(Z)[[3]] = sample.ids

# Columns of avg_counts$raw are looked up as "<source.id>_<sample.id>".
avg.cols = colnames(avg_counts$raw)

for (source.id in source.ids){
    for (sample.id in sample.ids){
        group.col = paste(source.id, sample.id, sep = "_")
        if (group.col %in% avg.cols){
            Z[source.id,,sample.id] = avg_counts$raw[, group.col]
        } else {
            # A missing column means this sample has no cells of this source.
            # It is tested explicitly (instead of catching every error) so that
            # unrelated failures are not silently turned into zeros.
            print(paste("no cells for", source.id, sample.id))
            Z[source.id,,sample.id] = 0
        }
    }
}

In [None]:
# Peek at the first source: first 10 features x first 10 samples.
Z[1, 1:10, 1:10]

In [None]:
# #manually do that 
# gen.pseudobulk = function(seu.meta, seu.counts, sample.group.by, source.group.by){
    
#     source.ids  = as.character(unique(seu.meta[[source.group.by]]))
#     sample.ids  = as.character(unique(seu.meta[[sample.group.by]]))
#     feature.ids = as.character(rownames(seu.counts))

#     k = length(source.ids)
#     m = length(feature.ids)
#     n = length(sample.ids)

#     W = matrix(0, n, k)
#     Z = array(0,c(k, m, n))
#     for (i in 1:n){
#         sample.id = sample.ids[i]        
#         for (h in 1:k){
#             source.id  = source.ids[h]
#             cells.use = rownames(seu.meta[(seu.meta[sample.group.by] == sample.id & seu.meta[source.group.by] == source.id),])
            
#             # if more than one cell, take the average 
#             if(length(cells.use) >= 2){
#                 Z[h,,i] = as.matrix(rowMeans(seu.counts[feature.ids,cells.use]))
#                 W[i,h]  = length(cells.use)
#             # if only one cell, use it 
#             }else if (length(cells.use) == 1) {
#                 Z[h,,i] = as.matrix(seu.counts[feature.ids,cells.use])
#                 W[i,h]  = 1
#             # else just put all 0s
#             }else{
#                 Z[h,,i] = 0
#                 W[i,h]  = 0
#             }
#         }# end of source
#     }# end of sample
    
#     # process W and reorder things
#     W = W/repmat(as.matrix(rowSums(W)), 1, dim(W)[2])
    
#     reorder.index = order(-colMeans(W))
#     source.ids = source.ids[reorder.index]
#     W = W[,reorder.index]
#     rownames(W) = sample.ids
#     colnames(W) = source.ids
    
#     Z = Z[reorder.index,,]
#     dimnames(Z)[[1]] = source.ids
#     dimnames(Z)[[2]] = feature.ids
#     dimnames(Z)[[3]] = sample.ids
    
#     # generate X
#     X = matrix(0, m, n)
#     rownames(X) = feature.ids
#     colnames(X) = sample.ids
#     for (h in 1:k){
#         X = X + Z[h,,] * repmat(t(W[,h]), m ,1)
#     }
    
#     res = list()
#     res$Z = Z
#     res$W = W
#     res$X = X
#     res$source.ids  = source.ids
#     res$feature.ids = feature.ids
#     res$sample.ids  = sample.ids
#     return(res)
# }

# stephenson  = NormalizeData(object = stephenson, 
#                             normalization.method = "RC",
#                             scale.factor = 1e6)
# pseudobulk.cpm = gen.pseudobulk(seu.meta    = stephenson@meta.data,
#                                 seu.counts  = stephenson@assays$raw@data,
#                                 sample.group.by = "sample_id",
#                                 source.group.by = "decon.L1")

# pseudobulk.cpm$Z["CD4", 1:10, 1:10]

# pseudobulk.cpm$X[1:10, 1:10]

# DefaultAssay(stephenson) <- "raw"
# stephenson <- NormalizeData(stephenson, normalization.method = "LogNormalize", scale.factor = 1e6)

In [None]:
# # Generate X

# X = matrix(0, length(feature.ids), length(sample.ids))
# rownames(X) = feature.ids
# colnames(X) = sample.ids
# for (h in 1:length(source.ids)){
#     X = X + Z[h,,] * repmat(t(W[,h]), length(feature.ids), 1)
# }

# X[1:10, 1:10] 

# Check Entropy

In [None]:
# Estimate parameters from Z with max_sds = 3; the result is read further down
# as params$mus, params$sigmas and params$corrs.
# NOTE(review): calc_params_from_Z and calc_entropy are not defined in this
# notebook (presumably they come from the sourced utils files) - confirm their
# exact semantics there.
params    = calc_params_from_Z(Z, max_sds = 3)
# Entropy is computed from the correlation component of params.
entropy   = calc_entropy(params$corrs)

In [None]:
# Display the computed entropy values.
entropy

In [None]:
# Histogram of the entropy values over all genes.
hist(entropy, main = "All genes entropy")

In [None]:
# Select 10000 variable features; they are read back with VariableFeatures(stephenson).
stephenson = FindVariableFeatures(stephenson, nfeatures = 10000)

In [None]:
# Histogram of the entropy values restricted to the variable features of stephenson.
hist(entropy[VariableFeatures(stephenson), ], main = "Top 10k HVF entropy")

# Save

In [None]:
# Bundle everything that is saved:
#   Z, W, params, entropy : objects built in the cells above
#   HVF                   : variable features currently set on stephenson
#   HEF                   : first 10000 entries returned by highly_expressed_features
pbmc.pseudobulk = list(
    Z       = Z,
    W       = W,
    params  = params,
    entropy = entropy,
    HVF     = VariableFeatures(stephenson),
    HEF     = highly_expressed_features(Z = Z, expression.qtl = 0)[1:10000]
)

# File name encodes the source column and whether the samples were filtered.
saveRDS(object = pbmc.pseudobulk,
        file   = file.path(res.dir,
                           paste0("pbmc.pseudobulk.",
                                  source.col,
                                  ".",
                                  if (partial.W) "partial.W." else "all.W.",
                                  "rds")))

In [None]:
# Print the structure of the saved list as a sanity check.
str(pbmc.pseudobulk)

# check low and high entropy example

In [None]:
# Entropy cut-offs: the 10% and the 90% quantile of the entropy values.
low.entropy.thr  = quantile(x = entropy, probs = 0.1)
high.entropy.thr = quantile(x = entropy, probs = 0.9)

In [None]:
# Display the 10% entropy quantile used as the low-entropy cut-off.
low.entropy.thr

In [None]:
# Candidate positions at or below the low cut-off / at or above the high cut-off.
low.candidates  = which(pbmc.pseudobulk$entropy <= low.entropy.thr)
high.candidates = which(pbmc.pseudobulk$entropy >= high.entropy.thr)

# Draw one candidate from each set. Indexing through sample.int avoids the
# sample() special case in which a single numeric candidate n would be treated
# as "sample from 1:n"; with more than one candidate the draw is the same one
# sample(candidates, 1) would make.
low_entropy_gene  = rownames(pbmc.pseudobulk$entropy)[low.candidates[sample.int(length(low.candidates), 1)]]
high_entropy_gene = rownames(pbmc.pseudobulk$entropy)[high.candidates[sample.int(length(high.candidates), 1)]]

In [None]:
# params$mus entries of the sampled low-entropy gene.
pbmc.pseudobulk$params$mus[low_entropy_gene,]

In [None]:
# params$sigmas slice of the sampled low-entropy gene.
pbmc.pseudobulk$params$sigmas[low_entropy_gene,,]

In [None]:
# params$corrs slice of the sampled low-entropy gene.
pbmc.pseudobulk$params$corrs[low_entropy_gene,,]

In [None]:
# params$mus entries of the sampled high-entropy gene.
pbmc.pseudobulk$params$mus[high_entropy_gene,]

In [None]:
# params$sigmas slice of the sampled high-entropy gene.
pbmc.pseudobulk$params$sigmas[high_entropy_gene,,]

In [None]:
# params$corrs slice of the sampled high-entropy gene.
pbmc.pseudobulk$params$corrs[high_entropy_gene,,]

In [None]:
# Scatter-plot, for each gene, the two sources with the highest correlation.
#
# Args:
#   pseudobulk: list providing $params$corrs (indexed [gene, source, source])
#               and $Z (indexed [source, gene, sample]).
#   gene.names: names of the genes to plot.
# Returns:
#   The object returned by egg::ggarrange (ncol = 2). For every gene it holds
#   two panels, in this order: the scatter on the log10(1 + x) scale and the
#   scatter on the untransformed scale.
# Stops with an error when a gene has no non-missing off-diagonal correlation.
plot_example_genes = function(pseudobulk, gene.names){
    plots = list()

    for (gene in gene.names){
        cor_mat = pseudobulk$params$corrs[gene,,]

        # Most correlated pair of distinct sources. The search is limited to the
        # strict lower triangle so that a diagonal entry cannot be picked when
        # the best off-diagonal correlation equals a diagonal value; missing
        # correlations are ignored.
        off.diag  = lower.tri(cor_mat, diag = FALSE)
        pair.cors = cor_mat[off.diag]
        if (all(is.na(pair.cors))){
            stop(paste("no non-missing off-diagonal correlation for gene", gene))
        }
        idx = which(off.diag & cor_mat == max(pair.cors, na.rm = TRUE), arr.ind = TRUE)[1,]
        title = paste(gene, "source", idx[1], "vs", idx[2])

        # One row per sample, one column per selected source.
        plot_df = data.frame(t(pseudobulk$Z[idx,gene,]))
        colnames(plot_df) = c("source.1" , "source.2")

        log.panel = ggplot(plot_df, aes(x = log10(1 + source.1), y = log10(1 + source.2))) +
                    geom_point(alpha = 0.5, size = 1) +
                    ggtitle(title) + theme(plot.title = element_text(hjust = 0.5)) + theme(aspect.ratio = 1)

        raw.panel = ggplot(plot_df, aes(x = source.1, y = source.2)) +
                    geom_point(alpha = 0.5, size = 1) +
                    ggtitle(title) + theme(plot.title = element_text(hjust = 0.5)) + theme(aspect.ratio = 1)

        # Append both panels; wrapping them in list() keeps each ggplot object intact.
        plots = c(plots, list(log.panel, raw.panel))
    }

    g <- egg::ggarrange(plots = plots, ncol = 2)
    return(g)
}

In [None]:
# Plot the sampled low-entropy gene followed by the sampled high-entropy gene.
g = plot_example_genes(pbmc.pseudobulk, c(low_entropy_gene, high_entropy_gene))