In [19]:
library(edgeR)
library(sva)
source('./ABCA7lof2//degs.r')

In [20]:
# read the serialized statistics-input object from disk
all_data = readRDS(file = './processed_data/single_cell/stats_input_data.rds')

In [21]:
# Unpack the fields of the loaded input object that the cells below use.
# TODO (open question from the original author): does it make sense to sum the
# counts rather than take the mean of the counts? --> compare to nebula;
# possibly fine because the data are normalized afterwards -- confirm.
expressed = all_data$expressed10
summed_counts = all_data$summed_counts_by_ind
meta = all_data$summary
# recode the sequencing batch as a 0/1 indicator: 1 where it equals 'JBM', 0 otherwise
meta$seq_batch = ifelse(meta$seq_batch == 'JBM', 1, 0)

In [22]:
# filter out individuals with too few cells per celltype:
# only columns whose `ncells` entry is strictly greater than 20 are retained
# (assumes `ncells` is ordered like the columns of the summed counts -- TODO confirm)
keep = summed_counts$ncells > 20
summed_counts_indexed = summed_counts$summed_counts[, keep]

In [23]:
# compute degs (all samples)
# variable names handed to get_limma_inputs; this run includes both seq_batch and APOE4
vars = c('LOF', 'amyloid', 'nft', 'msex', 'age_death', 'pmi', 'seq_batch', 'APOE4')
limma_inputs = get_limma_inputs(summed_counts_indexed, expressed, meta, vars)
aggs = limma_inputs$aggs
metadata = limma_inputs$metadata

# call RunDiffExprAnalysisLimma once per entry of `aggs`, skipping 'Vascular';
# with simplify = FALSE the result stays a list, and USE.NAMES labels every
# element with the entry name it was computed from
degs_all = sapply(
    names(aggs)[!names(aggs) %in% c('Vascular')],
    function(celltype) RunDiffExprAnalysisLimma(aggs[[celltype]], metadata[[celltype]], n.sv=NULL),
    simplify = FALSE, USE.NAMES = TRUE
)

Number of significant surrogate variables is:  9 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  8 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  8 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  8 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  9 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  7 
Iteration (out of 5 ):1  2  3  4  5  

In [24]:
# get the ids: each column name is split on '.' and its second field is kept
# (presumably the names look like '<celltype>.<individual id>' -- TODO confirm)
# vapply walks the split pieces directly instead of indexing with 1:length(),
# which would iterate over c(1, 0) and fail when there are no columns; it also
# guarantees that the result is a character vector
ids = vapply(
    strsplit(colnames(summed_counts_indexed), '[.]'),
    function(parts) parts[2],
    character(1)
)

In [25]:
# compute degs (no APOE4 samples): only individuals with APOE4 == 0 are used,
# so APOE4 is left out of the variable list for this run
vars = c('LOF', 'amyloid', 'nft', 'msex', 'age_death', 'pmi', 'seq_batch')
sele = rownames(meta)[meta$APOE4 == 0]
# keep the count columns whose id belongs to a selected individual
sc_noAPOE4 = summed_counts_indexed[, ids %in% sele]
limma_inputs = get_limma_inputs(sc_noAPOE4, expressed, meta[sele, ], vars)
aggs = limma_inputs$aggs
metadata = limma_inputs$metadata
# one RunDiffExprAnalysisLimma call per entry of `aggs` except 'Vascular';
# the returned list is named after those entries
degs_noAPOE4 = sapply(
    names(aggs)[!names(aggs) %in% c('Vascular')],
    function(celltype) RunDiffExprAnalysisLimma(aggs[[celltype]], metadata[[celltype]], n.sv=NULL, exclude_apoe=TRUE),
    simplify = FALSE, USE.NAMES = TRUE
)


Number of significant surrogate variables is:  6 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  5 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  6 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  5 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  

In [26]:
# compute degs (JBM sequencing batch only): only individuals with seq_batch == 1
# are used (1 is the recoded value for 'JBM'), so seq_batch is left out of the
# variable list for this run
vars = c('LOF', 'amyloid', 'nft', 'msex', 'age_death', 'pmi', 'APOE4')
sele = rownames(meta)[meta$seq_batch == 1]
# keep the count columns whose id belongs to a selected individual
sc_JBM = summed_counts_indexed[, ids %in% sele]
limma_inputs = get_limma_inputs(sc_JBM, expressed, meta[sele, ], vars)
aggs = limma_inputs$aggs
metadata = limma_inputs$metadata
# one RunDiffExprAnalysisLimma call per entry of `aggs` except 'Vascular';
# the returned list is named after those entries
degs_JBM = sapply(
    names(aggs)[!names(aggs) %in% c('Vascular')],
    function(celltype) RunDiffExprAnalysisLimma(aggs[[celltype]], metadata[[celltype]], n.sv=NULL, exclude_batch=TRUE),
    simplify = FALSE, USE.NAMES = TRUE
)

Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  

In [27]:
# exclude both: only individuals with seq_batch == 1 and APOE4 == 0 are used,
# so neither seq_batch nor APOE4 appears in the variable list
vars = c('LOF', 'amyloid', 'nft', 'msex', 'age_death', 'pmi')
sele = rownames(meta)[(meta$seq_batch == 1) & (meta$APOE4 == 0)]
# keep the count columns whose id belongs to a selected individual
sc_exclude_both = summed_counts_indexed[, ids %in% sele]
limma_inputs = get_limma_inputs(sc_exclude_both, expressed, meta[sele, ], vars)
aggs = limma_inputs$aggs
metadata = limma_inputs$metadata
# one RunDiffExprAnalysisLimma call per entry of `aggs` except 'Vascular';
# the returned list is named after those entries
degs_exclude_both = sapply(
    names(aggs)[!names(aggs) %in% c('Vascular')],
    function(celltype) RunDiffExprAnalysisLimma(aggs[[celltype]], metadata[[celltype]], n.sv=NULL, exclude_both=TRUE),
    simplify = FALSE, USE.NAMES = TRUE
)

Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  

In [28]:
# save the results: bundle the four DEG result sets into one named list
# and write it to disk as a single RDS file
degs = list(
    degs_all = degs_all,
    degs_exclude_both = degs_exclude_both,
    degs_JBM = degs_JBM,
    degs_noAPOE4 = degs_noAPOE4
)

saveRDS(degs, './processed_data/single_cell/pseudobulk_degs.rds')