In [1]:
library(edgeR)
library(sva)
library(fgsea)
library(tidyr)
source('./ABCA7lof2//degs.r')

Loading required package: limma

Loading required package: mgcv

Loading required package: nlme

This is mgcv 1.8-40. For overview type 'help("mgcv-package")'.

Loading required package: genefilter

Loading required package: BiocParallel



In [2]:
# load the precomputed inputs for the pseudobulk analysis; later cells read its
# `summed_counts_by_ind`, `summary` and `expressed10` entries
all_data = readRDS('./processed_data/single_cell/stats_input_data.rds')

In [21]:
# per-individual summed counts
# TODO: does it make sense to sum the counts rather than take their mean?
#       compare to nebula; presumably ok because the counts are normalized later
summed_counts = all_data$summed_counts_by_ind
expressed = all_data$expressed10
meta = all_data$summary
# recode the sequencing batch as a 0/1 indicator: 1 for 'JBM', 0 for anything else
is_jbm = meta$seq_batch=='JBM'
meta$seq_batch = ifelse(is_jbm,1,0)

In [22]:
# filter out individuals with too few cells per celltype: a column is kept only
# when its cell count is above `min_cells`
# (assumes `ncells` is aligned with the columns of `summed_counts$summed_counts`
# — TODO confirm)
min_cells = 20
keep = summed_counts$ncells > min_cells
# drop = FALSE keeps the two-dimensional shape even if a single column passes
# the filter; a later cell calls colnames() on this object
summed_counts_indexed = summed_counts$summed_counts[, keep, drop = FALSE]

In [23]:
# compute degs (all samples): full covariate set, including batch and APOE4
vars = c('LOF', 'amyloid', 'nft',  'msex', 'age_death', 'pmi', 'seq_batch', 'APOE4')
limma_inputs = get_limma_inputs(summed_counts_indexed, expressed, meta, vars)
metadata = limma_inputs$metadata
aggs = limma_inputs$aggs

# run RunDiffExprAnalysisLimma once per entry of `aggs`, skipping 'Vascular';
# the result list is keyed by the same names
celltypes = names(aggs)[!names(aggs)%in%c('Vascular')]
degs_all = setNames(
    lapply(celltypes, function(ct) {
        RunDiffExprAnalysisLimma(aggs[[ct]], metadata[[ct]], n.sv=NULL)
    }),
    celltypes
)

Number of significant surrogate variables is:  9 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  8 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  8 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  8 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  9 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  7 
Iteration (out of 5 ):1  2  3  4  5  

In [24]:
# get the ids: each column name is split on '.' and its second piece is used as
# the id (presumably names look like '<celltype>.<individual>' — TODO confirm)
id_parts = strsplit(colnames(summed_counts_indexed), '[.]')
# vapply over the pieces (rather than indexing with 1:length(...)) also works
# when there are no columns and always yields a character vector; a name
# without a second piece gives NA
ids = vapply(id_parts, function(parts) parts[2], character(1), USE.NAMES = FALSE)

In [25]:
# compute degs (no APOE4 samples): restrict to the rows of `meta` with
# APOE4 == 0 and leave APOE4 out of the covariates
vars = c('LOF', 'amyloid', 'nft',  'msex', 'age_death', 'pmi', 'seq_batch')
sele = rownames(meta)[meta$APOE4==0]
sc_noAPOE4 = summed_counts_indexed[,ids%in%sele]
limma_inputs = get_limma_inputs(sc_noAPOE4, expressed, meta[sele,], vars)
metadata = limma_inputs$metadata
aggs = limma_inputs$aggs

# one analysis per entry of `aggs`, skipping 'Vascular'
celltypes = names(aggs)[!names(aggs)%in%c('Vascular')]
degs_noAPOE4 = setNames(
    lapply(celltypes, function(ct) {
        RunDiffExprAnalysisLimma(aggs[[ct]], metadata[[ct]], n.sv=NULL, exclude_apoe=TRUE)
    }),
    celltypes
)


Number of significant surrogate variables is:  6 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  5 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  6 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  5 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  

In [26]:
# compute degs (single sequencing batch): restrict to the rows of `meta` with
# seq_batch == 1 (the recoded 'JBM' batch) and leave seq_batch out of the
# covariates; APOE4 stays in the model
vars = c('LOF', 'amyloid', 'nft',  'msex', 'age_death', 'pmi', 'APOE4')
sele = rownames(meta)[meta$seq_batch==1]
sc_JBM = summed_counts_indexed[,ids%in%sele]
limma_inputs = get_limma_inputs(sc_JBM, expressed, meta[sele,], vars)
metadata = limma_inputs$metadata
aggs = limma_inputs$aggs

# one analysis per entry of `aggs`, skipping 'Vascular'
celltypes = names(aggs)[!names(aggs)%in%c('Vascular')]
degs_JBM = setNames(
    lapply(celltypes, function(ct) {
        RunDiffExprAnalysisLimma(aggs[[ct]], metadata[[ct]], n.sv=NULL, exclude_batch=TRUE)
    }),
    celltypes
)

Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  

In [27]:
# exclude both: restrict to the rows of `meta` with seq_batch == 1 and
# APOE4 == 0, and leave both variables out of the covariates
vars = c('LOF', 'amyloid', 'nft',  'msex', 'age_death', 'pmi')
sele = rownames(meta)[(meta$seq_batch==1) & (meta$APOE4==0)]
sc_exclude_both = summed_counts_indexed[,ids%in%sele]
limma_inputs = get_limma_inputs(sc_exclude_both, expressed, meta[sele,], vars)
metadata = limma_inputs$metadata
aggs = limma_inputs$aggs

# one analysis per entry of `aggs`, skipping 'Vascular'
celltypes = names(aggs)[!names(aggs)%in%c('Vascular')]
degs_exclude_both = setNames(
    lapply(celltypes, function(ct) {
        RunDiffExprAnalysisLimma(aggs[[ct]], metadata[[ct]], n.sv=NULL, exclude_both=TRUE)
    }),
    celltypes
)

Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  

In [28]:
# save the results: one entry per analysis, keyed by the analysis name
degs = list(
    degs_all = degs_all,
    degs_exclude_both = degs_exclude_both,
    degs_JBM = degs_JBM,
    degs_noAPOE4 = degs_noAPOE4
)

saveRDS(degs, './processed_data/single_cell/pseudobulk_degs.rds')

In [85]:
# save deg scores as a gene x celltype matrix
# The all-samples results are passed explicitly: when the notebook is run top
# to bottom, `degs` at this point is the list of all four analyses, whereas
# get_deg_scores() is given a single analysis (a list keyed by cell type) where
# it is called after reloading the saved results.
scores = get_deg_scores(degs_all)
# one long (score, gene, celltype) table per cell type
# (assumes each `$scores` vector is named by gene — TODO confirm)
out = lapply(names(scores), function(celltype) {
    df = as.data.frame(scores[[celltype]]$scores)
    colnames(df) = c('score')
    df$gene = rownames(df)
    df$celltype = celltype
    df
})
names(out) = names(scores)
# stack the long tables, then spread to one row per gene / one column per cell type
all_scores = as.data.frame(do.call('rbind', out)%>%pivot_wider(., values_from='score', names_from='celltype'))
rownames(all_scores) = all_scores$gene
all_scores$gene = NULL
# a gene with no score in a given cell type is written as 0
all_scores[is.na(all_scores)] = 0
write.csv(all_scores, './processed_data/single_cell/all_scores.csv')

In [126]:
# load degs: reload the saved results and keep only the all-samples analysis
saved_degs = readRDS('./processed_data/single_cell/pseudobulk_degs.rds')
degs = saved_degs[['degs_all']]
#degs = readRDS('../ABCA7lof/processed_data/differentially_expressed_genes_data/abca7lof_pseudobulk_degs_nov10.rds')$degs_all
# gene scores per entry of `degs`; used as the fgsea ranking statistic further down
scores = get_deg_scores(degs)

In [135]:
# read the gene-set table; row names are taken from the 'X' column
# (presumably one pathway per row, with its genes spread across the columns and
# shorter pathways padded with '' — TODO confirm)
all_paths = read.csv('./processed_data/genesets/all_paths.csv', row.names = 'X')
# transpose so that each row of the table becomes one list element
pathways = as.list(as.data.frame(t(all_paths)))
# strip the empty padding; NA is removed too, because `genes == ''` is NA for
# an NA entry and plain logical subsetting would keep it as an NA "gene"
temp = lapply(pathways, function(genes) genes[!is.na(genes) & genes != ''])

In [136]:
# number of gene sets loaded
length(temp)

In [30]:
# get_gset_names_by_category = function(cat, gsets){
#   gset = unlist(lapply(gsets, function(x) unlist(sum(sapply(cat, grepl, x))>0)))
#   gset = (gsets[gset])
#   return(gset)
# }

In [39]:
# o = get_gset_names_by_category(c('sterol', 'lipid', 'glycer', 'fat', 'ceramide', 'phosphatidyl'), names(temp))

In [40]:
# temp2 = temp[o]

In [89]:
# pathways = readRDS('../ABCA7lof/processed_data/pathway_databases/pathways.rds')
# temp2 = pathways$kegg

In [137]:
# run fgsea once per entry of `scores`, ranking genes by that entry's scores;
# the entry name is prepended to each result table as column `x`
# (the argument must stay named `x`: cbind takes the column name from it)
out = lapply(names(scores), function(x) {
    enrichment = fgsea(pathways = temp, stats = scores[[x]]$scores, minSize = 5, maxSize = 1000)
    cbind(x, enrichment)
})
names(out) = names(scores)

In [138]:
# stack the per-cell-type fgsea tables into a single table
res = do.call('rbind', out)
# signed significance: -log10(p-value) carrying the sign of the NES
res$score = sign(res$NES) * -log10(res$pval)
# keep the rows labelled 'Ex'
is_ex = res$x=='Ex'
res = res[is_ex,]
# most significant first
res = res[order(res$pval),]
# keep nominally significant pathways only
is_nominal = res$pval<0.05
res = res[is_nominal,]


In [139]:
# display the fgsea rows kept above ('Ex', p < 0.05, sorted by p-value)
res

x,pathway,pval,padj,log2err,ES,NES,size,leadingEdge,score
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<list>,<dbl>
Ex,Electron Transport Chain (OXPHOS system in mitochondria) WP111,3.162222e-13,1.264889e-10,0.9325952,0.6721279,2.48313,86,"COX7A2, ....",12.500008
Ex,Oxidative phosphorylation WP623,1.92621e-07,3.833805e-05,0.6901325,0.6648928,2.244224,50,"NDUFS8, ....",6.715296
Ex,Proteasome Degradation WP183,2.875353e-07,3.833805e-05,0.6749629,0.6655312,2.221523,48,"PSMA7, P....",6.541309
Ex,Nonalcoholic fatty liver disease WP4396,4.458981e-06,0.0004458981,0.6105269,0.49506,1.915915,117,"COX7A2, ....",5.350764
Ex,Retinoblastoma Gene in Cancer WP2446,6.471513e-05,0.005177211,0.5384341,0.5845668,1.973098,50,"SMC1A, C....",4.188994
Ex,Mitochondrial complex I assembly model OXPHOS system WP4324,0.0001127272,0.007515144,0.5384341,0.5718491,1.9224,49,"NDUFV2, ....",3.947971
Ex,Peptide GPCRs WP24,0.0005351789,0.03058165,0.4772708,-0.8342118,-1.93302,9,"NPY1R, O....",-3.271501
Ex,Parkin-Ubiquitin Proteasomal System pathway WP2359,0.004224437,0.2112219,0.4070179,0.477928,1.625123,53,"PSMD14, ....",2.374231
Ex,Cell Cycle WP179,0.006997786,0.2792318,0.4070179,0.4440157,1.581564,68,"SMC1A, C....",2.155039
Ex,TCA Cycle (aka Krebs or citric acid cycle) WP78,0.00707993,0.2792318,0.4070179,0.6318732,1.663647,17,"SUCLA2, ....",2.149971


In [140]:
# save the fgsea results currently held in `res`
# (when run top to bottom these are the 'Ex' pathways with p < 0.05, not all results)
fgsea_table = as.data.frame(res)
# leadingEdge is a list column; it is dropped before writing, presumably because
# write.csv cannot serialise list columns
df = fgsea_table[, colnames(fgsea_table) != 'leadingEdge']
write.csv(df, './processed_data/for_plotting/fgsea_out.csv')

In [141]:
# save the unique leading edge genes across every pathway currently in `res`
# (when run top to bottom: the 'Ex' pathways with p < 0.05)
leading_edge_genes = unique(unname(unlist(res$leadingEdge)))
df = as.data.frame(leading_edge_genes)
colnames(df) = 'gene'
write.csv(df, './processed_data/for_plotting/leading_edge.csv')

In [142]:
# save Ex leading edge genes: take the 'Ex' fgsea table straight from `out` and
# keep the pathways with p < 0.05
# NOTE(review): when run top to bottom, `res` already held these same pathways
# (sorted by p-value), so this file presumably contains the same genes as
# leading_edge.csv — confirm that is intended
ex_res = out[['Ex']]
res = ex_res[ex_res$pval<0.05,]

leading_edge_genes = unique(unname(unlist(res$leadingEdge)))
df = as.data.frame(leading_edge_genes)
colnames(df) = 'gene'
write.csv(df, './processed_data/for_plotting/leading_edge_EX.csv')