In [38]:
library(edgeR)
library(sva)
library(fgsea)
library(tidyr)
source('./ABCA7lof2//degs.r')

In [39]:
all_data = readRDS('./processed_data/single_cell/stats_input_data_0825.rds')

In [3]:
summed_counts = all_data$summed_counts_by_ind # does it make sense to be summing on the counts vs taking the mean of the counts? --> compare to nebula? --> ok because we are normalizing?
meta = all_data$summary
meta$seq_batch = ifelse(meta$seq_batch=='JBM',1,0)
expressed = all_data$expressed10

In [4]:
# filter out individuals with too few cells per celltype
keep = summed_counts$ncells>=10
summed_counts_indexed = summed_counts$summed_counts[,keep]

In [14]:
# compute degs (all samples) without gene cutoff
exp = lapply(names(expressed), function(x) rownames(summed_counts$summed_counts))
names(exp) = names(expressed)
             
vars = c('LOF', 'amyloid', 'nft',  'msex', 'age_death', 'pmi', 'seq_batch', 'APOE4')
limma_inputs = get_limma_inputs(summed_counts_indexed, exp, meta, vars)
aggs = limma_inputs$aggs
metadata = limma_inputs$metadata


degs_all = lapply(names(aggs)[!names(aggs)%in%c('Vascular')], function(x) RunDiffExprAnalysisLimma(aggs[[x]], metadata[[x]], n.sv=NULL))
names(degs_all) = names(aggs)[!names(aggs)%in%c('Vascular')]

Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  

In [30]:
saveRDS(degs_all,'./processed_data/for_plotting/degs_no_exp.rds')

In [13]:
# compute degs (all samples)
vars = c('LOF', 'amyloid', 'nft',  'msex', 'age_death', 'pmi', 'seq_batch', 'APOE4')
limma_inputs = get_limma_inputs(summed_counts_indexed, expressed, meta, vars)
aggs = limma_inputs$aggs
metadata = limma_inputs$metadata


degs_all = lapply(names(aggs)[!names(aggs)%in%c('Vascular')], function(x) RunDiffExprAnalysisLimma(aggs[[x]], metadata[[x]], n.sv=NULL))
names(degs_all) = names(aggs)[!names(aggs)%in%c('Vascular')]

Number of significant surrogate variables is:  10 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  9 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  5 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  7 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  9 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  7 
Iteration (out of 5 ):1  2  3  4  5  

In [14]:
# get the ids
ids = strsplit(colnames(summed_counts_indexed), '[.]')
ids = unlist(lapply(1:length(ids), function(x) ids[[x]][2]))

In [15]:
# compute degs (no APOE4 samples)
vars = c('LOF', 'amyloid', 'nft',  'msex', 'age_death', 'pmi', 'seq_batch')
sele = rownames(meta)[meta$APOE4==0]
sc_noAPOE4 = summed_counts_indexed[,ids%in%sele]
limma_inputs = get_limma_inputs(sc_noAPOE4, expressed, meta[sele,], vars)
aggs = limma_inputs$aggs
metadata = limma_inputs$metadata
degs_noAPOE4 = lapply(names(aggs)[!names(aggs)%in%c('Vascular')], function(x) RunDiffExprAnalysisLimma(aggs[[x]], metadata[[x]], n.sv=NULL, exclude_apoe=TRUE))
names(degs_noAPOE4) = names(aggs)[!names(aggs)%in%c('Vascular')]


Number of significant surrogate variables is:  6 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  6 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  5 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  5 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  5 
Iteration (out of 5 ):1  2  3  4  5  

In [16]:
# compute degs (no APOE4 samples)
vars = c('LOF', 'amyloid', 'nft',  'msex', 'age_death', 'pmi', 'APOE4')
sele = rownames(meta)[meta$seq_batch==1]
sc_JBM = summed_counts_indexed[,ids%in%sele]
limma_inputs = get_limma_inputs(sc_JBM, expressed, meta[sele,], vars)
aggs = limma_inputs$aggs
metadata = limma_inputs$metadata
degs_JBM = lapply(names(aggs)[!names(aggs)%in%c('Vascular')], function(x) RunDiffExprAnalysisLimma(aggs[[x]], metadata[[x]], n.sv=NULL, exclude_batch=TRUE))
names(degs_JBM) = names(aggs)[!names(aggs)%in%c('Vascular')]

Number of significant surrogate variables is:  5 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  5 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  

In [17]:
# exclude both
vars = c('LOF', 'amyloid', 'nft',  'msex', 'age_death', 'pmi')
sele = rownames(meta)[(meta$seq_batch==1) & (meta$APOE4==0)]
sc_exclude_both = summed_counts_indexed[,ids%in%sele]
limma_inputs = get_limma_inputs(sc_exclude_both, expressed, meta[sele,], vars)
aggs = limma_inputs$aggs
metadata = limma_inputs$metadata
degs_exclude_both = lapply(names(aggs)[!names(aggs)%in%c('Vascular')], function(x) RunDiffExprAnalysisLimma(aggs[[x]], metadata[[x]], n.sv=NULL, exclude_both=TRUE))
names(degs_exclude_both) = names(aggs)[!names(aggs)%in%c('Vascular')]

Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  

In [18]:
# save the results
degs = list()
degs[['degs_all']] = degs_all
degs[['degs_exclude_both']] = degs_exclude_both
degs[['degs_JBM']] = degs_JBM
degs[['degs_noAPOE4']] = degs_noAPOE4

saveRDS(degs, './processed_data/single_cell/pseudobulk_degs_0825.rds')

In [25]:
# save deg scores as matrix
out = list()
scores = get_deg_scores(degs$degs_all)
for(i in names(scores)){
    df = as.data.frame(scores[[i]]$scores)
    colnames(df) = c('score')
    df$gene = rownames(df)
    df$celltype = i
    out[[i]] = df
}
all_scores = as.data.frame(do.call('rbind', out)%>%pivot_wider(., values_from='score', names_from='celltype'))
rownames(all_scores) = all_scores$gene
all_scores$gene = NULL
all_scores[is.na(all_scores)] = 0
write.csv(all_scores, './processed_data/single_cell/all_scores_0825.csv')

In [40]:
# load degs
degs = readRDS('./processed_data/single_cell/pseudobulk_degs_0825.rds')$degs_all
scores = get_deg_scores(degs)

In [32]:
# compute fgsea results
all_paths = read.csv('./processed_data/genesets/all_paths.csv', row.names = 'X')
pathways = as.list(as.data.frame(t(all_paths)))
temp = lapply(names(pathways), function(x) pathways[[x]][!(pathways[[x]]=='')])
names(temp) = names(pathways)
              
out = lapply(names(scores), function(x) cbind(x, fgsea(pathways = temp, stats = scores[[x]]$scores, minSize = 5, maxSize = 1000)))
names(out) = names(scores)

In [33]:
# save the fgsea results
saveRDS(out, './processed_data/for_plotting/fgsea_out_0825.rds')

# save all LE results
df = as.data.frame(do.call('rbind', out))
df = df[,!colnames(df)=='leadingEdge']
write.csv(df, './processed_data/for_plotting/fgsea_out_0825.csv')

# save all fgsea results as csv
df = as.data.frame(do.call('rbind', out))
df = df[,!colnames(df)=='leadingEdge']
write.csv(df, './processed_data/for_plotting/fgsea_0825.csv')

# save LE genes per celltype
for(i in names(out)){
    res = out[[i]]
    res = res[res$pval<0.05,]

    df = as.data.frame(unique(unname(unlist(res$leadingEdge))))
    colnames(df) = 'gene'
    write.csv(df, paste0(paste0('./processed_data/for_plotting/leading_edge_0825', i), '.csv'))
}

In [34]:
# save all LEs
out = readRDS('./processed_data/for_plotting/fgsea_out_0825.rds')
res = do.call('rbind', out)
res$score = sign(res$NES) * -log10(res$pval)
res = res[order(res$pval,decreasing=FALSE),]
res = res[res$pval<0.05,]
# save all leading edge genes
df = as.data.frame(unique(unname(unlist(res$leadingEdge))))
colnames(df) = 'gene'
write.csv(df, './processed_data/for_plotting/leading_edge_0825.csv')

In [41]:
# enrichment for lipid genesets
get_gset_names_by_category = function(cat, gsets){
  gset = unlist(lapply(gsets, function(x) unlist(sum(sapply(cat, grepl, x))>0)))
  gset = (gsets[gset])
  return(gset)
}

pathways = readRDS('../ABCA7lof/processed_data/pathway_databases/pathways.rds')$all_paths

                       
o = get_gset_names_by_category(c('sterol', 'lipid', 'glycer', 'fatt', 'ceramide', 'phosphatidyl'), names(pathways))
temp2 = pathways[o]
out = lapply(names(scores), function(x) cbind(x, fgsea(pathways = temp2, stats = scores[[x]]$scores, minSize = 5, maxSize = 1000)))
names(out) = names(scores)
                       

In [42]:
# save lipid paths
res = out$Ex
res = res[order(res$pval,decreasing=FALSE),]
res = res[res$pval<0.05,]
saveRDS(res, './processed_data/for_plotting/fgsea_out_ex_lipid.rds')

In [14]:
pathways = readRDS('../ABCA7lof/processed_data/pathway_databases/pathways.rds')$all_paths
o = get_gset_names_by_category(c('kappa'), names(pathways))
temp2 = pathways[o]

In [16]:
# load degs
degs = readRDS('./processed_data/single_cell/pseudobulk_degs_0825.rds')$degs_all
scores = get_deg_scores(degs)
out = lapply(names(scores), function(x) cbind(x, fgsea(pathways = temp2, stats = scores[[x]]$scores, minSize = 5, maxSize = 1000)))
names(out) = names(scores)

In [18]:
p = readRDS('../ABCA7lof/processed_data/pathway_databases/pathways.rds')

In [21]:
p$kegg['NF-kappa B signaling pathway']

In [22]:
out$Ex['leadingEdge']

ERROR: Error in `[.data.table`(out$Ex, "leadingEdge"): When i is a data.table (or character vector), the columns to join by must be specified using 'on=' argument (see ?data.table), by keying x (i.e. sorted, and, marked as sorted, see ?setkey), or by sharing column names between x and i (i.e., a natural join). Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.


# below is scrap

In [47]:
pathways = readRDS('../ABCA7lof/processed_data/pathway_databases/pathways.rds')$all_paths
o = get_gset_names_by_category(c('sterol', 'lipid', 'glycer', 'fatt', 'ceramide', 'phosphatidyl'), names(pathways))
temp2 = pathways[o]
out = lapply(names(scores), function(x) cbind(x, fgsea(pathways = temp2, stats = scores[[x]]$scores, minSize = 5, maxSize = 1000)))
names(out) = names(scores)

In [55]:
res = out$Ex
res = res[order(res$pval,decreasing=FALSE),]
res = res[res$pval<0.05,]
saveRDS(res, './processed_data/for_plotting/fgsea_out_ex_lipid.rds')

In [89]:
# pathways = readRDS('../ABCA7lof/processed_data/pathway_databases/pathways.rds')
# temp2 = pathways$kegg

In [4]:
out = lapply(names(scores), function(x) cbind(x, fgsea(pathways = temp, stats = scores[[x]]$scores, minSize = 5, maxSize = 1000)))
names(out) = names(scores)

In [5]:
saveRDS(out, './processed_data/for_plotting/fgsea_out.rds')

In [6]:
# res = do.call('rbind', out)
# res$score = sign(res$NES) * -log10(res$pval)
# res = res[res$x=='Ex',]
# res = res[order(res$pval,decreasing=FALSE),]
# res = res[res$pval<0.05,]


In [5]:
# save all fgsea results
df = as.data.frame(do.call('rbind', out))
df = df[,!colnames(df)=='leadingEdge']
write.csv(df, './processed_data/for_plotting/fgsea_out.csv')

In [61]:
d= read.csv('./processed_data/for_plotting/fgsea_out.csv')

In [64]:
d[d$x=='Ex' & d$pval<0.05,]

Unnamed: 0_level_0,X,x,pathway,pval,padj,log2err,ES,NES,size
Unnamed: 0_level_1,<int>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>
1070,1070,Ex,ATM Signaling Network in Development and Disease WP3878,0.02642628,0.5768063,0.3524879,0.4661167,1.486559,38
1071,1071,Ex,ATM Signaling Pathway WP2516,0.04708067,0.5768063,0.3217759,0.5113852,1.489119,25
1107,1107,Ex,Cell Cycle WP179,0.007740895,0.2814871,0.4070179,0.4440157,1.575,68
1125,1125,Ex,Cytosine methylation WP3585,0.01421194,0.4162774,0.3807304,-0.7201311,-1.667474,9
1127,1127,Ex,DNA Damage Response (only ATM dependent) WP710,0.04118616,0.5768063,0.2878571,0.381356,1.374809,72
1128,1128,Ex,DNA Damage Response WP707,0.02540257,0.5768063,0.3524879,0.4703205,1.499966,38
1130,1130,Ex,DNA IR-damage and cellular response via ATR WP4016,0.01460523,0.4162774,0.3807304,0.4356262,1.49688,58
1141,1141,Ex,Dual hijack model of Vif in HIV infection WP3300,0.007636293,0.2814871,0.4070179,0.7909029,1.678167,7
1147,1147,Ex,Ebola Virus Pathway on Host WP4217,0.03839155,0.5768063,0.3217759,0.3879516,1.398586,72
1150,1150,Ex,Electron Transport Chain (OXPHOS system in mitochondria) WP111,4.482974e-13,1.793189e-10,0.9325952,0.6721279,2.49463,86


In [7]:
res = do.call('rbind', out)
res$score = sign(res$NES) * -log10(res$pval)
res = res[order(res$pval,decreasing=FALSE),]
res = res[res$pval<0.05,]
# save all leading edge genes
df = as.data.frame(unique(unname(unlist(res$leadingEdge))))
colnames(df) = 'gene'
write.csv(df, './processed_data/for_plotting/leading_edge.csv')

In [10]:
# save Ex leading edge genes
for(i in names(out)){
    res = out[[i]]
    res = res[res$pval<0.05,]

    df = as.data.frame(unique(unname(unlist(res$leadingEdge))))
    colnames(df) = 'gene'
    write.csv(df, paste0(paste0('./processed_data/for_plotting/leading_edge_', i), '.csv'))
}

In [16]:
temp = out[['Opc']]
temp = temp[order(temp$pval,decreasing=FALSE),]
temp[temp$pval<0.05,]

x,pathway,pval,padj,log2err,ES,NES,size,leadingEdge
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<list>
Opc,Aryl Hydrocarbon Receptor Pathway WP2873,0.002572431,0.6554441,0.4317077,-0.7207746,-1.837274,14,"PTGES3, ...."
Opc,ncRNAs involved in STAT3 signaling in hepatocellular carcinoma WP4337,0.005053618,0.6554441,0.4070179,0.8148782,1.682967,7,"IL6ST, J...."
Opc,Genes related to primary cilium development (based on CRISPR) WP4536,0.005215735,0.6554441,0.4070179,-0.4228744,-1.574863,72,"CEP162, ...."
Opc,Calcium Regulation in the Cardiac Cell WP536,0.010014804,0.8365105,0.3807304,-0.4167157,-1.562717,77,"SLC8A3, ...."
Opc,NLR Proteins WP288,0.011906788,0.8365105,0.3807304,0.8512859,1.605635,5,"MAP3K7, ...."
Opc,The human immune response to tuberculosis WP4197,0.013313164,0.8365105,0.3807304,0.6736173,1.713088,13,"IFNAR1, ...."
Opc,Iron metabolism in placenta WP2007,0.017136413,0.9006333,0.3524879,0.7922463,1.57965,6,"FTH1, TF"
Opc,Cholesterol Biosynthesis Pathway WP197,0.028558719,0.9006333,0.3524879,0.6690772,1.620472,11,"IDI1, MS...."
Opc,Type III interferon signaling WP2113,0.028759138,0.9006333,0.3524879,0.7700633,1.53542,6,"IL10RB, JAK1"
Opc,Thymic Stromal LymphoPoietin (TSLP) Signaling Pathway WP2203,0.02970142,0.9006333,0.3524879,0.5435175,1.5897,22,"NFKBIA, ...."
