**Process some of the external (non-single-cell) datasets for plotting**

#### ABCA7 protein levels by genotype

In [32]:
# initial check: report the tissue recorded for the TMT proteomics specimens
meta <- read.csv("./processed_data/rosmap_proteomics//0.Traits-AGE_CENSORED.csv")
biospecimen <- read.csv("./raw_data/metadata/ROSMAP_biospecimen_metadata.csv")

# keep only the TMT quantitation assay rows and key them by specimen ID
biospecimen_subset <- biospecimen[biospecimen$assay == "TMT quantitation", ]
rownames(biospecimen_subset) <- biospecimen_subset$specimenID

# reorder/restrict to the specimens listed in the traits table, then drop the
# rows whose individualID is "GISpool"
biospecimen_subset <- biospecimen_subset[meta$SpecimenID, ]
biospecimen_subset <- biospecimen_subset[biospecimen_subset$individualID != "GISpool", ]

paste0("tissue used for TMT proteomics = ", unique(biospecimen_subset$tissue))

In [33]:
# load the proteomics matrix (row names taken from column "X"), the traits
# table, and the LoF genotype / summary tables
data <- read.csv(
  "./processed_data/rosmap_proteomics//3.cleanDat.csv",
  row.names = "X"
)
meta <- read.csv("./processed_data/rosmap_proteomics//0.Traits-AGE_CENSORED.csv")
# check.names is disabled so column headers are kept exactly as written in the file
all_samples_lof_genotypes <- read.csv(
  "./processed_data/rosmap_proteomics/all_samples_lof_genotypes.csv",
  check.names = FALSE
)
all_samples_lof_summary <- read.csv(
  "./processed_data/rosmap_proteomics/all_samples_lof_summary.csv",
  row.names = "X"
)

In [34]:
# join the traits table with the LoF summary on projid (inner join: merge()
# keeps only projids present in both tables by default)
all_meta <- merge(x = meta, y = all_samples_lof_summary, by = "projid")

In [35]:
# rows of the LoF genotype table annotated to ABCA7
temp <- all_samples_lof_genotypes[all_samples_lof_genotypes$GENE == "ABCA7", ]
# genotype columns whose header is the projid of a sample with ABCA7LoF == 1
temp2 <- temp[, colnames(temp) %in% all_meta[all_meta$ABCA7LoF == 1, "projid"]]
# variants carried as "0/1" by at least one of those samples
index <- rowSums(temp2 == "0/1") > 0
# first 18 columns of the genotype table side by side with the carriers' genotypes
var_info <- cbind(temp[index, 1:18], temp2[index, ])

In [36]:
all_data <- readRDS("./processed_data/single_cell/stats_input_data.rds")
summary <- all_data$summary

# plot genes of interest
# key the merged metadata by SampleID and keep only samples that have a column
# in the proteomics matrix
rownames(all_meta) <- all_meta$SampleID
all_meta <- all_meta[rownames(all_meta) %in% colnames(data), ]
# grp flags samples whose projid is a row name of the single-cell summary table
all_meta$grp <- all_meta$projid %in% rownames(summary)

# proteomics rows whose name starts with "ABCA7" / "RBFOX3", samples as rows
df <- as.data.frame(t(data[startsWith(rownames(data), "ABCA7"), all_meta$SampleID]))
df$RBFOX3 <- t(data[startsWith(rownames(data), "RBFOX3"), all_meta$SampleID])
df$LOF <- all_meta[rownames(df), "ABCA7LoF"]
df$grp <- all_meta[rownames(df), "grp"]

# NOTE(review): the four names below assume exactly one row matched each
# prefix above -- confirm against the proteomics row names
colnames(df) <- c("ABCA7", "RBFOX3", "LOF", "grp")
df$projid <- all_meta[rownames(df), "projid"]
# drop samples with a missing value in any column
df <- na.omit(df)

In [37]:
library(reshape2)
# long table of (HGVS_C, projid column, genotype) for the listed projid columns
temp <- melt(
  var_info[, c(
    "HGVS_C",
    "84653463", "20201891", "20201927", "50403446", "71648351", "50105301"
  )],
  id.vars = "HGVS_C"
)
# keep the "0/1" entries only and key them by projid column name
temp <- temp[temp$value == "0/1", ]
rownames(temp) <- temp$variable
# attach the HGVS_C label to each sample; projids not in temp get NA
df$var <- temp[as.character(df$projid), "HGVS_C"]

In [38]:
# save the per-sample ABCA7/RBFOX3 table for plotting
write.csv(x = df, file = "./processed_data//for_plotting/ABCA7_proteomics.csv")

In [46]:
# cross-tabulate the grp flag against the variant label
table(df[["grp"]], df[["var"]])

       
        c.2126_2132delAGCAGGG c.3255G>A c.5570+5G>C
  FALSE                     0         0           1
  TRUE                      1         1           2

#### Marker genes

In [1]:
library(reshape2)
library(SingleCellExperiment)
library(tidyr)

Loading required package: SummarizedExperiment

Loading required package: MatrixGenerics

Loading required package: matrixStats


Attaching package: ‘MatrixGenerics’


The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOrderStats, rowProds, rowQuantiles, rowRanges

In [2]:
# load the single-cell experiment object
ace_dir <- "./processed_data/single_cell/sce.rds"
ace <- readRDS(file = ace_dir)

In [3]:
marker_genes <- c(
  "SYT1", "NRGN", "GAD1", "AQP4", "CSF1R",
  "MBP", "PLP1", "VCAN", "PDGFRB", "FLT1"
)
print("getting marker genes")
# logcounts rows for the marker genes, all cells
marker_logcounts <- logcounts(ace)[marker_genes, ]
print("melting")
# long format: one row per (gene, cell) pair
marker_logcounts_melted <- melt(as.matrix(marker_logcounts))

# look up each cell's annotations2 label via the melted cell identifier (Var2)
marker_logcounts_melted$celltype <- colData(ace)[marker_logcounts_melted$Var2, "annotations2"]

# per-cell table of projid, annotation and LoF status
# NOTE(review): cbind() coerces the three columns to one common type before
# as.data.frame() -- confirm this is intended
df <- as.data.frame(cbind(
  ace@colData$projid,
  ace@colData$annotations2,
  ace@colData$ABCA7LoF
))
colnames(df) <- c("projid", "celltype", "LOF")
# factor levels in sorted order of the observed cell types
cells <- unique(df$celltype)
df$celltype <- factor(df$celltype, levels = cells[order(cells)])

[1] "getting marker genes"
[1] "melting"


In [4]:
# save the per-cell annotation table and the melted marker logcounts
write.csv(x = df, file = "./processed_data/for_plotting/celltype_annos_qc.csv")
write.csv(
  x = marker_logcounts_melted,
  file = "./processed_data/for_plotting/marker_logcounts_melted.csv"
)

In [8]:
# ---- Individual- and cluster-level correlations + cell-count QC tables ----
# Requires `ace` (loaded above) plus reshape2::melt and tidyr::separate / %>%.
all_data <- readRDS("./processed_data/single_cell/stats_input_data_0825.rds")

# show individual-level correlation plot
# drop columns containing any NA before correlating; drop = FALSE keeps a
# matrix even if a single column survives
logcounts_ind <- all_data$av_logcounts_by_ind_full_matrix
logcounts_ind <- logcounts_ind[, colSums(is.na(logcounts_ind)) == 0, drop = FALSE]
ind_cor <- cor(logcounts_ind)

# Same for the cluster-level matrix. The filtered matrix is assigned back to
# logcounts_clust (it was previously written to logcounts_ind, so the NA
# filter never reached cor() and the individual-level matrix was overwritten).
logcounts_clust <- all_data$av_logcounts_by_cluster_full_matrix
logcounts_clust <- logcounts_clust[, colSums(is.na(logcounts_clust)) == 0, drop = FALSE]
# prefix of each column name up to the first "."; not used below, kept in
# case later cells read it
anno_names <- vapply(
  strsplit(colnames(logcounts_clust), "[.]"),
  function(parts) parts[[1]],
  character(1)
)
clust_cor <- cor(logcounts_clust)

# show cross correlations
# lower triangle (no diagonal) of the individual-level correlation matrix in
# long format; both melts are column-major so the logical mask lines up
df <- melt(ind_cor)[melt(lower.tri(ind_cor, diag = FALSE))$value, ]
# column names are split on "." into a cell type part and a projid part
df <- df %>%
  separate("Var1", c("celltype1", "projid1"), sep = "[.]") %>%
  separate("Var2", c("celltype2", "projid2"), sep = "[.]")
# keep pairs of different individuals within the same cell type
df_subset <- df[(df$projid1 != df$projid2) & (df$celltype1 == df$celltype2), ]
df_subset$value <- as.numeric(df_subset$value)

# order cell types by decreasing mean correlation
# (`order` is kept as a variable name for compatibility; calls to order()
# still resolve to the base function)
means <- aggregate(df_subset$value, list(df_subset$celltype1), "mean")
order <- means[order(means$x, decreasing = TRUE), "Group.1"]
df_subset$celltype1 <- factor(df_subset$celltype1, levels = order)

# cells per (individual, cell type); shared by the three tables below
x <- as.matrix(table(ace$projid, ace$annotations2))

# show median number of cells per subject detected
df <- as.data.frame(apply(x, 2, median))
colnames(df) <- "median"
df$celltype <- rownames(df)
df$celltype <- factor(df$celltype, levels = df$celltype[order(df$median, decreasing = TRUE)])

# show number of individuals with at least 10 cells per cell type
df1 <- as.data.frame(colSums(x >= 10))
colnames(df1) <- "N"
df1$celltype <- rownames(df1)
df1$celltype <- factor(df1$celltype, levels = df1$celltype[order(df1$N, decreasing = TRUE)])

# show N subjects with no cells detected
df2 <- as.data.frame(colSums(x == 0))
colnames(df2) <- "N"
# take the labels from df2 itself rather than from df
df2$celltype <- rownames(df2)
df2$celltype <- factor(df2$celltype, levels = df2$celltype[order(df2$N, decreasing = TRUE)])


In [11]:
# bundle the correlation matrices and QC tables for plotting
data <- list(
  ind_cor = ind_cor,
  clust_cor = clust_cor,
  cross_cors = df_subset,
  median_cells = df,
  N_cells = df1,
  no_cells = df2
)

In [12]:
# write the bundled QC objects to disk
saveRDS(object = data, file = "./processed_data/for_plotting/celltype_anno_counts.rds")

#### Lipidomics

In [None]:
source('./ABCA7lof2//prep_data.r')

In [None]:
# load and pre-process the data
data <- read.csv("./raw_data/ngn2_data/1096.SUB12877_lipidXData (1).csv")
meta <- as.data.frame(
  read_excel("./raw_data/ngn2_data/9033.NGN2 lipidomics_05102023.xlsx")
)

# fixed column ranges of the export: lipid annotation, per-sample areas, stats
lipid_metadata <- data[, 1:12]
areas <- data[, 43:57]
stats <- data[, 222:233]

# rebuild sample names from the 2nd and 3rd "."-separated fields of each
# area column header
# NOTE(review): lapply() returns a list, so meta$sample_names becomes a list
# column -- confirm a character vector is not expected downstream
x <- strsplit(colnames(areas), "[.]")
sample_names <- lapply(x, function(i) paste0(i[2], ".", i[3]))
meta$sample_names <- sample_names
colnames(areas) <- sample_names

# key samples by sample name and every lipid-level table by lipid name
rownames(meta) <- meta$sample_names
rownames(lipid_metadata) <- lipid_metadata$name
rownames(stats) <- lipid_metadata$name
rownames(areas) <- lipid_metadata$name

# assemble the experiment object and pass it through get_fatty_acid_info()
sce_ngn2 <- SingleCellExperiment(
  list(counts = areas),
  colData = meta,
  rowData = list(lipid = lipid_metadata, stats = stats)
) %>%
  get_fatty_acid_info(., "lipid.fattyacid", "lipid.class")

# compute_stats() on the samples whose treatment is not "Choline", then append
# the renamed result columns to the row data of the full object
sce <- sce_ngn2[, colData(sce_ngn2)$treatment != "Choline"]
df <- compute_stats(sce, "Control", "ABCA7 LoF")
rownames(df) <- df$name
colnames(df) <- c("name", "pvals_both_batch", "logfc_both_batch", "score_both_batch")
rowData(sce_ngn2) <- cbind(
  rowData(sce_ngn2),
  df[
    rownames(rowData(sce_ngn2)),
    c("pvals_both_batch", "logfc_both_batch", "score_both_batch")
  ]
)

In [None]:
# same for PM data
# tab-separated file; check.names = FALSE keeps the column headers verbatim
data <- read.csv(
  "./processed_data//postmortem_lipdiomics/3223.SUB12418_LipidomicsData_sheet1_modified.csv",
  sep = "\t",
  check.names = FALSE
)
# fixed column ranges: lipid annotation, per-sample areas, stats
lipid_metadata <- data[, 1:11]
areas <- data[, 12:27]
stats <- data[, 28:34]

# sample sheet keyed by the first two "."-separated fields of Label, pasted
# together, then reordered to match the area columns
meta <- read.csv("./raw_data/metadata/lipidomic_sample_code.csv")
x <- strsplit(meta$Label, "[.]")
rownames(meta) <- lapply(x, function(i) paste0(i[1], i[2]))
meta <- meta[colnames(areas), ]

# key every lipid-level table by lipid name
rownames(lipid_metadata) <- lipid_metadata$name
rownames(stats) <- lipid_metadata$name
rownames(areas) <- lipid_metadata$name

# assemble the experiment object and pass it through get_fatty_acid_info()
sce_pm <- SingleCellExperiment(
  list(counts = areas),
  colData = meta,
  rowData = list(lipid = lipid_metadata, stats = stats)
) %>%
  get_fatty_acid_info(., "lipid.fattyacid", "lipid.key")
                      

In [None]:
# save both lipidomics objects in one list
saveRDS(
  object = list(ngn2_all = sce_ngn2, pm_all = sce_pm),
  file = "./processed_data/for_plotting/lipidomics.rds"
)

#### NGN2 CRISPRi

In [1]:
source('./ABCA7lof2//degs.r')

library(tidyr)
library(fgsea)
library(ComplexHeatmap)


In [2]:
# pseudobulk DEG results (degs_all entry) and the scores derived from them by
# get_deg_scores(); all_data holds the stats input object
degs <- readRDS("./processed_data/single_cell/pseudobulk_degs_0825.rds")$degs_all
scores <- get_deg_scores(degs)
all_data <- readRDS("./processed_data/single_cell/stats_input_data_0825.rds")

In [3]:
# CRISPRi RNA-seq table read from the crisprbrain folder
df0 <- read.csv(
  file = "../ABCA7lof//processed_data/crisprbrain//Glutamatergic Neuron-RNA-Seq-CRISPRi-2020.csv"
)


In [5]:
# one table per value of the `name` column
all_genes <- split(df0, df0$name)
gene_names <- names(all_genes)
# restrict to names listed in all_data$expressed25$Ex
gene_names <- gene_names[gene_names %in% all_data$expressed25$Ex]


In [6]:
# rows of data_s8 flagged is_gene == "True", grouped into one vector of
# `description` values per cluster (list names are the cluster ids as strings)
paths <- read.csv("./supplementary_tables/data_s8.csv")
paths <- paths[paths$is_gene == "True", ]
P <- list()
for (i in unique(paths$cluster)) {
  P[[as.character(i)]] <- paths[paths$cluster == i, "description"]
}

In [20]:
# ---- fgsea of each CRISPRi knock-down signature against the clusters in P ----

# Select targets whose absolute score in scores$Ex$scores exceeds 0.5, ordered
# by decreasing magnitude. Targets without a score (NA) are dropped explicitly;
# otherwise `x > .5` yields NA and an NA name enters the loop below.
x <- abs(scores$Ex$scores[gene_names])
x <- x[order(x, decreasing = TRUE)]
gene_names <- names(x[!is.na(x) & x > 0.5])

# Per target: signed -log10(p) for every measured gene, named by gene.
out <- list()
for (g in gene_names) {
  x <- sign(all_genes[[g]]$Log2FC) * -log10(all_genes[[g]]$P.Value)
  names(x) <- all_genes[[g]]$Gene
  # keep rows with Log2CPM > 0
  x <- x[all_genes[[g]]$Log2CPM > 0]
  # drop entries whose statistic value repeats an earlier one
  x <- x[!duplicated(x)]
  x <- x[order(x, decreasing = TRUE)]
  # the perturbed gene itself is excluded from its own ranking
  x <- x[names(x) != g]
  out[[g]] <- x
}

# Run fgsea per target; cbind() prepends the target name as column `x`
# (the column name comes from the argument name, so it must stay `x`).
O <- lapply(
  names(out),
  function(x) {
    cbind(
      x,
      fgsea(
        pathways = P, stats = out[[x]],
        minSize = 5, maxSize = 1000, nproc = 1, nPermSimple = 10000
      )
    )
  }
)
names(O) <- names(out)

# targets x pathways matrix of signed -log10(p)
df <- do.call("rbind", O)
df$score <- sign(df$NES) * -log10(df$pval)
df <- df %>% pivot_wider(values_from = score, names_from = pathway, id_cols = x)
df <- as.data.frame(df)
rownames(df) <- df$x
df$x <- NULL

# Targets with at least one |score| > 3. na.rm = TRUE so that a missing score
# in one pathway column does not turn the whole row count into NA.
x <- rowSums(abs(df) > 3, na.rm = TRUE)
N <- names(x[x > 0])

options(repr.plot.width = 3.5, repr.plot.height = 10)

# Draw explicitly so the figure reaches the pdf device without relying on
# auto-printing; as.matrix() avoids Heatmap's data-frame conversion warning.
pdf("./pdf_figures/Extended_Figure_11_hmap.pdf", width = 3.5, height = 5)
draw(Heatmap(as.matrix(df[N, ]), cluster_columns = TRUE))
dev.off()








































“The input is a data frame-like object, convert it to a matrix.”


In [26]:
# export the targets x pathways score table as supplementary data
write.csv(x = df, file = "./supplementary_tables/data_s11.csv")