**Format data for input to stats analysis**


In [1]:
source('./ABCA7lof2/prep_data.r')
library(tidyr)
library('SingleCellExperiment')

“package ‘tidyr’ was built under R version 3.6.3”
Loading required package: SummarizedExperiment
Loading required package: GenomicRanges
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    Filter, Find, Map, Position, Reduce, anyDuplicated, append,
    as.data.frame, basename, cbind, colnames, dirname, do.call,
    duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
    lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
    pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,
    tapply

In [2]:
# Notebook-wide settings:
#   order       - vector of labels (presumably the cell-type display order)
#   sce         - object read from the processed single-cell RDS file
#   output_path - RDS file that the formatted data are written to at the end
order <- c("Ex", "In", "Ast", "Mic", "Oli", "Opc", "Vascular")
output_path <- "./processed_data/single_cell/stats_input_data_0825.rds"
sce <- readRDS("./processed_data/single_cell/sce.rds")

In [9]:
# Totals of raw counts and of log-counts for every (annotations2, projid)
# combination. Each cell is tagged "<annotations2>.<projid>" and passed to
# sum_counts() (not defined in this notebook; presumably from the sourced
# prep_data.r), which per the original note sums via matrix multiplication.
print("summing...")
meta <- colData(sce)
# The grouping expression is kept inline on purpose: as.data.frame() derives
# the column name from the expression it receives.
labels <- as.data.frame(as.character(interaction(meta$annotations2, meta$projid)))
cell_labels <- rownames(meta)
summed_counts_cellxind <- sum_counts(assay(sce, "counts"), labels, cell_labels)
summed_logcounts_cellxind <- sum_counts(assay(sce, "logcounts"), labels, cell_labels)

[1] "summing..."


In [10]:
# Totals of raw counts and of log-counts for every (leiden cluster, individual)
# pair; groups are labelled "<leiden_clusters>.<projid>".
labels <- as.data.frame(as.character(interaction(meta$leiden_clusters, meta$projid)))
summed_counts_cellxcluster <- sum_counts(assay(sce, "counts"), labels, cell_labels)
summed_logcounts_cellxcluster <- sum_counts(assay(sce, "logcounts"), labels, cell_labels)

In [11]:
# Log-count totals per annotations2 label, pooling the cells of all individuals.
summed_logcounts_cell <- sum_counts(
  assay(sce, "logcounts"),
  label = as.data.frame(meta$annotations2),
  cell_labels
)

In [12]:
# Convert the summed log-count matrices into averages. For a sum_counts()
# result, every row of $summed_counts is divided element-wise by $ncells
# (presumably the number of cells behind each column -- confirm in prep_data.r).
average_over_cells <- function(summed) {
  t(apply(summed$summed_counts, 1, function(row_totals) {
    row_totals / summed$ncells
  }))
}

avs_logcounts_cellxcluster <- average_over_cells(summed_logcounts_cellxcluster)
avs_logcounts_cellxind <- average_over_cells(summed_logcounts_cellxind)
avs_logcounts_cell <- average_over_cells(summed_logcounts_cell)

In [13]:
# Detection rate per annotations2 label: count, for every row of the raw count
# matrix, the cells with a non-zero count, then divide by $ncells (assumed to
# be the number of cells per label -- confirm in prep_data.r).
counts_nonzero <- assay(sce, "counts") > 0
detected_genes_cell <- sum_counts(
  counts_nonzero,
  label = as.data.frame(meta$annotations2),
  cell_labels
)
fraction_detected_genes_cell <- t(apply(
  detected_genes_cell$summed_counts,
  1,
  function(n_detected) n_detected / detected_genes_cell$ncells
))

In [14]:
# Expressed-gene lists at two cutoffs (0.25 and 0.10), derived by
# get_expressed_genes() from the per-cell-type detection rates.
expressed25 <- get_expressed_genes(fraction_detected_genes_cell, 0.25)
expressed10 <- get_expressed_genes(fraction_detected_genes_cell, 0.10)

In [15]:
# Summarize the experiment by cell type x individual: split the averaged
# log-count matrix into one matrix per cell type, with columns renamed to the
# individual IDs.
print('summarizing experiment by individual...')

# Column names have the form "<celltype>.<individual>" (built with
# interaction() upstream), so the part before the first "." is the cell type
# and the part after it is the individual.
x <- strsplit(colnames(avs_logcounts_cellxind), '[.]')
# vapply() instead of lapply() over 1:length(x): it is type-checked and also
# behaves correctly when there are no columns at all.
celltype <- vapply(x, function(parts) parts[[1]], character(1))
individual <- vapply(x, function(parts) parts[[2]], character(1))
celltype_unique <- unique(celltype)

avs_by_ind_out <- list()
for (i in celltype_unique) {
    index <- celltype == i
    # drop = FALSE keeps a one-column matrix when a cell type is present in a
    # single individual; without it the subset collapses to a plain vector and
    # the colnames<- assignment below fails.
    df <- avs_logcounts_cellxind[, index, drop = FALSE]
    colnames(df) <- individual[index]
    avs_by_ind_out[[i]] <- df
}


[1] "summarizing experiment by individual..."


In [21]:
# Read the per-individual metadata table (row names taken from the projid
# column) and derive two columns:
#   APOE4 - 1 when apoe_genotype is one of 24, 44 or 34, otherwise 0
#   LOF   - copy of the ABCA7LoF column
summary <- read.csv(
  "./raw_data/metadata/single_cell_individual_metadata.csv",
  row.names = "projid"
)
summary$APOE4 <- as.numeric(summary$apoe_genotype %in% c(24, 44, 34))
summary$LOF <- summary$ABCA7LoF

In [32]:
# Some individuals were removed upstream: keep only the metadata rows whose
# sample_id still occurs in the cell-level metadata.
retained_rows <- as.character(summary$sample_id) %in% unique(meta$sample_id)
summary <- summary[retained_rows, ]

In [35]:
# Collect every derived object into one named list for saving.
all_data <- list()

# summed counts and averaged log-counts
all_data$summed_counts_by_ind <- summed_counts_cellxind
all_data$av_logcounts_by_celltype <- avs_logcounts_cell
all_data$av_logcounts_by_ind <- avs_by_ind_out
all_data$av_logcounts_by_ind_full_matrix <- avs_logcounts_cellxind
all_data$av_logcounts_by_cluster_full_matrix <- avs_logcounts_cellxcluster

# detection rates and the gene lists derived from them
all_data$det.rate.celltype <- fraction_detected_genes_cell
all_data$expressed10 <- expressed10
all_data$expressed25 <- expressed25

# per-individual metadata
all_data$summary <- summary

In [36]:
# Write the assembled list to the RDS file named by output_path.
saveRDS(object = all_data, file = output_path)