In [1]:
# load packages
# edgeR (which attaches limma, see the load messages) and sva supply the
# DGEList/voom/lmFit and num.sv/sva calls used by compute_degs below.
library(edgeR)
library(sva)
# NOTE(review): the contents of degs.r are not visible from this notebook;
# if it also defines compute_degs, the definition in the next cell replaces it.
source('../../ABCA7lof2/degs.r')
# Fix the RNG seed so any random-number-dependent steps that follow are reproducible.
set.seed(5)

Loading required package: limma

Loading required package: mgcv

Loading required package: nlme

This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.

Loading required package: genefilter

Loading required package: BiocParallel



In [2]:
# functions
#' Fit a voom/limma model with SVA surrogate variables and rank genes.
#'
#' @param counts.df Count matrix (genes x samples) passed to `DGEList()`.
#' @param mod1 Full design matrix; must contain the coefficient named by `var`.
#' @param mod0 Null design matrix handed to `sva()`.
#' @param var Coefficient of `mod1` (name or index) reported by `topTable()`.
#' @param n.sv Number of surrogate variables. When `NULL` it is estimated with
#'   `num.sv(..., method = "be")`.
#' @return A list with `res1` (the `topTable()` result for `var`, all genes,
#'   sorted by p-value), `C` (the `sv` element returned by `sva()`), and `mod`
#'   (the design matrix actually used for the final fit).
compute_degs <- function(counts.df, mod1, mod0, var, n.sv = NULL)
{
    # Fail fast on a misspelled coefficient name instead of after the SVA run.
    if (is.character(var) && !all(var %in% colnames(mod1))) {
        stop("coefficient '", paste(var, collapse = "', '"),
             "' is not a column of mod1", call. = FALSE)
    }

    dge <- DGEList(counts = counts.df)
    dge <- calcNormFactors(dge)

    # First voom pass (without surrogate variables) provides the log-expression
    # values on which the surrogate variables are estimated.
    v <- voom(dge, design = mod1)
    if (is.null(n.sv)) {
        n.sv <- num.sv(v$E, mod1, method = "be")
    }
    svobj <- sva(v$E, mod1, mod0, n.sv = n.sv)

    # With zero surrogate variables sva() returns sv = 0; binding that scalar
    # to the design would add an all-zero, non-estimable column. Only extend
    # the design (and re-run voom on it) when there is something to add.
    if (n.sv > 0) {
        mod1 <- cbind(mod1, svobj$sv)
        v <- voom(dge, design = mod1)
    }

    fit <- lmFit(v, design = mod1)
    fit <- eBayes(fit)
    res1 <- topTable(fit, coef = var, number = Inf, sort.by = "p")
    list(res1 = res1, C = svobj$sv, mod = mod1)
}

In [3]:
# load Henne Holstege et al data
# (the first two lines of the supplementary CSV are skipped; the next line is the header)
df = read.csv('../../common_variant_data/41588_2022_1208_MOESM4_ESM.csv', skip = 2, header = TRUE)

# merge with ABCA7 variant info
# check.names=FALSE keeps the annotation file's column names verbatim; the
# genotype columns are selected by name prefix further down.
variant_info = read.csv('../../common_variant_data/HIGHandMED_coding_annotations_syn11724057_subset.csv', check.names=FALSE)
# Build a CHROM:POS:REF>ALT key to join against the 'Variant' column of df.
variant_info$Variant = paste0(variant_info$CHROM,':',variant_info$POS,':',variant_info$REF_x, '>', variant_info$ALT_0)
df = df[,c('Variant', 'REVEL', 'LOF')]
# Inner join on the variant key. The LOF.y column used below is the LOF column
# coming from df (merge() adds the .y suffix when both inputs carry the name).
var_info = merge(variant_info, df, by = 'Variant')
var_info = var_info[var_info$GENE=='ABCA7',]

# Keep only columns whose names start with 'ROS', 'MAP' or 'SM'
# (presumably the per-individual WGS sample ids -- TODO confirm).
genotypes = var_info[,unlist(lapply(names(var_info), function(x) sum(startsWith(x, c('ROS', 'MAP', 'SM')))))>0]
# Recode genotype strings to allele counts. Any other string is left as is and
# becomes NA in as.numeric() below, which is what triggers the
# "NAs introduced by coercion" warnings.
genotypes[genotypes=='0/0']=0
genotypes[genotypes=='0/1']=1
genotypes[genotypes=='1/0']=1
genotypes[genotypes=='1/1']=2
# NOTE: df is reused here; from this point on it is the numeric genotype matrix
# (one row per ABCA7 variant, one column per individual).
df = apply(genotypes,2, function(x){as.numeric(x)})

# Per-individual allele total over the variants flagged LOF. as.data.frame()
# names the single column after the deparsed expression,
# 'colSums(df[var_info$LOF.y == 1, ])', and the later filtering cell looks that
# exact name up -- do not rename df or change this expression without updating it.
# NOTE(review): no na.rm, so one missing genotype among the LOF variants makes
# the individual's total NA.
variant_categories = as.data.frame(colSums(df[var_info$LOF.y==1,]))

# Genotype of rs3752246 per individual, then recoded as 2 - count
# (presumably to count the other allele -- TODO confirm which allele is intended).
temp = as.numeric(unlist(genotypes[var_info$ID=='rs3752246',]))                                  
temp = -1*(temp-2)
variant_categories$rs3752246 = temp

“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”


In [4]:
# load our large snRNAseq dataset
data <- readRDS('../../common_variant_data/all_summed_cts.rds')

# add variant data to metadata
meta <- data$meta
# Align variant rows to individuals by exact WGS id. Indexing a data.frame with
# a character vector partially matches row names (and a factor would index by
# level code), so look the rows up with match() instead. Individuals without a
# genotype record still get an all-NA row, as before.
wgs_rows <- match(as.character(meta$WGSid), rownames(variant_categories))
meta <- cbind(meta, variant_categories[wgs_rows, ])

In [5]:
# Exclude every individual that is part of our own snRNAseq cohort, i.e. whose
# projid appears among the row names of the cohort summary table.
all_data <- readRDS('../../processed_data/stats_input_data_0825.rds')

# Per-cell-type lists of expressed genes, used for gene filtering later on.
expressed <- all_data$expressed10

# TRUE for individuals NOT found in the cohort summary.
keep <- is.na(match(meta$projid, rownames(all_data$summary)))

# Apply the same mask to the count columns, the cell counts and the metadata.
counts <- data$counts[, keep]
ncells <- data$ncells[keep]
meta <- meta[keep, ]

In [6]:
# add batch info
batch_ids <- read.csv('../../common_variant_data/Fastq_paths_432_PFC_HM_updated_edited.csv')
# Keep the first record per individual so projid can serve as a unique row name.
batch_ids <- batch_ids[!duplicated(batch_ids$projid), ]
rownames(batch_ids) <- batch_ids$projid
# Look the batch up by exact projid. Character row indexing of a data.frame
# partially matches row names, which could silently assign another
# individual's batch when an id is a prefix of a different id.
# Individuals without a batch record get NA, as before.
# NOTE(review): if Batch is numeric it enters the design as a continuous
# covariate rather than a categorical one -- confirm the column type.
batch_rows <- match(rownames(meta), as.character(batch_ids$projid))
meta$seq_batch <- batch_ids$Batch[batch_rows]

In [7]:
# filter out individuals with too few cells per celltype
# (an individual is kept when its ncells entry is at least 10)
keep = ncells>=10
counts_indexed = counts[,keep]
meta_keep = meta[keep,]
# NOTE(review): if niareagansc is stored as a factor, as.numeric() returns the
# level codes rather than the labels -- confirm the column type.
meta_keep$niareagansc = as.numeric(meta_keep$niareagansc)

# filter
# Keep individuals with no NA in ANY metadata column and a LOF allele total of
# zero. The long column name is the one auto-generated by
# as.data.frame(colSums(...)) when variant_categories was built.
# NOTE(review): the NA check spans all metadata columns, including ones that
# are not part of the model below.
keep = (rowSums(is.na(meta_keep))==0) & !(is.na(meta_keep[,'colSums(df[var_info$LOF.y == 1, ])']) | (meta_keep[,'colSums(df[var_info$LOF.y == 1, ])']>0)) #& meta_keep$niareagansc%in%(unlist(list('3','4')))
meta_keep2 = meta_keep[keep,]
# apoe is 1 when apoe_genotype is 34, 44 or 24, otherwise 0.
meta_keep2$apoe = ifelse(meta_keep2$apoe_genotype%in%c(34,44,24), 1, 0)
counts_keep2 = counts_indexed[,keep]
# Keep genes listed in expressed$Ex that have a non-zero count in more than 15
# of the retained individuals.
counts_keep2 = counts_keep2[rownames(counts_keep2)%in%expressed$Ex & rowSums(counts_keep2>0)>15,]

In [8]:
# Binary flag: 1 when the recoded rs3752246 count is above zero, otherwise 0.
meta_keep2$rs3752246_binary <- as.numeric(meta_keep2$rs3752246 > 0)

# Group sizes for the two levels of the flag.
table(meta_keep2$rs3752246_binary)


  0   1 
227 133 

In [9]:
# DEGs for rs3752246 (binary flag) across all Ex pseudobulk profiles.
# Individuals with a non-zero LOF allele total were already excluded when
# meta_keep2 was built.

meta_keep2$rs3752246_binary <- as.numeric(meta_keep2$rs3752246 > 0)

# Covariates shared by the null and the full model; the full model puts the
# variant flag in front of them.
covariates <- c("amyloid", "nft", "msex", "age_death", "apoe", "pmi",
                "seq_batch", "niareagansc")
mod0 <- model.matrix(reformulate(covariates), data = meta_keep2)
mod1 <- model.matrix(reformulate(c("rs3752246_binary", covariates)),
                     data = meta_keep2)

out <- compute_degs(counts_keep2, mod1, mod0, 'rs3752246_binary')
curr <- out$res1

# Signed significance score per gene: direction of the effect times -log10(p).
scores <- setNames(sign(curr$logFC) * -log10(curr$P.Value), rownames(curr))

write.csv(curr, '../../common_variant_data/degs_rs3752246.csv')

Number of significant surrogate variables is:  21 
Iteration (out of 5 ):1  2  3  4  5  

In [None]:
# NOTE(review): the path ends at the Downloads directory rather than at an .rds
# file, so readRDS() cannot succeed as written. This looks like an unfinished
# draft of the set*_summed.rds loads that follow -- confirm and remove.
d = readRDS('../../../../Downloads/')

In [16]:
# Load the set3 object and display it for inspection. Per the printed output it
# is a list whose summed_counts element is a 33538 x 426 dgeMatrix
# (rows are genes; columns presumably individuals keyed by projid -- confirm).
d = readRDS('../../../../Downloads/set3_summed.rds')
d

$summed_counts
33538 x 426 Matrix of class "dgeMatrix"
             11409232 11336574 10260309 10248033 20207013 20112377 10514454
MIR1302-2HG         0        0        0        0        1        0        0
FAM138A             0        0        0        0        0        0        0
OR4F5               0        0        0        0        0        0        0
AL627309.1          8       14        4       48       82       16       30
AL627309.3          0        0        0        0        0        0        0
AL627309.2          0        0        0        0        0        0        0
AL627309.4          0        0        0        0        0        0        0
AL732372.1          0        0        0        0        0        0        0
OR4F29              0        0        0        0        0        0        0
AC114498.1          0        2        0        1        3        1        2
OR4F16              0        0        0        0        0        0        0
AL669831.2          1        0   

In [10]:
# Repeat the rs3752246 DEG analysis on the set1 summed counts.
d <- readRDS('../../../../Downloads/set1_summed.rds')

# Gene filter: listed in expressed$Ex and non-zero in more than 15 columns.
# NOTE(review): detection is counted over every column of the file, i.e. before
# individuals are restricted to meta_keep2, unlike the filter used for
# counts_keep2 -- confirm this is intended.
gene_ok <- row.names(d$summed_counts) %in% expressed$Ex &
    rowSums(as.matrix(d$summed_counts) > 0) > 15

# Individuals that survived the metadata filtering.
index <- colnames(d$summed_counts) %in% rownames(meta_keep2)

counts <- d$summed_counts[gene_ok, ][, index]
# Metadata rows in the same order as the count columns.
meta <- meta_keep2[colnames(counts), ]

# Null model = shared covariates; full model = variant flag + shared covariates.
covariates <- c("amyloid", "nft", "msex", "age_death", "apoe", "pmi",
                "seq_batch", "niareagansc")
mod0 <- model.matrix(reformulate(covariates), data = meta)
mod1 <- model.matrix(reformulate(c("rs3752246_binary", covariates)),
                     data = meta)

out <- compute_degs(counts, mod1, mod0, 'rs3752246_binary')
curr <- out$res1

# Signed significance score per gene: direction of the effect times -log10(p).
scores <- setNames(sign(curr$logFC) * -log10(curr$P.Value), rownames(curr))

write.csv(curr, '../../common_variant_data/degs_rs3752246_set1.csv')

Number of significant surrogate variables is:  29 
Iteration (out of 5 ):1  2  3  4  5  

In [11]:
# Repeat the rs3752246 DEG analysis on the set2 summed counts.
d <- readRDS('../../../../Downloads/set2_summed.rds')

# Gene filter: listed in expressed$Ex and non-zero in more than 15 columns.
# NOTE(review): detection is counted over every column of the file, i.e. before
# individuals are restricted to meta_keep2, unlike the filter used for
# counts_keep2 -- confirm this is intended.
gene_ok <- row.names(d$summed_counts) %in% expressed$Ex &
    rowSums(as.matrix(d$summed_counts) > 0) > 15

# Individuals that survived the metadata filtering.
index <- colnames(d$summed_counts) %in% rownames(meta_keep2)

counts <- d$summed_counts[gene_ok, ][, index]
# Metadata rows in the same order as the count columns.
meta <- meta_keep2[colnames(counts), ]

# Null model = shared covariates; full model = variant flag + shared covariates.
covariates <- c("amyloid", "nft", "msex", "age_death", "apoe", "pmi",
                "seq_batch", "niareagansc")
mod0 <- model.matrix(reformulate(covariates), data = meta)
mod1 <- model.matrix(reformulate(c("rs3752246_binary", covariates)),
                     data = meta)

out <- compute_degs(counts, mod1, mod0, 'rs3752246_binary')
curr <- out$res1

# Signed significance score per gene: direction of the effect times -log10(p).
scores <- setNames(sign(curr$logFC) * -log10(curr$P.Value), rownames(curr))

write.csv(curr, '../../common_variant_data/degs_rs3752246_set2.csv')

Number of significant surrogate variables is:  26 
Iteration (out of 5 ):1  2  3  4  5  

In [12]:
# Repeat the rs3752246 DEG analysis on the set3 summed counts.
d <- readRDS('../../../../Downloads/set3_summed.rds')

# Gene filter: listed in expressed$Ex and non-zero in more than 15 columns.
# NOTE(review): detection is counted over every column of the file, i.e. before
# individuals are restricted to meta_keep2, unlike the filter used for
# counts_keep2 -- confirm this is intended.
gene_ok <- row.names(d$summed_counts) %in% expressed$Ex &
    rowSums(as.matrix(d$summed_counts) > 0) > 15

# Individuals that survived the metadata filtering.
index <- colnames(d$summed_counts) %in% rownames(meta_keep2)

counts <- d$summed_counts[gene_ok, ][, index]
# Metadata rows in the same order as the count columns.
meta <- meta_keep2[colnames(counts), ]

# Null model = shared covariates; full model = variant flag + shared covariates.
covariates <- c("amyloid", "nft", "msex", "age_death", "apoe", "pmi",
                "seq_batch", "niareagansc")
mod0 <- model.matrix(reformulate(covariates), data = meta)
mod1 <- model.matrix(reformulate(c("rs3752246_binary", covariates)),
                     data = meta)

out <- compute_degs(counts, mod1, mod0, 'rs3752246_binary')
curr <- out$res1

# Signed significance score per gene: direction of the effect times -log10(p).
scores <- setNames(sign(curr$logFC) * -log10(curr$P.Value), rownames(curr))

write.csv(curr, '../../common_variant_data/degs_rs3752246_set3.csv')

Number of significant surrogate variables is:  22 
Iteration (out of 5 ):1  2  3  4  5  