# eQTL in Epigenetic Data

**Created**: 18 February 2022

The goal here is to perform a simple comparison of *cis*-eQTL detected in GAinS with existing epigenomic datasets. I will be using the published results (supplementary tables) rather than data I have processed myself. This includes differentially accessible (DA) peaks from Calderon *et al.* and Ram-Mohan *et al.*, and chromatin accessibility QTL (caQTL) from Calderon *et al.*

## Environment

In [51]:
# Install each Bioconductor dependency only when its namespace cannot be loaded.
invisible(lapply(c("GenomicRanges", "biomaRt"), function(bioc.pkg) {
    if (!requireNamespace(bioc.pkg)) {
        BiocManager::install(bioc.pkg)
    }
}))

Loading required namespace: biomaRt



In [52]:
library(tidyverse)
library(data.table)
library(GenomicRanges)
library(biomaRt)

# Work from the project root: the data and script paths used below are
# relative to this directory.
setwd("~/eQTL_pQTL_Characterization/")

# NOTE(review): presumably defines the shared ggplot theme; none of the visible
# cells plot anything, so confirm this is still needed here.
source("03_Functional_Interpretation/scripts/utils/ggplot_theme.R")

## Load Data

In [35]:
# Raw Calderon et al. supplementary table holding the DA peaks.
calderon.da.peaks <- data.table::fread(
    "03_Functional_Interpretation/data/41588_2019_505_MOESM6_ESM"
)

In [36]:
# Tidy the Calderon et al. DA peak table.
#   - `peak_id` is parsed as "<chr>_<start>_<end>" and rewritten as
#     "<chr>:<start>-<end>" (chromosome without the "chr" prefix).
#   - `cell_type` is the contrast with everything from "_S-" onwards removed.
#   - Only autosomes (1-22) are kept. The filter runs on the character label
#     BEFORE the numeric conversion so that non-autosomal labels are dropped
#     instead of being coerced to NA with a warning.
#   - vapply() pins the extracted fields to one string per peak.
calderon.da.peaks <- calderon.da.peaks %>%
    as.data.frame() %>%
    dplyr::mutate(chr=gsub("chr", "", gsub("_.*", "", peak_id))) %>%
    dplyr::mutate(start=as.numeric(vapply(strsplit(peak_id, "_"), function(x) { x[2] }, character(1)))) %>%
    dplyr::mutate(end=as.numeric(vapply(strsplit(peak_id, "_"), function(x) { x[3] }, character(1)))) %>%
    dplyr::mutate(cell_type=gsub("_S-.*", "", contrast)) %>%
    dplyr::mutate(peak_id=paste0(chr, ":", start, "-", end)) %>%
    dplyr::filter(chr %in% as.character(1:22)) %>%
    dplyr::mutate(chr=as.numeric(chr)) %>%
    dplyr::select(peak_id, chr, start, end, cell_type, logFC, pval=adj.P.Val)

In [37]:
# Preview the first rows of the tidied DA peak table.
head(calderon.da.peaks)

Unnamed: 0_level_0,peak_id,chr,start,end,cell_type,logFC,pval
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>
1,1:101414421-101416252,1,101414421,101416252,Bulk_B,-1.588574,5.706523e-06
2,1:101553654-101554433,1,101553654,101554433,Bulk_B,2.756835,4.359877e-06
3,1:101875468-101877095,1,101875468,101877095,Bulk_B,1.924191,0.0001650887
4,1:105307605-105308001,1,105307605,105308001,Bulk_B,2.668101,5.036673e-05
5,1:108073851-108074235,1,108073851,108074235,Bulk_B,-3.099729,0.002662072
6,1:108475984-108478550,1,108475984,108478550,Bulk_B,1.297943,0.006296819


In [38]:
# Raw Calderon et al. supplementary table holding the caQTL.
calderon.ca.qtl <- data.table::fread(
    "03_Functional_Interpretation/data/41588_2019_505_MOESM8_ESM"
)

In [39]:
# Tidy the Calderon et al. caQTL table: strip the "chr" prefix, keep autosomes
# (1-22) only, and move `chr` and `pos` to the front.
# The filter runs on the character label BEFORE the numeric conversion so that
# non-autosomal labels are dropped instead of being coerced to NA with a warning.
calderon.ca.qtl <- calderon.ca.qtl %>%
    as.data.frame() %>%
    dplyr::mutate(chr=gsub("chr", "", chr)) %>%
    dplyr::filter(chr %in% as.character(1:22)) %>%
    dplyr::mutate(chr=as.numeric(chr)) %>%
    dplyr::select(chr, pos, everything())

In [40]:
# Preview the first rows of the tidied caQTL table.
head(calderon.ca.qtl)

Unnamed: 0_level_0,chr,pos,Phenotype,dbSNP134_id,gwas_pvalue,PMID,TotalDiscoverySamples,donor,cell,stim,⋯,peak_id_atac,contrast_atac,logFC_atac,adj.P.Val_atac,nearby_de_gene_id,contrast_rna,logFC_rna,adj.P.Val_rna,tested_TF,ref_minus_alt_match
Unnamed: 0_level_1,<dbl>,<int>,<chr>,<int>,<dbl>,<int>,<int>,<int>,<chr>,<lgl>,⋯,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<dbl>
1,1,204434927,Transmission distortion,12092943,9.141e-10,22377632,4728,1002,Mem_B,True,⋯,chr1_204434103_204436273,Mem_B_S-Mem_B_U,1.606535,0.0002127197,,,,,ENSG00000008196_LINE2_TFAP2B_D_N1,-5.1125152
2,1,204434927,Maternal transmission distortion,12092943,3.252e-09,22377632,4728,1002,Mem_B,True,⋯,chr1_204434103_204436273,Mem_B_S-Mem_B_U,1.606535,0.0002127197,,,,,ENSG00000008196_LINE2_TFAP2B_D_N1,-5.1125152
3,1,204434927,Transmission distortion,12092943,9.141e-10,22377632,4728,1002,Mem_B,True,⋯,chr1_204434103_204436273,Mem_B_S-Mem_B_U,1.606535,0.0002127197,,,,,ENSG00000008196_LINE2_TFAP2B_D_N1,-5.1125152
4,1,204434927,Maternal transmission distortion,12092943,3.252e-09,22377632,4728,1002,Mem_B,True,⋯,chr1_204434103_204436273,Mem_B_S-Mem_B_U,1.606535,0.0002127197,,,,,ENSG00000008196_LINE2_TFAP2B_D_N1,-5.1125152
5,15,45740392,Serum creatinine,9806699,2.56e-13,20383146,67093,1001,Myeloid_DCs,False,⋯,chr15_45739896_45742558,pDCs_U-Myeloid_DCs_U,-2.623013,6.295777e-06,,,,,ENSG00000008196_LINE2_TFAP2B_D_N1,0.6583003
6,15,45740392,Serum creatinine,9806699,2.56e-13,20383146,67093,1002,Myeloid_DCs,False,⋯,chr15_45739896_45742558,pDCs_U-Myeloid_DCs_U,-2.623013,6.295777e-06,,,,,ENSG00000008196_LINE2_TFAP2B_D_N1,0.6583003


The supplementary table from Ram-Mohan *et al.* is in Excel format. The Excel spreadsheet has 9 sheets; each sheet is read separately and labelled with its sheet name.

In [41]:
# Names of all sheets in the Ram-Mohan et al. supplementary workbook.
sheets <- readxl::excel_sheets("03_Functional_Interpretation/data/Ram_Mohan_et_al_2022_Table_S1.xlsx")

# Read every sheet, tag its rows with the sheet name (minus the "-DR" suffix),
# and stack the per-sheet tables into a single one.
ram.mohan.da.peaks <- do.call(
    rbind,
    lapply(sheets, function(sheet.name) {
        sheet.peaks <- readxl::read_xlsx(
            "03_Functional_Interpretation/data/Ram_Mohan_et_al_2022_Table_S1.xlsx",
            sheet=sheet.name
        )
        dplyr::mutate(sheet.peaks, Stimulation=gsub("-DR", "", sheet.name))
    })
)

In [44]:
# Tidy the Ram-Mohan et al. DA peaks: build "<chr>:<start>-<end>" peak IDs,
# keep autosomes (1-22) only, and reduce to the columns used downstream.
ram.mohan.da.peaks <- ram.mohan.da.peaks %>%
    dplyr::mutate(
        chr=gsub("chr", "", Chromosome),
        peak_id=paste0(chr, ":", Start, "-", End)
    ) %>%
    dplyr::filter(chr %in% as.character(1:22)) %>%
    dplyr::transmute(
        peak_id,
        chr=as.numeric(chr),
        start=Start,
        end=End,
        stimulation=Stimulation,
        logFC=log2FC,
        pval=`p-value`
    )

In [45]:
# Preview the first rows of the tidied Ram-Mohan et al. DA peaks.
head(ram.mohan.da.peaks)

peak_id,chr,start,end,stimulation,logFC,pval
<chr>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>
1:756651-756892,1,756651,756892,BGP,1.35,0.032
1:1430944-1431234,1,1430944,1431234,BGP,2.48,0.00022
1:2090706-2090915,1,2090706,2090915,BGP,1.52,0.0358
1:2480184-2480571,1,2480184,2480571,BGP,1.15,0.0213
1:3593886-3594335,1,3593886,3594335,BGP,1.28,0.02
1:3658517-3658765,1,3658517,3658765,BGP,1.48,0.00517


In [77]:
# Conditional cis-eQTL results: one row per SNP-gene signal.
cis.eqtl <- readRDS(
    file="~/gains_team282/eqtl/cisresults/conditionalanalysis/conditional_eQTL_results_final.rds"
)

In [78]:
# Preview the first rows; note that beta, SE and p-value are stored as character.
head(cis.eqtl)

Unnamed: 0_level_0,SNP,Gene,eQTL_beta,eQTL_SE,pvalue,Number
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<int>
1,rs10753794,ENSG00000000457,0.0498587312408011,0.0070851585157937,5.19259471106013e-12,1
2,rs10919255,ENSG00000000460,-0.0726264894633498,0.0136853304567293,1.72815915933528e-07,1
3,rs77006036,ENSG00000000460,0.166236949102582,0.0334472538699365,8.47290238610159e-07,2
4,rs12406047,ENSG00000000971,0.17135374034767,0.0379807691167313,7.51523462853891e-06,1
5,rs6696136,ENSG00000001460,0.45884434698694,0.0871081626842823,1.94887370606405e-07,1
6,rs6676449,ENSG00000001460,-0.371531778822554,0.0880672792106829,2.8471651589813e-05,2


## SNP Positions in hg19

Both Calderon *et al.* and Ram-Mohan *et al.* use hg19 coordinates. Since we have the *cis*-eQTL SNPs and their RefSNP IDs (rsIDs), we can look up their locations in hg19 for direct comparison.

In [73]:
# Connect to the human SNP mart of Ensembl release 75, used here to obtain
# hg19 (GRCh37) SNP positions.
ensembl <- biomaRt::useEnsembl(
    biomart="snps",
    dataset="hsapiens_snp",
    version=75
)

In [79]:
# Look up chromosome and position for the cis-eQTL SNPs in the Ensembl SNP mart.
# `cis.eqtl` has one row per SNP-gene signal, so an rsID may occur more than
# once; the IDs are de-duplicated before being sent to BioMart to avoid
# redundant query values. The returned table is unchanged because getBM()
# returns unique rows by default.
hg19.coords <- getBM(
    attributes=c("refsnp_id", "chr_name", "chrom_start"),
    filters="snp_filter",
    values=unique(cis.eqtl$SNP),
    mart=ensembl
)




                                                                      



In [92]:
# Attach hg19 coordinates to the cis-eQTL results, keep autosomal SNPs (1-22),
# rename the columns, and convert the character-encoded statistics to numeric.
cis.eqtl.hg19 <- hg19.coords %>%
    merge(cis.eqtl, by.x="refsnp_id", by.y="SNP") %>%
    dplyr::filter(chr_name %in% as.character(1:22)) %>%
    dplyr::transmute(
        snp=refsnp_id,
        chr=as.numeric(chr_name),
        position=chrom_start,
        gene=Gene,
        beta=as.numeric(eQTL_beta),
        se=as.numeric(eQTL_SE),
        pval=as.numeric(pvalue),
        number=Number
    )

## Overlaps with DA Peaks

### Overlap with Calderon *et al.*

In [198]:
# Represent every cis-eQTL SNP as a width-1 range (start = end = position).
cis.eqtl.ranges <- GenomicRanges::makeGRangesFromDataFrame(
    cis.eqtl.hg19,
    keep.extra.columns=TRUE,
    start.field="position",
    end.field="position"
)

In [199]:
# Genomic ranges for the Calderon et al. DA peaks; the remaining columns are
# carried along as metadata.
da.peaks <- GenomicRanges::makeGRangesFromDataFrame(
    calderon.da.peaks,
    keep.extra.columns=TRUE
)

In [200]:
# Hits pairing each SNP (query) with every DA peak (subject) that contains it.
overlaps <- findOverlaps(query=cis.eqtl.ranges, subject=da.peaks)

In [201]:
# One row per (cis-eQTL SNP, Calderon et al. DA peak) overlap.
# Hit indices are read with the queryHits()/subjectHits() accessors instead of
# the @from/@to slots, and the columns of each side are selected and renamed BY
# NAME before binding. The result therefore does not depend on column positions
# or on the names that would otherwise be duplicated after cbind()
# (seqnames, start, end, pval).
calderon.overlaps <- cbind(
    as.data.frame(cis.eqtl.ranges[queryHits(overlaps)]) %>%
        dplyr::select(
            snp, snp_chr=seqnames, snp_pos_hg19=start,
            eqtl_gene=gene, eqtl_beta=beta, eqtl_se=se, eqtl_pval=pval, eqtl_number=number
        ),
    as.data.frame(da.peaks[subjectHits(overlaps)]) %>%
        dplyr::select(
            peak_id, peak_chr=seqnames, peak_start=start, peak_end=end,
            cell_type, peak_logFC=logFC, peak_pval=pval
        )
)

In [202]:
# Preview the SNP / DA peak overlaps; a SNP is repeated once per overlapping peak row.
head(calderon.overlaps)

Unnamed: 0_level_0,snp,snp_chr,snp_pos_hg19,eqtl_gene,eqtl_beta,eqtl_se,eqtl_pval,eqtl_number,peak_id,peak_chr,peak_start,peak_end,cell_type,peak_logFC,peak_pval
Unnamed: 0_level_1,<chr>,<fct>,<int>,<chr>,<dbl>,<dbl>,<dbl>,<int>,<chr>,<fct>,<int>,<int>,<chr>,<dbl>,<dbl>
1,rs1005645,17,74696760,ENSG00000182534,-1.02246798,0.182494148,3.051127e-08,5,17:74695722-74697119,17,74695722,74697119,Effector_memory_CD8pos_T,-1.419568,0.0004001788
2,rs10099485,8,94938346,ENSG00000164951,0.04715347,0.009831835,2.07728e-06,1,8:94937667-94938491,8,94937667,94938491,Th17_precursors,2.720744,0.002540901
3,rs10109337,8,129000264,ENSG00000249859,-0.06597916,0.015019648,1.346039e-05,2,8:128999163-129001477,8,128999163,129001477,Bulk_B,1.706596,0.0006764532
4,rs10109337,8,129000264,ENSG00000249859,-0.06597916,0.015019648,1.346039e-05,2,8:128999163-129001477,8,128999163,129001477,Follicular_T_Helper,1.107581,0.007921767
5,rs10109337,8,129000264,ENSG00000249859,-0.06597916,0.015019648,1.346039e-05,2,8:128999163-129001477,8,128999163,129001477,Memory_Teffs,1.200628,0.006321209
6,rs10109337,8,129000264,ENSG00000249859,-0.06597916,0.015019648,1.346039e-05,2,8:128999163-129001477,8,128999163,129001477,Naive_B,2.149016,1.665642e-06


In [204]:
# Save the overlaps as a tab-separated file without quoting or row names.
# FALSE is spelled out because the shorthand F is an ordinary, reassignable variable.
write.table(
    calderon.overlaps,
    "~/gains_team282/nikhil/functional_interpretation/cis_eqtl_calderon_et_al_da_peaks_overlaps.tsv",
    quote=FALSE,
    row.names=FALSE,
    sep="\t"
)

### Overlap with Ram-Mohan *et al.*

In [205]:
# Represent every cis-eQTL SNP as a width-1 range (start = end = position).
cis.eqtl.ranges <- GenomicRanges::makeGRangesFromDataFrame(
    cis.eqtl.hg19,
    keep.extra.columns=TRUE,
    start.field="position",
    end.field="position"
)

In [206]:
# Genomic ranges for the Ram-Mohan et al. DA peaks; the remaining columns are
# carried along as metadata.
da.peaks <- GenomicRanges::makeGRangesFromDataFrame(
    ram.mohan.da.peaks,
    keep.extra.columns=TRUE
)

In [207]:
# Hits pairing each SNP (query) with every DA peak (subject) that contains it.
overlaps <- findOverlaps(query=cis.eqtl.ranges, subject=da.peaks)

In [208]:
# One row per (cis-eQTL SNP, Ram-Mohan et al. DA peak) overlap.
# Hit indices are read with the queryHits()/subjectHits() accessors instead of
# the @from/@to slots, and the columns of each side are selected and renamed BY
# NAME before binding. The result therefore does not depend on column positions
# or on the names that would otherwise be duplicated after cbind()
# (seqnames, start, end, pval).
ram.mohan.overlaps <- cbind(
    as.data.frame(cis.eqtl.ranges[queryHits(overlaps)]) %>%
        dplyr::select(
            snp, snp_chr=seqnames, snp_pos_hg19=start,
            eqtl_gene=gene, eqtl_beta=beta, eqtl_se=se, eqtl_pval=pval, eqtl_number=number
        ),
    as.data.frame(da.peaks[subjectHits(overlaps)]) %>%
        dplyr::select(
            peak_id, peak_chr=seqnames, peak_start=start, peak_end=end,
            stimulation, peak_logFC=logFC, peak_pval=pval
        )
)

In [209]:
# Preview the SNP / DA peak overlaps; a SNP is repeated once per overlapping peak row.
head(ram.mohan.overlaps)

Unnamed: 0_level_0,snp,snp_chr,snp_pos_hg19,eqtl_gene,eqtl_beta,eqtl_se,eqtl_pval,eqtl_number,peak_id,peak_chr,peak_start,peak_end,stimulation,peak_logFC,peak_pval
Unnamed: 0_level_1,<chr>,<fct>,<int>,<chr>,<dbl>,<dbl>,<dbl>,<int>,<chr>,<fct>,<int>,<int>,<chr>,<dbl>,<dbl>
1,rs1010502,1,201123745,ENSG00000116857,0.4657811,0.0151438,1.0337130000000001e-128,1,1:201123305-201123828,1,201123305,201123828,EC1h,1.47,0.00414
2,rs1010858,2,39102671,ENSG00000163214,-0.11715346,0.01153675,1.2076650000000001e-22,1,2:39102462-39103410,2,39102462,39103410,HMGB,1.01,0.0254
3,rs1032763,5,17118930,ENSG00000271892,0.11174365,0.00818239,7.653472999999999e-38,1,5:17118788-17119060,5,17118788,17119060,EC1h,1.53,0.0311
4,rs10424044,19,52239177,ENSG00000171049,-0.08013458,0.01596276,6.739691e-07,1,19:52239083-52239807,19,52239083,52239807,R848,1.12,0.0335
5,rs1061307,3,146262344,ENSG00000188313,-0.299729,0.03060203,2.489119e-21,1,3:146262119-146262728,3,146262119,146262728,EC1h,1.12,0.0028
6,rs1061307,3,146262344,ENSG00000188313,-0.299729,0.03060203,2.489119e-21,1,3:146262129-146262646,3,146262129,146262646,EC4h,-1.13,0.0169


In [210]:
# Save the overlaps as a tab-separated file without quoting or row names.
# FALSE is spelled out because the shorthand F is an ordinary, reassignable variable.
write.table(
    ram.mohan.overlaps,
    "~/gains_team282/nikhil/functional_interpretation/cis_eqtl_ram_mohan_et_al_da_peaks_overlaps.tsv",
    quote=FALSE,
    row.names=FALSE,
    sep="\t"
)

## Overlaps with caQTL

In [180]:
# Represent every cis-eQTL SNP as a width-1 range (start = end = position).
cis.eqtl.ranges <- GenomicRanges::makeGRangesFromDataFrame(
    cis.eqtl.hg19,
    keep.extra.columns=TRUE,
    start.field="position",
    end.field="position"
)

In [181]:
# Represent every caQTL variant as a width-1 range (start = end = pos); the
# remaining columns are carried along as metadata.
ca.qtl <- GenomicRanges::makeGRangesFromDataFrame(
    calderon.ca.qtl,
    keep.extra.columns=TRUE,
    start.field="pos",
    end.field="pos"
)

In [182]:
# Hits pairing each cis-eQTL SNP (query) with caQTL variants (subject) at the same position.
overlaps <- findOverlaps(query=cis.eqtl.ranges, subject=ca.qtl)

In [214]:
# Distinct (cis-eQTL SNP, caQTL record) pairs that share a position.
# Hit indices are read with the queryHits()/subjectHits() accessors instead of
# the @from/@to slots, and the columns of each side are selected and renamed BY
# NAME before binding. `snp` is therefore always taken from the eQTL side and
# the result does not depend on column positions or on names duplicated after
# cbind(). unique() collapses rows that are identical in the retained columns.
ca.qtl.overlaps <- cbind(
    as.data.frame(cis.eqtl.ranges[queryHits(overlaps)]) %>%
        dplyr::select(snp, snp_chr=seqnames, snp_pos_hg19=start),
    as.data.frame(ca.qtl[subjectHits(overlaps)]) %>%
        dplyr::select(
            cell_type=cell, ref=refAllele, alt=altAllele,
            ref_count=refCount, alt_count=altCount, p, contrast_atac
        )
) %>%
    unique()

In [215]:
# Preview the de-duplicated SNP / caQTL overlaps.
head(ca.qtl.overlaps)

Unnamed: 0_level_0,snp,snp_chr,snp_pos_hg19,cell_type,ref,alt,ref_count,alt_count,p,contrast_atac
Unnamed: 0_level_1,<chr>,<fct>,<int>,<chr>,<chr>,<chr>,<int>,<int>,<dbl>,<chr>
1,rs1010502,1,201123745,Regulatory_T,G,A,0,7,0.015625,Regulatory_T_S-Regulatory_T_U
2,rs1010858,2,39102671,Myeloid_DCs,G,A,7,1,0.0703125,Monocytes_U-Myeloid_DCs_U
3,rs1032763,5,17118930,Effector_memory_CD8pos_T,C,T,7,22,0.008130059,Effector_memory_CD8pos_T_S-Effector_memory_CD8pos_T_U
4,rs10424044,19,52239177,Th2_precursors,A,G,8,1,0.0390625,Th2_precursors_S-Th2_precursors_U
5,rs1061307,3,146262344,Effector_memory_CD8pos_T,A,G,1,8,0.0390625,Naive_CD8_T_U-Effector_memory_CD8pos_T_U
6,rs1061307,3,146262344,Naive_CD8_T,G,A,5,0,0.0625,Naive_CD8_T_U-Effector_memory_CD8pos_T_U


In [216]:
# Save the overlaps as a tab-separated file without quoting or row names.
# FALSE is spelled out because the shorthand F is an ordinary, reassignable variable.
write.table(
    ca.qtl.overlaps,
    "~/gains_team282/nikhil/functional_interpretation/cis_eqtl_calderon_et_al_ca_qtl_overlaps.tsv",
    quote=FALSE,
    row.names=FALSE,
    sep="\t"
)