## QTL VEP Analyses

**Created**: 6 June 2022

## Environment

In [2]:
library(tidyverse)
library(data.table)
library(rtracklayer)
library(GenomicRanges)

# Relative paths used in this notebook (the theme script below and the
# liftOver chain file further down) are resolved against this directory.
setwd("~/eQTL_pQTL_Characterization/")

# Run the shared ggplot theme script; what it defines is not visible here.
source("03_Functional_Interpretation/scripts/utils/ggplot_theme.R")

── [1mAttaching packages[22m ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.6     [32m✔[39m [34mdplyr  [39m 1.0.8
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.1     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, l

## Load Data

In [3]:
# Read the genotype .bim variant table, then label its six columns in place.
geno.bim <- fread("/nfs/users/nfs_n/nm18/gains_team282/Genotyping/All_genotyping_merged_filtered_b38_refiltered_rsID.bim")
setnames(geno.bim, c("chr", "snp", "cM", "pos", "minor_allele", "major_allele"))

In [4]:
# Preview the first rows of the variant table to check the column labels.
head(geno.bim)

chr,snp,cM,pos,minor_allele,major_allele
<chr>,<chr>,<int>,<int>,<chr>,<chr>
1,rs3131972,0,817341,A,G
1,rs546843995,0,818053,0,G
1,rs553916047,0,818359,0,A
1,1:818740_T_C,0,818740,T,C
1,rs145604921,0,819378,0,C
1,rs535256652,0,821053,0,T


In [5]:
# Load the eigenMT-corrected cis-eQTL table, then keep only the rows whose
# logical `Sig` column is TRUE.
cis.eqtl <- read.table("~/gains_team282/eqtl/cisresults/eigenMT/ciseqtl_eigenMT_corrected.txt")
cis.eqtl <- dplyr::filter(cis.eqtl, Sig)

In [6]:
# Preview the significant cis-eQTL rows.
head(cis.eqtl)

Unnamed: 0_level_0,snps,gene,statistic,pvalue,beta,se,chr,SNPpos,TSS,BF,TESTS,BF.FDR,Sig,threshold
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<dbl>,<int>,<dbl>,<lgl>,<dbl>
22,rs3131972,ENSG00000237491,5.643405,2.400875e-08,0.07174216,0.01271257,1,817341,778747,7.058573e-06,294,1.993414e-05,True,9.170798e-05
23,rs3131972,ENSG00000230092,4.756587,2.329109e-06,0.06239499,0.0131176,1,817341,817712,0.0006917454,297,0.001593483,True,9.078164e-05
25,rs3131972,ENSG00000225880,6.292961,5.949542e-10,0.1001677,0.01591742,1,817341,827522,1.767014e-07,297,5.668498e-07,True,9.078164e-05
28,rs2272757,ENSG00000188976,-7.026045,4.904859e-12,-0.05107767,0.007269761,1,946247,959309,1.545031e-09,315,5.762103e-09,True,8.559411e-05
29,rs13303327,ENSG00000187961,-7.175012,1.845538e-12,-0.1351992,0.01884306,1,960326,960584,5.813445e-10,315,2.228468e-09,True,8.559411e-05
30,rs13303056,ENSG00000187583,-9.081031,1.922802e-18,-0.2963377,0.03263261,1,953778,966482,6.056826e-16,315,3.262796e-15,True,8.559411e-05


In [7]:
# Conditional cis-eQTL results. Only the SNP column is used in this notebook;
# note that the preview shows eQTL_beta, eQTL_SE and pvalue stored as
# character, so they need converting before any numeric use.
c.cis.eqtl <- readRDS("~/gains_team282/eqtl/cisresults/conditionalanalysis/conditional_eQTL_results_final.rds")

In [8]:
# Preview the conditional cis-eQTL rows.
head(c.cis.eqtl)

Unnamed: 0_level_0,SNP,Gene,eQTL_beta,eQTL_SE,pvalue,Number
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<int>
1,rs10753794,ENSG00000000457,0.0498587312408011,0.0070851585157937,5.19259471106013e-12,1
2,rs10919255,ENSG00000000460,-0.0726264894633498,0.0136853304567293,1.72815915933528e-07,1
3,rs77006036,ENSG00000000460,0.166236949102582,0.0334472538699365,8.47290238610159e-07,2
4,rs12406047,ENSG00000000971,0.17135374034767,0.0379807691167313,7.51523462853891e-06,1
5,rs6696136,ENSG00000001460,0.45884434698694,0.0871081626842823,1.94887370606405e-07,1
6,rs6676449,ENSG00000001460,-0.371531778822554,0.0880672792106829,2.8471651589813e-05,2


In [9]:
# Eigengene table; the first CSV column is used as the row names.
eigengenes <- read.csv("~/gains_team282/nikhil/expression/gene_expression/eigengenes.multiple.csv", row.names=1)

In [10]:
# Column count of the eigengene table (presumably one column per module —
# TODO confirm).
n.modules <- ncol(eigengenes)

In [11]:
# Count the space-separated fields on the first line of the .raw genotype file
# via the shell, then subtract 6 (presumably the leading non-genotype columns
# of the .raw layout — TODO confirm).
n.snps <- as.numeric(
    system("head -n 1 /nfs/users/nfs_n/nm18/gains_team282/nikhil/data/genotypes/eigengene_sva_genotypes.raw | sed 's/ /\\n/g' | wc -l", intern=TRUE)
) - 6

In [18]:
# Lead module QTL: for every (module, qtl.locus) pair keep the single row with
# the smallest p. `with_ties=FALSE` returns exactly one row per pair even when
# several rows share the minimum. The result is still grouped by
# (module, qtl.locus).
#
# The table is assigned to `module.qtl` because that is the name the later
# cells read (`dim(module.qtl)`, `module.qtl$snp`). Previously only `mqtl` was
# assigned, so those cells failed when the notebook was run on a fresh kernel.
module.qtl <- readRDS("~/gains_team282/nikhil/expression/eigengene_sva/mqtl.RDS") %>%
    dplyr::group_by(module, qtl.locus) %>%
    dplyr::slice_min(p, n=1, with_ties=FALSE)

# Keep the original name as an alias for any code that still refers to `mqtl`.
mqtl <- module.qtl

In [19]:
# Row and column counts of the lead module QTL table.
dim(module.qtl)

In [14]:
# Lead cis-pQTL: keep associations below 5e-8 / 269 (the meaning of 269 is not
# shown here — TODO document), then retain the minimum-p row(s) per Gene.
# Ties at the minimum are all kept; the result stays grouped by Gene.
cis.pqtl <- readRDS("/nfs/users/nfs_n/nm18/gains_team282/proteomics/pqtl/pqtl_ms2019/cis_pqtl_all.RDS")
cis.pqtl <- dplyr::filter(cis.pqtl, pQTL_pval < 5e-8 / 269)
cis.pqtl <- dplyr::group_by(cis.pqtl, Gene)
cis.pqtl <- dplyr::slice_min(cis.pqtl, pQTL_pval, n=1)

In [15]:
# Lead trans-pQTL: same 5e-8 / 269 threshold as the cis table, then the
# minimum-p row(s) for each (pQTL_Protein, pQTL_Locus) pair. Ties at the
# minimum are all kept; the result stays grouped by that pair.
trans.pqtl <- readRDS("/nfs/users/nfs_n/nm18/gains_team282/proteomics/pqtl/pqtl_ms2019/trans_pqtl_all.RDS")
trans.pqtl <- dplyr::filter(trans.pqtl, pQTL_pval < 5e-8 / 269)
trans.pqtl <- dplyr::group_by(trans.pqtl, pQTL_Protein, pQTL_Locus)
trans.pqtl <- dplyr::slice_min(trans.pqtl, pQTL_pval, n=1)

In [35]:
# Stack the per-chromosome SuSiE credible-set tables for chromosomes 1-22,
# keep the rows whose Notes field is NA, and retain only Gene, SNP, SNP_Prob
# and Credible_Set (which also drops Notes).
cis.eqtl.susie <- do.call(rbind, lapply(1:22, function(chrom) {
    cs.file <- paste0("~/gains_team282/nikhil/colocalization/cis_eqtl/fine_mapping/SuSiE/full/full_chr", chrom, "_credible_sets.tsv")
    read.table(cs.file, sep="\t", header=TRUE)
}))
cis.eqtl.susie <- as.data.frame(cis.eqtl.susie)
cis.eqtl.susie <- dplyr::filter(cis.eqtl.susie, is.na(Notes))
cis.eqtl.susie <- dplyr::select(cis.eqtl.susie, Gene, SNP, SNP_Prob, Credible_Set)

In [16]:
# Stack the per-chromosome conditional SuSiE credible-set tables and keep the
# rows whose Notes field is NA. The Gene field is split on "-": the text after
# the last "-" becomes the numeric Signal, and the text before the first "-"
# stays as Gene.
c.cis.eqtl.susie <- do.call(rbind, lapply(1:22, function(chrom) {
    cs.file <- paste0("~/gains_team282/nikhil/colocalization/cis_eqtl/fine_mapping/SuSiE/conditional/conditional_chr", chrom, "_credible_sets.tsv")
    read.table(cs.file, sep="\t", header=TRUE)
}))
c.cis.eqtl.susie <- as.data.frame(c.cis.eqtl.susie)
c.cis.eqtl.susie <- dplyr::filter(c.cis.eqtl.susie, is.na(Notes))
# Signal is derived first, from the still-unsplit Gene value.
c.cis.eqtl.susie <- dplyr::mutate(
    c.cis.eqtl.susie,
    Signal=as.numeric(gsub("^.*-", "", Gene)),
    Gene=gsub("-.*$", "", Gene)
)
c.cis.eqtl.susie <- dplyr::select(c.cis.eqtl.susie, Gene, Signal, SNP, SNP_Prob, Credible_Set)

In [43]:
# cis-pQTL SuSiE credible sets: keep rows with an NA Notes field, then drop
# the Notes column.
cis.pqtl.susie <- read.table("~/gains_team282/nikhil/colocalization/pqtl/fine_mapping/SuSiE/cis_pqtl_credible_sets.tsv", sep="\t", header=TRUE)
cis.pqtl.susie <- dplyr::filter(cis.pqtl.susie, is.na(Notes))
cis.pqtl.susie <- dplyr::select(cis.pqtl.susie, -Notes)

In [45]:
# trans-pQTL SuSiE credible sets: keep rows with an NA Notes field, then drop
# the Notes column.
trans.pqtl.susie <- read.table("~/gains_team282/nikhil/colocalization/pqtl/fine_mapping/SuSiE/trans_pqtl_credible_sets.tsv", sep="\t", header=TRUE)
trans.pqtl.susie <- dplyr::filter(trans.pqtl.susie, is.na(Notes))
trans.pqtl.susie <- dplyr::select(trans.pqtl.susie, -Notes)

In [47]:
# Module QTL SuSiE credible sets: keep rows with an NA Notes field, then drop
# the Notes column.
mqtl.susie <- read.table("~/gains_team282/nikhil/colocalization/mqtl/fine_mapping/SuSiE/module_qtl_credible_sets.tsv", sep="\t", header=TRUE)
mqtl.susie <- dplyr::filter(mqtl.susie, is.na(Notes))
mqtl.susie <- dplyr::select(mqtl.susie, -Notes)

## Prepare Data as VCF File

Start by adding all lead cis-eQTL and conditional cis-eQTL.

In [69]:
# Seed the SNP set with the unique IDs from the significant cis-eQTL and the
# conditional cis-eQTL tables.
all.snps <- union(cis.eqtl$snps, c.cis.eqtl$SNP)

In [70]:
# Number of unique SNP IDs from the cis-eQTL sources.
length(all.snps)

Add all lead module QTL.

In [71]:
# Add the `snp` column of the lead module QTL table to the set.
all.snps <- union(all.snps, module.qtl$snp)

In [72]:
# Running total after adding the module QTL SNPs.
length(all.snps)

Add all lead pQTL.

In [73]:
# Fold the lead cis- and trans-pQTL SNP IDs into the set, in that order.
all.snps <- Reduce(union, list(all.snps, cis.pqtl$SNP, trans.pqtl$SNP))

In [74]:
# Running total after adding the pQTL SNPs.
length(all.snps)

In [26]:
# Build the table written out as a headerless VCF-style file.
# ":" in the genotype IDs is rewritten to "." before matching against
# `all.snps` (presumably the QTL tables use the dotted form — TODO confirm).
vcf.contents <- dplyr::mutate(geno.bim, snp=gsub(":", "\\.", snp))
vcf.contents <- dplyr::filter(vcf.contents, snp %in% all.snps)
vcf.contents <- dplyr::mutate(vcf.contents, chr=paste0("chr", chr))
# Column order: chromosome, position, ID, major allele, minor allele.
vcf.contents <- dplyr::select(vcf.contents, chr, pos, snp, major_allele, minor_allele)

In [27]:
# Preview the rows that will be written to the lead-SNP file.
head(vcf.contents)

chr,pos,snp,major_allele,minor_allele
<chr>,<int>,<chr>,<chr>,<chr>
chr1,817341,rs3131972,G,A
chr1,901149,rs28731045,C,G
chr1,918014,rs142336952,G,A
chr1,946247,rs2272757,A,G
chr1,950296,rs4970377,A,C
chr1,953778,rs13303056,C,G


In [28]:
# Write the lead-SNP table tab-separated with no header row.
fwrite(
    x=vcf.contents,
    file="/nfs/users/nfs_n/nm18/gains_team282/epigenetics/vep/qtl.vcf",
    sep="\t",
    col.names=FALSE
)

In [29]:
# Emit every SNP twice: once as-is and once with the major and minor alleles
# exchanged, so either allele can appear in the fourth (major_allele) column.
# Both new columns are taken from the untouched `vcf.contents`, so the swap
# needs no temporary column.
vcf.contents.both.ref <- dplyr::bind_rows(
    vcf.contents,
    dplyr::mutate(
        vcf.contents,
        major_allele=vcf.contents$minor_allele,
        minor_allele=vcf.contents$major_allele
    )
)
# Sort so that the two orientations of a SNP sit on adjacent rows.
vcf.contents.both.ref <- dplyr::arrange(vcf.contents.both.ref, chr, pos)

In [30]:
# Preview: each SNP should appear in both allele orientations.
head(vcf.contents.both.ref)

chr,pos,snp,major_allele,minor_allele
<chr>,<int>,<chr>,<chr>,<chr>
chr1,817341,rs3131972,G,A
chr1,817341,rs3131972,A,G
chr1,901149,rs28731045,C,G
chr1,901149,rs28731045,G,C
chr1,918014,rs142336952,G,A
chr1,918014,rs142336952,A,G


In [31]:
# Write the two-orientation table tab-separated with no header row.
fwrite(
    x=vcf.contents.both.ref,
    file="/nfs/users/nfs_n/nm18/gains_team282/epigenetics/vep/qtl_both_ref.vcf",
    sep="\t",
    col.names=FALSE
)

## Liftover for hg19 Tools

In [49]:
# Load the hg38 -> hg19 chain file; the path is relative to the working
# directory set at the top of the notebook.
chain <- import.chain("03_Functional_Interpretation/data/hg38ToHg19.over.chain")

In [50]:
# Turn each SNP into a one-base range (start and end both come from `pos`),
# carrying the ID and allele columns along as metadata.
vcf.contents.gr <- makeGRangesFromDataFrame(
    vcf.contents,
    seqnames.field="chr",
    start.field="pos",
    end.field="pos",
    keep.extra.columns=TRUE
)

In [51]:
# Print the ranges object to check its size and metadata columns.
vcf.contents.gr

GRanges object with 15992 ranges and 3 metadata columns:
          seqnames    ranges strand |         snp major_allele minor_allele
             <Rle> <IRanges>  <Rle> | <character>  <character>  <character>
      [1]     chr1    817341      * |   rs3131972            G            A
      [2]     chr1    901149      * |  rs28731045            C            G
      [3]     chr1    918014      * | rs142336952            G            A
      [4]     chr1    946247      * |   rs2272757            A            G
      [5]     chr1    950296      * |   rs4970377            A            C
      ...      ...       ...    ... .         ...          ...          ...
  [15988]    chr22  50589773      * |   rs6010023            C            T
  [15989]    chr22  50625049      * |   rs6151429            T            C
  [15990]    chr22  50626933      * |   rs6151415            C            A
  [15991]    chr22  50627096      * |   rs6151413            C            A
  [15992]    chr22  50627172   

In [52]:
# Use UCSC-style sequence names before lifting over.
seqlevelsStyle(vcf.contents.gr) <- "UCSC"
# Lift the hg38 positions to hg19, then flatten the per-range result into a
# single ranges object. NOTE(review): inputs with no hg19 mapping presumably
# drop out at the flattening step — compare lengths if that matters.
vcf.contents.gr.hg19 <- liftOver(vcf.contents.gr, chain)
vcf.contents.gr.hg19 <- unlist(vcf.contents.gr.hg19)

In [53]:
# Back to a plain data frame, keeping the same five-column layout as the hg38
# table (sequence name, lifted position, ID, major allele, minor allele).
vcf.contents.hg19 <- as.data.frame(vcf.contents.gr.hg19)
vcf.contents.hg19 <- dplyr::select(vcf.contents.hg19, seqnames, start, snp, major_allele, minor_allele)

In [54]:
# Preview the lifted-over coordinates.
head(vcf.contents.hg19)

Unnamed: 0_level_0,seqnames,start,snp,major_allele,minor_allele
Unnamed: 0_level_1,<fct>,<int>,<chr>,<chr>,<chr>
1,chr1,752721,rs3131972,G,A
2,chr1,836529,rs28731045,C,G
3,chr1,853394,rs142336952,G,A
4,chr1,881627,rs2272757,A,G
5,chr1,885676,rs4970377,A,C
6,chr1,889158,rs13303056,C,G


In [55]:
# Write the hg19 lead-SNP table tab-separated with no header row.
fwrite(
    x=vcf.contents.hg19,
    file="/nfs/users/nfs_n/nm18/gains_team282/epigenetics/vep/qtl_hg19.vcf",
    sep="\t",
    col.names=FALSE
)

## VCF with Credible Set SNPs

Some tools (such as ExPecto) cannot handle a large number of SNPs, so only lead SNPs were used above. Here we also include the credible set SNPs to see whether they identify better or more concordant effects.

In [75]:
# Size of the lead-SNP set before any credible set SNPs are added.
length(all.snps)

In [76]:
# Fold in the cis-eQTL and conditional cis-eQTL credible set SNPs, in that
# order.
all.snps <- Reduce(union, list(all.snps, cis.eqtl.susie$SNP, c.cis.eqtl.susie$SNP))

In [77]:
# Running total after adding the eQTL credible set SNPs.
length(all.snps)

In [78]:
# Fold in the cis- and trans-pQTL credible set SNPs, in that order.
all.snps <- Reduce(union, list(all.snps, cis.pqtl.susie$SNP, trans.pqtl.susie$SNP))

In [79]:
# Running total after adding the pQTL credible set SNPs.
length(all.snps)

In [80]:
# Add the module QTL credible set SNPs.
all.snps <- union(all.snps, mqtl.susie$SNP)

In [81]:
# Final size of the combined lead + credible set SNP set.
length(all.snps)

In [82]:
# Rebuild the VCF-style table, this time for the lead + credible set SNPs;
# this overwrites the lead-only `vcf.contents` from earlier in the notebook.
# ":" in the genotype IDs is rewritten to "." before matching against
# `all.snps` (presumably the QTL tables use the dotted form — TODO confirm).
vcf.contents <- dplyr::mutate(geno.bim, snp=gsub(":", "\\.", snp))
vcf.contents <- dplyr::filter(vcf.contents, snp %in% all.snps)
vcf.contents <- dplyr::mutate(vcf.contents, chr=paste0("chr", chr))
# Column order: chromosome, position, ID, major allele, minor allele.
vcf.contents <- dplyr::select(vcf.contents, chr, pos, snp, major_allele, minor_allele)

In [83]:
# Preview the rows that will be written to the lead + credible set file.
head(vcf.contents)

chr,pos,snp,major_allele,minor_allele
<chr>,<int>,<chr>,<chr>,<chr>
chr1,817341,rs3131972,G,A
chr1,901149,rs28731045,C,G
chr1,918014,rs142336952,G,A
chr1,944296,rs6605067,A,G
chr1,944307,rs2839,C,T
chr1,946247,rs2272757,A,G


In [84]:
# Write the lead + credible set table tab-separated with no header row.
fwrite(
    x=vcf.contents,
    file="/nfs/users/nfs_n/nm18/gains_team282/epigenetics/vep/qtl_and_cs.vcf",
    sep="\t",
    col.names=FALSE
)

In [85]:
# Emit every SNP twice: once as-is and once with the major and minor alleles
# exchanged, so either allele can appear in the fourth (major_allele) column.
# Both new columns are taken from the untouched `vcf.contents`, so the swap
# needs no temporary column.
vcf.contents.both.ref <- dplyr::bind_rows(
    vcf.contents,
    dplyr::mutate(
        vcf.contents,
        major_allele=vcf.contents$minor_allele,
        minor_allele=vcf.contents$major_allele
    )
)
# Sort so that the two orientations of a SNP sit on adjacent rows.
vcf.contents.both.ref <- dplyr::arrange(vcf.contents.both.ref, chr, pos)

In [86]:
# Preview: each SNP should appear in both allele orientations.
head(vcf.contents.both.ref)

chr,pos,snp,major_allele,minor_allele
<chr>,<int>,<chr>,<chr>,<chr>
chr1,817341,rs3131972,G,A
chr1,817341,rs3131972,A,G
chr1,901149,rs28731045,C,G
chr1,901149,rs28731045,G,C
chr1,918014,rs142336952,G,A
chr1,918014,rs142336952,A,G


In [87]:
# Write the two-orientation lead + credible set table tab-separated with no
# header row.
fwrite(
    x=vcf.contents.both.ref,
    file="/nfs/users/nfs_n/nm18/gains_team282/epigenetics/vep/qtl_and_cs_both_ref.vcf",
    sep="\t",
    col.names=FALSE
)

## Liftover for hg19 Tools (Lead and Credible Set SNPs)

In [88]:
# Load the hg38 -> hg19 chain file again (same file as in the earlier liftover
# section); the path is relative to the working directory.
chain <- import.chain("03_Functional_Interpretation/data/hg38ToHg19.over.chain")

In [89]:
# Turn each SNP into a one-base range (start and end both come from `pos`),
# carrying the ID and allele columns along as metadata.
vcf.contents.gr <- makeGRangesFromDataFrame(
    vcf.contents,
    seqnames.field="chr",
    start.field="pos",
    end.field="pos",
    keep.extra.columns=TRUE
)

In [90]:
# Print the ranges object to check its size and metadata columns.
vcf.contents.gr

GRanges object with 254604 ranges and 3 metadata columns:
           seqnames    ranges strand |         snp major_allele minor_allele
              <Rle> <IRanges>  <Rle> | <character>  <character>  <character>
       [1]     chr1    817341      * |   rs3131972            G            A
       [2]     chr1    901149      * |  rs28731045            C            G
       [3]     chr1    918014      * | rs142336952            G            A
       [4]     chr1    944296      * |   rs6605067            A            G
       [5]     chr1    944307      * |      rs2839            C            T
       ...      ...       ...    ... .         ...          ...          ...
  [254600]    chr22  50627172      * |   rs6151412            G            A
  [254601]    chr22  50634656      * |  rs79086732            C            T
  [254602]    chr22  50636969      * | rs151207005            G            C
  [254603]    chr22  50641305      * |  rs76300267            A            G
  [254604]    chr2

In [91]:
# Use UCSC-style sequence names before lifting over.
seqlevelsStyle(vcf.contents.gr) <- "UCSC"
# Lift the hg38 positions to hg19, then flatten the per-range result into a
# single ranges object. NOTE(review): inputs with no hg19 mapping presumably
# drop out at the flattening step — compare lengths if that matters.
vcf.contents.gr.hg19 <- liftOver(vcf.contents.gr, chain)
vcf.contents.gr.hg19 <- unlist(vcf.contents.gr.hg19)

In [92]:
# Back to a plain data frame, keeping the same five-column layout as the hg38
# table (sequence name, lifted position, ID, major allele, minor allele).
vcf.contents.hg19 <- as.data.frame(vcf.contents.gr.hg19)
vcf.contents.hg19 <- dplyr::select(vcf.contents.hg19, seqnames, start, snp, major_allele, minor_allele)

In [93]:
# Preview the lifted-over coordinates.
head(vcf.contents.hg19)

Unnamed: 0_level_0,seqnames,start,snp,major_allele,minor_allele
Unnamed: 0_level_1,<fct>,<int>,<chr>,<chr>,<chr>
1,chr1,752721,rs3131972,G,A
2,chr1,836529,rs28731045,C,G
3,chr1,853394,rs142336952,G,A
4,chr1,879676,rs6605067,A,G
5,chr1,879687,rs2839,C,T
6,chr1,881627,rs2272757,A,G


In [94]:
# Write the hg19 lead + credible set table tab-separated with no header row.
fwrite(
    x=vcf.contents.hg19,
    file="/nfs/users/nfs_n/nm18/gains_team282/epigenetics/vep/qtl_and_cs_hg19.vcf",
    sep="\t",
    col.names=FALSE
)