In [1]:
# Run everything relative to the project root; the relative paths used below
# (e.g. "analyses/...", "eqtls/...") are resolved against this directory.
setwd("/projects/CARDIPS/analysis/epigenome_resource")
# Shared notebook helpers. Presumably this also attaches dplyr and data.table,
# since %>%, fread and rbindlist are used below without a library() call -- TODO confirm.
suppressPackageStartupMessages(source("analyses/jennifer/notebooks/functions.R"))
library(readxl)
# Fix the RNG seed for the session.
set.seed(5366)

In [2]:
# Tissues and QTL analysis types whose coloc summary files are combined below.
tissues <- c("iPSC", "PPC", "CVPC")
analyses <- c("eqtls", "caqtls", "haqtls")

# **Summarize and process GWAS colocalization results**

In [9]:
# Root directory of the GWAS coloc run; summary files are read from its "results" subfolder.
coloc_results_dir <- file.path(
  "/projects/CARDIPS/analysis/epigenome_resource",
  "analyses", "tim", "gwas_coloc2"
)

## **load downstream qtls**

In [7]:
# Primary QTLs from Table S4, with a "<Tissue>_<Element_ID>" key used for
# joining against the coloc results further down.
qtls <- mutate(
  filter(
    read_xlsx("~/projects/Resource/RevisionTables/TableS4.xlsx"),
    QTL_Order == "Primary"
  ),
  tissue_element = paste(Tissue, Element_ID, sep = "_")
)



## **load manifest**

In [8]:
# GWAS trait manifest; its trait_id, full_trait_id and description columns are
# used below to filter and annotate the coloc results.
manifest_file <- "analyses/tim/gwas_coloc/scripts/manifest_subset.txt"
manifest <- fread(manifest_file, data.table = FALSE)



## **combine GWAS coloc results**

In [10]:
# Stack the "<analysis>_<tissue>_summary.txt" coloc tables for every tissue and
# analysis. Combinations without a file are reported and contribute NULL, which
# rbindlist() skips.
summary <- lapply(tissues, function(tissue) {
  per_analysis <- lapply(analyses, function(analysis) {
    fname <- paste0(analysis, "_", tissue, "_summary.txt")
    path <- file.path(coloc_results_dir, "results", fname)

    if (!file.exists(path)) {
      message(paste("Missing", tissue, analysis))
      return(NULL)
    }

    message(paste(tissue, analysis), appendLF = FALSE)
    fread(path, data.table = FALSE)
  })
  as.data.frame(rbindlist(per_analysis))
}) %>%
  rbindlist() %>%
  as.data.frame() %>%
  mutate(p.gwas = as.double(p.gwas), p.eqtl = as.double(p.eqtl)) %>%
  dplyr::rename(type = discovery_order)
nrow(summary)

iPSC eqtls
iPSC caqtls
iPSC haqtls
PPC eqtls
PPC caqtls
Missing PPC haqtls

CVPC eqtls
CVPC caqtls
CVPC haqtls


In [11]:
# Keep only coloc rows whose trait is listed in the manifest.
summary <- summary[is.element(summary$trait_id, manifest$full_trait_id), ]
nrow(summary)

In [12]:
# Join key matching qtls$tissue_element: "<tissue>_<element_id>".
summary[["tissue_element"]] <- paste(summary[["tissue"]], summary[["element_id"]], sep = "_")


## **add trait description**

In [13]:
# Left-join the manifest's short trait ID and description onto the coloc rows.
# The coloc table's trait_id holds the manifest's full_trait_id, so it is
# renamed before joining.
summary2 <- merge(
  x = dplyr::rename(summary, full_trait_id = trait_id),
  y = manifest[, c("trait_id", "full_trait_id", "description")],
  by = "full_trait_id",
  all.x = TRUE
)

In [14]:
# Sanity check: the left join should leave the row count equal to nrow(summary)
# unless a full_trait_id occurs more than once in the manifest.
nrow(summary2)


In [15]:
# Distinct QTL / lead-SNP combinations that pass every colocalization threshold
# (QTL p <= 5e-5, GWAS p <= 5e-8, lead-SNP PP >= 0.01, PP.H4 >= 0.8).
# which() is used so that rows with NA in any threshold column are dropped;
# plain logical indexing would turn each of them into an all-NA row of `coloced`.
coloced <- unique(summary2[
  which(
    summary2$p.eqtl <= 5e-5 &
      summary2$p.gwas <= 5e-8 &
      summary2$topsnp_pp >= 0.01 &
      summary2$PP.H4.abf >= 0.8
  ),
  c("analysis", "tissue", "element_id", "topsnp")
])
# Number of colocalized QTLs per tissue and analysis type.
table(coloced$tissue, coloced$analysis)
# coloced

      
       caqtls eqtls haqtls
  CVPC    200   100    172
  iPSC    151   181      0
  PPC     158    95      0

## **add gene and peak coordinates**

In [16]:
### Paths to Element coordinates found on Figshare
# This first table only supplies the target column names; element_info itself
# is rebuilt from the gene and peak tables at the end of the cell.
element_info <- fread("analyses/jennifer/summary_files/all.phenotype_info.txt", data.table = FALSE)

# Gene coordinates: swap columns 5 and 6, then adopt element_info's column names.
gene_info <- fread("eqtls/iPSC/input/phenotype_info.txt", sep = "\t", data.table = FALSE)
gene_info <- gene_info[, c(1:4, 6, 5)]
colnames(gene_info) <- colnames(element_info)

# Peak coordinates: the code relies on fread's default V1, V2, ... column names
# (presumably header-less BED files); V4 is reused as the element name and the
# strand is left empty.
chip_peaks <- lapply(
  c(
    "haqtls/iPSC/input/phenotype_info.bed",
    "haqtls/CVPC/input/phenotype_info.bed"
  ),
  fread,
  sep = "\t"
) %>%
  rbindlist() %>%
  as.data.frame() %>%
  mutate(element_name = V4, strand = "")
colnames(chip_peaks) <- colnames(element_info)

atac_peaks <- lapply(
  c(
    "caqtls/iPSC/input/phenotype_info.bed",
    "caqtls/CVPC/input/phenotype_info.bed",
    "caqtls/PPC/input/phenotype_info.bed"
  ),
  fread,
  sep = "\t"
) %>%
  rbindlist() %>%
  as.data.frame() %>%
  mutate(element_name = V4, strand = "")
colnames(atac_peaks) <- colnames(element_info)

# One coordinate table covering genes, ChIP peaks and ATAC peaks.
element_info <- as.data.frame(rbindlist(list(gene_info, chip_peaks, atac_peaks)))


In [20]:
# Add element coordinates; all.y keeps every coloc row even when the element
# has no entry in element_info.
summary3 <- merge(
  x = element_info,
  y = summary2,
  by = "element_id",
  all.y = TRUE
)

# check that all elements have coordinates
filter(summary3, is.na(element_start))

# check all rows are retained
nrow(summary3)

“number of rows of result is not a multiple of vector length (arg 2)”
“number of rows of result is not a multiple of vector length (arg 2)”
“number of rows of result is not a multiple of vector length (arg 2)”
“number of rows of result is not a multiple of vector length (arg 2)”


element_id,element_chr,element_start,element_end,element_name,element_strand,full_trait_id,analysis,tissue,qtl_id,⋯,se.eqtl,p.eqtl,beta.gwas,se.gwas,p.gwas,bonferroni.eqtl,cs_size,tissue_element,trait_id,description
<chr>,<chr>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<chr>,<chr>,<chr>


## **annotate each QTL with its cluster ID, complexity, and nomination status**

In [23]:
summary4 <- summary3

# Left-join the cluster annotations from the primary-QTL table onto every coloc
# row, matching on the "<tissue>_<element_id>" key.
summary5 <- merge(
  x = summary4,
  y = select(qtls, tissue_element, Cluster_ID, Tissue, Complexity, qtl_combo, Nominated),
  by = "tissue_element",
  all.x = TRUE
)



In [24]:
# Rows that found no match in the QTL table have no cluster ID after the left join.
a <- filter(summary5, is.na(Cluster_ID))

# Report the outcome; this only prints a message and does not stop execution.
message(
  if (nrow(a) > 0) {
    "Error: Some QTLs do not have an assigned cluster ID"
  } else {
    "Each QTL has an assigned cluster ID"
  }
)

Each QTL has an assigned cluster ID



In [25]:
# Number of distinct QTL clusters among the primary QTLs.
length(unique(qtls$Cluster_ID))
# Number of nominated primary QTLs. Counting the matches directly is NA-safe:
# subsetting rows with a logical vector that contains NA adds one all-NA row
# per NA, which would inflate nrow().
sum(qtls$Nominated == "TRUE", na.rm = TRUE)

## **how many clusters do not have their nominated QTL present?**

In [26]:
# there might be some modules whose nominated QTLs were not tested for GWAS
# but the other qtls in the modules were
#
# NA cluster IDs (coloc rows without a match in the QTL table) are excluded so
# they are not counted as a cluster, and which() keeps rows with a missing
# Nominated flag from contributing an NA entry to nom_clusters.
clusters_tested <- unique(summary5$Cluster_ID[!is.na(summary5$Cluster_ID)])
nom_clusters <- unique(summary5$Cluster_ID[which(summary5$Nominated == "TRUE")])

message(paste("# clusters total:", length(unique(qtls$Cluster_ID))))
message(paste("# clusters tested total:", length(clusters_tested)))
message(paste("# clusters with its nominated QTL tested:", length(nom_clusters)))


# clusters total: 52374

# clusters tested total: 52225

# clusters with its nominated QTL tested: 52205



In [40]:
# clusters without nominated QTLs that coloc'd (very few)
# Keep rows whose cluster has no nominated QTL among the tested rows and that
# pass all colocalization thresholds.
a <- filter(
  summary5,
  !Cluster_ID %in% summary5[summary5$Nominated == "TRUE", ]$Cluster_ID,
  PP.H4.abf >= 0.8 & topsnp_pp >= 0.01 & p.eqtl <= 5e-5 & p.gwas <= 5e-8
)

message(paste("# clusters without nominated QTL that colocalized with GWAS:", length(unique(a$Cluster_ID))))


# clusters without nominated QTL that colocalized with GWAS: 3



## **annotate which QTLs colocalized**

In [36]:
# Flag rows that pass every colocalization threshold. The comparison already
# yields a logical vector (NA where a threshold column is NA), so no ifelse()
# wrapper is needed.
summary5$coloc_gwas <- summary5$topsnp_pp >= 0.01 &
  summary5$PP.H4.abf >= 0.8 &
  summary5$p.eqtl <= 5e-5 &
  summary5$p.gwas <= 5e-8

# Key identifying a cluster / GWAS trait pair.
summary5$cluster_gwas <- paste(summary5$Cluster_ID, summary5$full_trait_id)

# Clusters whose nominated QTL colocalized. which() drops rows where either
# flag is NA; plain logical indexing would add all-NA rows and count NA as a
# cluster. Nominated is compared with "TRUE" as in the other cells.
nominated_coloc <- which(summary5$Nominated == "TRUE" & summary5$coloc_gwas)
message(paste("# of clusters that colocalized:", length(unique(summary5$Cluster_ID[nominated_coloc]))))


# of clusters that colocalized: 695



In [29]:
# Cross-tabulate the number of rows per GWAS trait description by coloc_gwas
# status; rows with an NA in either column are not counted by table().
table(summary5$description, summary5$coloc_gwas)

                                     
                                      FALSE  TRUE
  birth weight (eur)                  30008    29
  Body mass index (BMI)               59348   330
  childhood obesity (eur)              4393     4
  fasting glucose                     42800    29
  HDL cholesterol                     59490   293
  I20 Angina pectoris                 31236    12
  I21 Acute myocardial infarction     25660    10
  I25 Chronic ischaemic heart disease 34423    22
  I48 Atrial fibrillation and flutter 28559    40
  LDL direct                          59667   116
  Multivariate Longevity              18906    16
  Pulse rate                          37237    65
  QRS duration                        18718    15
  type 2 diabetes                     49207   126
  Ventricular rate                    19309     3

## save all results

In [39]:
message("Saving..", appendLF = FALSE)

## GWAS results uploaded to Figshare
# Write the fully annotated table as tab-separated text.
file <- "~/projects/Resource/Figshare/all.gwas_summary.2024_0925.txt"
fwrite(x = summary5, file = file, sep = "\t", row.names = FALSE)
message(paste("Saved:", file), appendLF = FALSE)

Saving..
Saved: ~/projects/Resource/Figshare/all.gwas_summary.2024_0925.txt
