### Load packages and functions

In [2]:
# Notebook setup: every relative path used below is resolved against wd_path.
wd_path <- "/projects/CARDIPS/analysis/epigenome_resource"
function_path <- "analyses/jennifer/notebooks/functions.R"
setwd(wd_path)

# Source the functions file (path is relative to the working directory set above).
source(function_path)

library(MotifDb)
library(seqLogo)

# Fix the RNG seed so any random steps are repeatable between runs.
set.seed(5366)

In [3]:
library(readxl)


### Load QTL info and GWAS colocalization summaries

In [4]:

# Locations of the input tables read by the next cell.
# Supplemental Table 4
qtl_module_path <- "~/projects/Resource/RevisionTables/TableS4.xlsx"
# Subsetted UK Biobank manifest
manifest_path <- "analyses/tim/gwas_coloc/scripts/manifest_subset.txt"
# Found on Figshare
coloc_summary_path <- "~/projects/Resource/Figshare/all.gwas_summary.2024_0925.txt"
gene_info_path <- "/reference/private/Gencode.v44lift38/gene_info.txt"


In [5]:
# QTL table: keep only the rows whose QTL_Order is "Primary".
mods <- read_xlsx(qtl_module_path) %>%
    filter(QTL_Order == "Primary") %>%
    as.data.frame()

manifest <- fread(manifest_path, data.table = FALSE)

geneinfo <- fread(gene_info_path)

gwas_coloc <- fread(
    "analyses/tim/gwas_coloc/results/all_results.2024_0911.txt",
    data.table = FALSE
)
gwas_summary <- fread(coloc_summary_path, sep = "\t", data.table = FALSE)
# ### this file contains a column with the path to QTL summary statistics used for GWAS coloc
# qtls = fread("analyses/tim/gwas_coloc/scripts/qtls_090424.txt",sep="\t",data.table=F)

# Attach the trait description from the manifest. merge() joins on the columns
# the two tables share and, with its default all = FALSE, keeps matching rows only.
gwas_summary2 <- merge(
    gwas_summary,
    manifest[, c("full_trait_id", "description")]
)


In [7]:


# Build the per-element summary-statistics path:
#   <qtl type, lower case>s/<tissue>/step_4/qtl_by_element/qtl/qtl.<element>.txt
mods$path <- paste0(
    tolower(mods$QTL_Type), "s", "/",
    mods$Tissue, "/step_4/qtl_by_element/qtl/qtl.", mods$Element_ID, ".txt"
)

# Rows flagged as Regressed point at the step_5 directory instead of step_4.
mods$path <- ifelse(
    mods$Regressed == "TRUE",
    gsub("step_4", "step_5", mods$path),
    mods$path
)


In [83]:
# Trait ids present in the colocalization summary but absent from the manifest.
setdiff(gwas_summary[["full_trait_id"]], manifest[["full_trait_id"]])

### Get QTL Summary Statistics

In [8]:
# Number of distinct traits left after merging with the manifest.
length(unique(gwas_summary2[["full_trait_id"]]))


In [9]:
# plink executable and project root used by the helper functions below.
plink <- "/frazer01/software/plink-1.90b3x/plink"
basedir <- "/projects/CARDIPS/analysis/epigenome_resource"

# Collect the QTL summary statistics of every element in a cluster and flag the
# lead SNP reported for the cluster's nominated element.
#
# Reads two globals: `mods` (one row per element, including Cluster_ID,
# Element_ID, Nominated and path) and `gwas_summary2` (colocalization summary
# with description, element_id and topsnp columns).
#
# Args:
#   cluster: value matched against mods$Cluster_ID.
#   trait:   pattern matched against gwas_summary2$description with %like%.
#   tissue:  not used by the function; kept so existing calls keep working.
#
# Returns:
#   The row-bound per-element tables with columns chrom, pos, ref, alt, rsid,
#   id, pval, element_id, plus snp_id (id without the "VAR_" prefix) and
#   topsnp ("TRUE"/"FALSE" as strings).
get_qtl_sumstats <- function(cluster, trait, tissue) {
    in_cluster <- mods$Cluster_ID == cluster
    nominated <- mods$Element_ID[in_cluster & mods$Nominated == "TRUE"]
    paths <- mods$path[in_cluster]

    qtl_stats <- rbindlist(lapply(paths, function(p) {
        out <- fread(p, sep = "\t")
        # Files under step_4 are restricted to rows with type == 0;
        # other paths (step_5) are used as-is.
        if (grepl("step_4", p)) {
            out <- out %>% filter(type == 0)
        }
        out <- out %>% select(chrom, pos, ref, alt, rsid, id, pval, element_id)
        out$snp_id <- gsub("VAR_", "", out$id)
        out
    }))

    # %in% instead of ==: the trait pattern can match several GWAS rows and a
    # cluster can in principle have zero or several nominated elements; == would
    # silently recycle the shorter vector and flag the wrong rows.
    lead_rows <- gwas_summary2$description %like% trait &
        gwas_summary2$element_id %in% nominated
    topsnp <- unique(gsub("VAR_", "", gwas_summary2$topsnp[lead_rows]))

    if (length(topsnp) == 0) {
        warning("get_qtl_sumstats: no lead SNP found for cluster ", cluster,
                " and trait pattern '", trait, "'", call. = FALSE)
    } else if (length(topsnp) > 1) {
        warning("get_qtl_sumstats: ", length(topsnp),
                " distinct lead SNPs found for cluster ", cluster,
                " and trait pattern '", trait, "': ",
                paste(topsnp, collapse = ", "), call. = FALSE)
    }

    qtl_stats$topsnp <- ifelse(qtl_stats$snp_id %in% topsnp, "TRUE", "FALSE")

    qtl_stats
}
# QTL summary statistics for the two example clusters, as plain data.frames.
cvpc274_stats <- as.data.frame(
    get_qtl_sumstats("CVPC_274", "QRS duration", "CVPC")
)
ppc122_stats <- as.data.frame(
    get_qtl_sumstats("PPC_122", "type 2 diabetes", "PPC")
)


### Extract region from GWAS summary statistics

In [24]:
# tabix executable used to query the GWAS summary-statistics files.
tabix <- "/frazer01/software/htslib-1.9/tabix"

# GWAS summary-statistics files (hg38 per the directory name) for the two example traits.
diabetes <- paste0(
    basedir,
    "/analyses/jennifer/gwas_liftover/hg38_summary_statistics/Mahajan.NatGenet2018b.T2D.European_sorted.hg38.tsv.gz"
)
qrs <- paste0(
    basedir,
    "/analyses/jennifer/gwas_liftover/hg38_summary_statistics/continuous-12340-both_sexes-irnt.meta.hg38.tsv.gz"
)

# Pull the GWAS summary statistics covering a QTL cluster and compute pairwise
# LD for the variants involved.
#
# Uses the globals `basedir`, `tabix` and `plink`.
#
# Args:
#   qtl_stats: table with chrom, pos and snp_id columns; all rows must be on
#              one chromosome.
#   cluster:   cluster name; used for the output directory and plink prefix.
#   gwas_path: bgzipped GWAS file that tabix can query; after renaming it must
#              provide chr, pos, a1 and a2 columns.
#
# Side effects (all under <basedir>/analyses/tim/gwas_example/<cluster>/):
#   snps.txt            variant ids from the QTL table plus both allele
#                       orderings of every GWAS variant in the region
#   <cluster>.bed/bim/fam  plink subset of the reference panel
#   <cluster>.ld        plink --r2 output
#
# Returns:
#   The GWAS rows in the region, with two extra id columns: a2a1 and a1a2
#   (chr_pos_allele_allele in either allele order, "chr" prefix removed).
extract_gwas <- function(qtl_stats, cluster, gwas_path) {
    chrom <- unique(qtl_stats$chrom)
    if (length(chrom) != 1) {
        stop("extract_gwas: expected exactly one chromosome for ", cluster,
             ", found ", length(chrom), call. = FALSE)
    }

    region <- paste0(chrom, ":", min(qtl_stats$pos), "-", max(qtl_stats$pos))
    message(region)

    outdir <- paste0(basedir, "/analyses/tim/gwas_example/", cluster)
    # writeLines() and plink both need the directory to exist.
    dir.create(outdir, recursive = TRUE, showWarnings = FALSE)

    # Column names are taken from the first line of the file and applied to the
    # tabix output below.
    cols <- fread(cmd = paste("gunzip -c", gwas_path, "| head -n 1"))
    gwas_stats <- fread(cmd = paste(tabix, "-s1 -b2 -e2 -h", gwas_path, region))

    if (nrow(gwas_stats) == 0 || ncol(gwas_stats) != ncol(cols)) {
        stop("extract_gwas: tabix returned ", nrow(gwas_stats), " rows and ",
             ncol(gwas_stats), " columns for ", region, " (expected ",
             ncol(cols), " columns) from ", gwas_path, call. = FALSE)
    }
    colnames(gwas_stats) <- colnames(cols)

    # Build ids in both allele orders so they can be matched to QTL snp_ids
    # regardless of which allele the GWAS lists first.
    chrom_num <- gsub("chr", "", gwas_stats$chr)
    gwas_stats$a2a1 <- paste(chrom_num, gwas_stats$pos, gwas_stats$a2, gwas_stats$a1, sep = "_")
    gwas_stats$a1a2 <- paste(chrom_num, gwas_stats$pos, gwas_stats$a1, gwas_stats$a2, sep = "_")

    snp_file <- paste0(outdir, "/snps.txt")
    snps <- unique(c(qtl_stats$snp_id, gwas_stats$a2a1, gwas_stats$a1a2))
    writeLines(snps, snp_file)

    # Run a plink command, echoing it first; a non-zero exit status is reported
    # as a warning so the GWAS table is still returned to the caller.
    run_plink <- function(cmd) {
        message(cmd)
        status <- system(cmd)
        if (status != 0) {
            warning("extract_gwas: plink exited with status ", status,
                    " for cluster ", cluster, call. = FALSE)
        }
        invisible(status)
    }

    reference <- file.path(basedir,
                           "analyses/jennifer/gwas_independent/reference",
                           paste0(chrom, ".renamed"))
    out_prefix <- paste0(outdir, "/", cluster)

    # Step 1: subset the reference panel to the variants of interest.
    run_plink(paste(plink,
                    "--extract", snp_file,
                    "--bfile", reference,
                    "--out", out_prefix, "--make-bed"))

    # Step 2: pairwise r2 with no r2 threshold and wide windows.
    run_plink(paste(plink,
                    "--bfile", out_prefix,
                    "--r2 --ld-window-r2 0 --ld-window 99999 --ld-window-kb 10000",
                    "--out", out_prefix))

    gwas_stats
}

# Regional GWAS statistics (and LD files on disk) for both example clusters.
t2d_stats <- extract_gwas(ppc122_stats, "PPC_122", diabetes)
qrs_stats <- extract_gwas(cvpc274_stats, "CVPC_274", qrs)



chr7:26831935-29180776

/frazer01/software/plink-1.90b3x/plink --extract /projects/CARDIPS/analysis/epigenome_resource/analyses/tim/gwas_example/PPC_122/snps.txt --bfile /projects/CARDIPS/analysis/epigenome_resource//analyses/jennifer/gwas_independent/reference/chr7.renamed --out /projects/CARDIPS/analysis/epigenome_resource/analyses/tim/gwas_example/PPC_122/PPC_122 --make-bed

/frazer01/software/plink-1.90b3x/plink --bfile /projects/CARDIPS/analysis/epigenome_resource/analyses/tim/gwas_example/PPC_122/PPC_122 --r2 --ld-window-r2 0 --ld-window 99999 --ld-window-kb 10000 --out /projects/CARDIPS/analysis/epigenome_resource/analyses/tim/gwas_example/PPC_122/PPC_122

chr13:73840707-74046101

/frazer01/software/plink-1.90b3x/plink --extract /projects/CARDIPS/analysis/epigenome_resource/analyses/tim/gwas_example/CVPC_274/snps.txt --bfile /projects/CARDIPS/analysis/epigenome_resource//analyses/jennifer/gwas_independent/reference/chr13.renamed --out /projects/CARDIPS/analysis/epigenome_resourc

In [25]:
# QRS: take the a1_a2 ordering as snp_id and rename columns to the QTL layout.
qrs_stats$snp_id <- qrs_stats$a1a2
qrs_stats2 <- qrs_stats[, c("chr", "pos", "a1", "a2", "p", "snp_id")]
colnames(qrs_stats2) <- c("chrom", "pos", "ref", "alt", "pval", "snp_id")

## harmonize t2d stats
# Keep the GWAS rows whose id matches a QTL snp_id, separately for each allele
# ordering.
a2a1 <- t2d_stats[t2d_stats$a2a1 %in% ppc122_stats$snp_id, ]
a1a2 <- t2d_stats[t2d_stats$a1a2 %in% ppc122_stats$snp_id, ]

a2a1$snp_id <- a2a1$a2a1
a1a2$snp_id <- a1a2$a1a2

# For a2_a1 matches, a2 becomes ref and a1 becomes alt; for a1_a2 matches the
# reverse.
a2a1 <- a2a1[, c("chr", "pos", "a2", "a1", "p", "snp_id")]
a1a2 <- a1a2[, c("chr", "pos", "a1", "a2", "p", "snp_id")]

colnames(a2a1) <- c("chrom", "pos", "ref", "alt", "pval", "snp_id")
colnames(a1a2) <- c("chrom", "pos", "ref", "alt", "pval", "snp_id")

t2d_stats2 <- rbind(a2a1, a1a2)

In [26]:
# Build the table behind a regional Manhattan plot: GWAS and QTL p-values for
# one cluster, annotated with LD (r2) to the lead SNP. Despite its name the
# function does not draw anything; it returns the data.
#
# Args:
#   cluster:    cluster name; the LD file is read from
#               analyses/tim/gwas_example/<cluster>/<cluster>.ld, relative to
#               the working directory.
#   qtl_stats:  QTL statistics with chrom, pos, ref, alt, pval, snp_id,
#               element_id and topsnp ("TRUE"/"FALSE") columns.
#   gwas_stats: GWAS statistics with chrom, pos, ref, alt, pval, snp_id.
#   gwas_name:  label stored in element_id for the GWAS rows.
#
# Returns:
#   data.frame of GWAS + QTL rows whose snp_id occurs in qtl_stats, with R2,
#   topsnp_pos, LD_group and color columns; rows containing any NA are dropped.
plot_manhattan <- function(cluster, qtl_stats, gwas_stats, gwas_name) {
    topsnp <- unique(qtl_stats$snp_id[qtl_stats$topsnp == "TRUE"])
    # Everything below compares against a single lead SNP; with zero or several
    # the == comparisons would recycle and give meaningless LD assignments.
    if (length(topsnp) != 1) {
        stop("plot_manhattan: expected exactly one lead SNP for ", cluster,
             ", found ", length(topsnp), call. = FALSE)
    }

    ld_path <- paste0("analyses/tim/gwas_example/", cluster, "/", cluster, ".ld")
    cluster_ld <- fread(ld_path, sep = " ", data.table = FALSE) %>%
        filter(SNP_A %in% gwas_stats$snp_id & SNP_B %in% gwas_stats$snp_id)

    # Keep the pairs involving the lead SNP and record the partner variant.
    cluster_ld <- cluster_ld[cluster_ld$SNP_A == topsnp | cluster_ld$SNP_B == topsnp, ]
    cluster_ld$snp_id <- ifelse(cluster_ld$SNP_A == topsnp, cluster_ld$SNP_B, cluster_ld$SNP_A)
    cluster_ld$R2[cluster_ld$snp_id == topsnp] <- 1

    # One row per QTL record with its r2 to the lead SNP (NA when no pair exists).
    cluster_ld <- merge(qtl_stats, cluster_ld[, c("snp_id", "R2")],
                        by = "snp_id", all.x = TRUE)

    gwas_stats$element_id <- gwas_name
    gwas_stats$type <- "GWAS"

    qtl2bind <- qtl_stats[, c("chrom", "pos", "ref", "alt", "pval", "snp_id", "element_id")]
    # Element ids starting with "E" are labelled eQTL, ids containing "atac"
    # caQTL, everything else haQTL.
    qtl2bind$type <- ifelse(grepl("^E", qtl2bind$element_id), "eQTL",
                            ifelse(grepl("atac", qtl2bind$element_id), "caQTL", "haQTL"))

    manhattan_df <- as.data.frame(rbind(gwas_stats, qtl2bind))

    # Inner merge: only variants present in the QTL table survive; unique()
    # removes the duplicates created by variants tested for several elements.
    manhattan_df <- unique(merge(manhattan_df, cluster_ld[, c("snp_id", "R2")], by = "snp_id"))

    # The lead SNP has no pair with itself in the LD table, so it is given
    # r2 = 1 here, matched by the position parsed from its id.
    manhattan_df$topsnp_pos <- as.numeric(gsub("_", "", str_extract(topsnp, "_[0-9]*_")))
    manhattan_df$R2[manhattan_df$pos == manhattan_df$topsnp_pos] <- 1

    # Bin r2 into five right-closed intervals; NA r2 stays NA.
    ld_labels <- c("0-0.2", "0.2-0.4", "0.4-0.6", "0.6-0.8", "0.8-1.0")
    manhattan_df$LD_group <- as.character(
        cut(manhattan_df$R2,
            breaks = c(-Inf, 0.2, 0.4, 0.6, 0.8, Inf),
            labels = ld_labels)
    )

    ld_colors <- c("0-0.2"   = "darkblue",
                   "0.2-0.4" = "#94cdd5",
                   "0.4-0.6" = "green",
                   "0.6-0.8" = "orange",
                   "0.8-1.0" = "red")
    manhattan_df$color <- unname(ld_colors[manhattan_df$LD_group])

    manhattan_df[complete.cases(manhattan_df), ]
}

# Manhattan-plot tables for the two example clusters.
cvpc274_man <- plot_manhattan(
    "CVPC_274", cvpc274_stats, qrs_stats2, "QRS Duration"
)
ppc122_man <- plot_manhattan(
    "PPC_122", ppc122_stats, t2d_stats2, "Type 2 Diabetes"
)



In [27]:
# Write both tables as tab-separated source-data files (no row names, no quoting).
fwrite(
    cvpc274_man,
    "~/projects/Resource/iPSCORE_Multi-QTL_Resource/SourceData/SOURCEDATA.FIGURES11.txt",
    sep = "\t", row.names = FALSE, quote = FALSE
)
fwrite(
    ppc122_man,
    "~/projects/Resource/iPSCORE_Multi-QTL_Resource/SourceData/SOURCEDATA.FIGURE6C.txt",
    sep = "\t", row.names = FALSE, quote = FALSE
)

In [104]:
# Inspect the colocalization rows behind the two example clusters.
colnames(gwas_summary2)

# CVPC_274 against traits whose description matches "QRS".
gwas_summary2[gwas_summary2$Cluster_ID == "CVPC_274" & gwas_summary2$description %like% "QRS",
              c("element_id", "topsnp", "topsnp_pp", "PP.H4.abf", "Nominated")]

# PPC_122 against traits whose description matches "diabetes"
# (the original cell ran this identical query twice).
gwas_summary2[gwas_summary2$Cluster_ID == "PPC_122" & gwas_summary2$description %like% "diabetes",
              c("element_id", "topsnp", "topsnp_pp", "PP.H4.abf", "Nominated")]

Unnamed: 0_level_0,element_id,topsnp,topsnp_pp,PP.H4.abf,Nominated
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<lgl>
175666,cvpc_chip_peak_17305,VAR_13_73937854_G_C,0.3182383,0.9926283,True
184774,cvpc_chip_peak_17304,VAR_13_73937854_G_C,0.3135846,0.9962955,False
186759,cvpc_atac_peak_73241,VAR_13_73937854_G_C,0.3737284,0.9963989,False
188050,cvpc_chip_peak_17303,VAR_13_73937854_G_C,0.3046845,0.7390419,False


Unnamed: 0_level_0,element_id,topsnp,topsnp_pp,PP.H4.abf,Nominated
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<lgl>
459762,ppc_atac_peak_244305,VAR_7_28152661_C_T,0.6292117,0.9723332,True
482757,ENSG00000153814.13,VAR_7_28152661_C_T,0.8067442,0.9799599,False
495838,ppc_atac_peak_244298,VAR_7_28152661_C_T,0.6141469,0.9464068,False
