### Prepare tables to run fGWAS fine-mapping with chromatin annotations using Bill's pipeline

In [3]:
suppressPackageStartupMessages(library("tidyr"))
# Heavily penalise scientific notation so numbers are printed and written in
# fixed notation (presumably so genomic positions are never emitted as e.g.
# 1e+06 in the exported tables -- TODO confirm).
options(scipen=999)

In [4]:
suppressPackageStartupMessages(library("stringr"))

In [5]:
# Working directory for the preprocessing step; the relative file names used
# by the read/write calls in this notebook resolve against it.
home <- "/home/paola/Family1070/private_output/fgwas_analysis/PR_interval/preprocessing/"
setwd(dir = home)

In [20]:
# Load the tab-separated PR-interval GWAS summary statistics (first line is
# the header).
gwas <- read.table(
  file = "/publicdata/gwas_summary_stats_20180124/vanSetten_2018/PR_interval_July2018_summary_results.hg19.txt",
  header = TRUE,
  sep = "\t"
)

In [28]:
# Preview the first rows of the GWAS table.
head(gwas)

SNPID,CHR,POS_hg18,A1,A2,F,Beta,SE,P_VALUE,POS,Z,N,POS_1
rs10,7,92221824,A,C,0.053,-0.4522,0.4288,0.29153,92383888,-1.0545709,.,92383887
rs1000000,12,125456933,A,G,0.224,0.1481,0.1377,0.28237,126890980,1.0755265,.,126890979
rs10000010,4,21227772,C,T,0.506,-0.1659,0.1177,0.15858,21618674,-1.4095157,.,21618673
rs10000012,4,1347325,G,C,0.137,0.0552,0.1652,0.73806,1357325,0.3341404,.,1357324
rs10000013,4,36901464,C,A,0.222,-0.179,0.1407,0.20309,37225069,-1.2722104,.,37225068
rs10000017,4,84997149,T,C,0.224,0.2565,0.1613,0.11182,84778125,1.5902046,.,84778124


In [22]:
# Standardise the names of the ten columns read from the summary-statistics
# file. Only the first ten names are assigned, so re-running this cell after
# the derived columns (Z, N, POS_1) have been appended does not replace their
# names with NA.
colnames(gwas)[1:10] = c("SNPID", "CHR", "POS_hg18", "A1", "A2", "F", "Beta", "SE", "P_VALUE", "POS")

In [23]:
# Columns of the GWAS table that are carried into the annotated fgwas input.
required <- c(
  "SNPID", "CHR", "POS",
  "SE", "Z", "F", "N"
)

In [24]:
# Z-score is the effect size divided by its standard error.
gwas[["Z"]] <- with(gwas, Beta / SE)
# N is filled with the placeholder "." for every SNP.
gwas[["N"]] <- "."

In [25]:
# Position minus one; used as the start coordinate of the exported BED file.
gwas[["POS_1"]] <- gwas[["POS"]] - 1

In [26]:
# Count rows whose SNP identifier is missing.
sum(is.na(gwas$SNPID))

Annotate with regulatory regions and ASE

In [29]:
# Export the SNPs as a headerless, tab-separated, 4-column table
# (CHR, POS_1, POS, SNPID) that is later handed to annotate_gwas().
write.table(
  x = gwas[, c("CHR", "POS_1", "POS", "SNPID")],
  file = "PR_Interval.bed",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)

In [30]:
# Annotate SNPs with the genomic annotations they overlap.
#
# Runs `bedtools intersect -wo` with `regiofile` as -a and `annofile` as -b,
# then reshapes the overlaps to one row per SNP and one column per annotation.
#
# Args:
#   regiofile: path to the SNP BED file. It is expected to have 4 columns,
#              because columns 1-4 of the intersect output are treated as the
#              SNP record (CHR, START, POS, SNPID).
#   annofile:  path to the annotation BED file.
#
# Returns:
#   A data frame whose first four columns are CHR, START, POS and SNPID,
#   followed by one column per distinct value found in intersect column 8.
#   Cells without an overlap are 0. Only SNPs reported by bedtools (i.e. with
#   at least one overlap) are present.
#
# Side effects:
#   Writes the raw bedtools output to "intesect_results" in the working
#   directory. Stops with an error if bedtools fails, if there are no
#   overlaps, or if the output has fewer than 9 columns.
annotate_gwas = function( regiofile, annofile ){
    # Quote the paths so names containing spaces or shell metacharacters
    # survive; expand "~" first because the shell does not expand it inside
    # quotes.
    cmd = paste("bedtools intersect -a", shQuote(path.expand(regiofile)),
                "-b", shQuote(path.expand(annofile)),
                "-wo > intesect_results")
    status = system(cmd)
    if (status != 0) {
        stop("bedtools intersect failed with exit status ", status, call. = FALSE)
    }
    if (!isTRUE(file.size("intesect_results") > 0)) {
        stop("bedtools intersect reported no overlaps between ", regiofile,
             " and ", annofile, call. = FALSE)
    }

    annot = read.table("intesect_results")
    if (ncol(annot) < 9) {
        stop("expected at least 9 columns in the bedtools output, found ",
             ncol(annot), call. = FALSE)
    }

    # Keep the SNP record (V1-V4) plus the key/value pair that is spread into
    # annotation columns.
    # NOTE(review): assumes V8 holds the annotation name and V9 the value to
    # place in the cell -- confirm against the layout of `annofile`.
    annot = annot[, c(1:4, 8, 9)]
    # De-duplicate AFTER dropping the annotation interval coordinates: a SNP
    # overlapping two different intervals of the same annotation yields rows
    # that differ only in those coordinates, and spread() fails on duplicated
    # keys.
    annot = annot[!duplicated(annot), ]

    annot_wide <- spread(annot, V8, V9)
    annot_wide[is.na(annot_wide)] <- 0
    colnames(annot_wide)[1:4] = c("CHR", "START", "POS", "SNPID")
    return(annot_wide)
}

In [31]:
# Build the SNP-by-annotation table from the exported SNP BED file and the
# annotation BED file.
anno <- annotate_gwas(
  regiofile = "PR_Interval.bed",
  annofile = "frazer_lab_cm_annotations.bed"
)

In [32]:
# Preview the first rows of the annotation table.
head(anno)

CHR,START,POS,SNPID,ATAC,H3K27AC,NKX25
7,40590568,40590569,rs1000013,0,1,0
4,7399668,7399669,rs10000132,0,1,0
4,77356245,77356246,rs10000169,0,1,0
3,176467877,176467878,rs1000021,0,0,1
4,7399791,7399792,rs10000236,1,1,1
4,53507944,53507945,rs10000311,0,1,0


In [33]:
# Left-join the annotations onto the GWAS table: every GWAS row is kept, and
# rows without a match in `anno` get NA in the annotation columns.
m <- merge(
  x = gwas,
  y = anno,
  by = c("CHR", "POS", "SNPID"),
  all.x = TRUE
)

In [34]:
# Number of rows and columns in the annotation table.
dim(anno)

In [35]:
# SNPs that overlap no annotation come out of the left join with NA in the
# columns contributed by `anno`; set those to 0. Only those columns are
# touched, so genuinely missing GWAS statistics (e.g. SE, P_VALUE, Z) are not
# silently converted to 0.
anno_cols = intersect(setdiff(colnames(anno), c("CHR", "POS", "SNPID")), colnames(m))
for (anno_col in anno_cols) {
    m[[anno_col]][is.na(m[[anno_col]])] = 0
}

In [47]:
# Number of rows and columns in the merged table.
dim(m)

In [37]:
# SNP identifier lists, one entry per line, used below to build the
# NKX25_ASE and H3K27AC_ASE indicator columns.
nkx_ase <- readLines(con = "NKX25_ASE_rsids.txt")
h3k_ase <- readLines(con = "H3K27AC_ASE_rsids.txt")

In [38]:
# 1/0 indicators: is the SNP identifier present in the corresponding ASE list?
m[["NKX25_ASE"]] <- as.numeric(is.element(m[["SNPID"]], nkx_ase))
m[["H3K27AC_ASE"]] <- as.numeric(is.element(m[["SNPID"]], h3k_ase))

In [39]:
# Drop rows that are exact duplicates of an earlier row.
m <- unique(m)

In [53]:
# Count SNPs that share a genomic position with an earlier row. The check is
# keyed on chromosome AND position (matching the de-duplication performed
# afterwards); position alone would also count coordinates that merely
# coincide on different chromosomes.
sum(duplicated(gwas[, c("CHR", "POS")]))

Remove duplicated SNPs (merged / renamed SNPs)

In [55]:
# Keep one record per CHR/POS: sort by SE first so that, for each position,
# the row with the smallest standard error is the one retained, then put the
# table back in CHR/POS order.
m <- m[order(m$SE), ]
m <- m[!duplicated(m[c("CHR", "POS")]), ]
m <- m[order(m$CHR, m$POS), ]

remove chr Y and M

In [61]:
# Discard variants located on chromosomes M and Y.
m <- m[!(m$CHR %in% c("M", "Y")), ]

In [62]:
# Minimal table of the original association results (id, position, p-value).
mori <- m[, c("SNPID", "CHR", "POS", "P_VALUE")]

In [63]:
# Write the original association results, tab-separated with a header, one
# directory above the preprocessing folder.
write.table(
  x = mori,
  file = "../Original_input",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)

In [64]:
# fgwas input table: the required GWAS columns followed by the five
# annotation indicator columns.
mano <- m[, c(required, "ATAC", "H3K27AC", "NKX25", "NKX25_ASE", "H3K27AC_ASE")]

In [65]:
# Write the annotated table, space-separated with a header.
write.table(
  x = mano,
  file = "Annotated_input",
  sep = " ",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)

Remember to run the fgwas command to determine the chunk size for the analysis (-k).
For this analysis, fgwas k = 910 (5,000 / 5.5 Mb).

### Read Results

In [67]:
# Print the current working directory; the relative paths used to read the
# results resolve against it.
getwd()

In [70]:
# Load the per-SNP (.bfs) and per-chunk (.segbfs) results of the final model.
res <- read.table(file = "../pipeline_out/Final_Model/Final_Model.bfs", header = TRUE)
seg <- read.table(file = "../pipeline_out/Final_Model/Final_Model.segbfs", header = TRUE)

# Attach each chunk's PPA to its SNPs; merge() suffixes the two PPA columns
# as PPA.x (from res) and PPA.y (from seg).
res <- merge(x = res, y = seg[, c("chunk", "PPA")], by = "chunk")

# Combined PPA is the product of the two, and the table is ranked by it from
# highest to lowest.
res[["PPA"]] <- res[["PPA.x"]] * res[["PPA.y"]]
res <- res[order(res[["PPA"]], decreasing = TRUE), ]

In [72]:
# Restrict the results to variants flagged as NKX25 ASE (rows where the flag
# is NA are dropped, as subset() would do).
nkx <- res[which(res$NKX25_ASE == 1), ]

In [75]:
# Show the first 15 rows of the NKX25 ASE subset.
head(nkx,15)

Unnamed: 0,chunk,id,chr,pos,logBF,Z,V,pi,pseudologPO,pseudoPPA,PPA.x,H3K27AC,NKX25_ASE,NKX25,ATAC,H3K27AC_ASE,PPA.y,PPA
2341288,2566,rs3807989,7,116186241,146.708,17.5306,0.0134792,-2.02771,144.24,1.0,0.999995,1,1,1,1,0,1.0,0.999995
597343,658,rs7132327,12,115381071,67.4388,12.0529,0.0170303,-4.52247,62.4012,1.0,0.969691,0,1,1,0,0,1.0,0.969691
1247554,1368,rs13006682,2,106104856,5.86094,4.08401,0.0184145,-2.34105,3.05718,0.955091,0.653746,1,1,1,1,0,0.986751,0.6450845
130962,144,rs1418191,1,173362457,3.11821,-3.26809,0.0332698,-2.03826,0.638889,0.654502,0.660973,1,1,1,1,0,0.846322,0.559396
1929874,2118,rs12332381,5,65814898,3.07196,3.21691,0.0137124,-1.90065,0.742694,0.677585,0.5691,1,1,1,1,0,0.879236,0.5003732
1985058,2179,rs10519804,5,124532423,3.16703,3.2643,0.0231344,-2.39752,0.303474,0.575291,0.594834,1,1,1,1,0,0.811354,0.4826209
1453875,1594,rs2830965,21,28847771,5.15492,-3.90072,0.0231344,-3.94986,0.695024,0.667084,0.536971,0,1,1,1,0,0.889835,0.4778156
1970936,2163,rs9326874,5,112420588,3.59564,-3.39832,0.014161,-2.68774,0.427587,0.605297,0.466696,1,1,1,1,0,0.87112,0.4065482
1499544,1645,rs2267469,22,43306428,4.08924,-3.65643,0.0534072,-3.26159,0.329126,0.581547,0.42138,1,1,1,0,0,0.873911,0.3682486
1640303,1797,rs7615524,3,139477714,2.63143,-3.05623,0.0173186,-2.15856,0.0223599,0.50559,0.457073,1,1,1,1,0,0.804987,0.3679378


There are several compelling examples of novel sub-threshold loci that could be mediated by NKX2-5; the most interesting are:
- rs13006682 / FHL2: the gene has its highest expression in heart ventricle (GTEx) and is linked to cardiac hypertrophy (the KO mouse has no phenotype, though) - the SNP disrupts a TBX motif
- rs2267469 / PACSIN2: the gene colocalizes with CAV1 in the caveolae and has a heart-weight phenotype in the mouse inventory - the SNP disrupts an NKX2-5 motif and loops directly to the promoter of PACSIN2

**Check the new SNP/gene associations in CARDIPS**

There is no eQTL for these genes.