# GWAS Analysis

I want to see whether the eQTLs are enriched for GWAS hits.

In [19]:
import os
import subprocess

import pandas as pd
import pybedtools as pbt

import cardipspy as cpy
import ciepy

%matplotlib inline

In [2]:
import socket

# On the machine whose hostname is 'fl1', redirect pybedtools' temporary
# files to a project directory; on any other host the pybedtools default
# temp location is left untouched.
hostname = socket.gethostname()
if hostname == 'fl1':
    pbt.set_tempdir('/projects/cdeboever/pybedtools_temp')

In [3]:
# Result directories for this notebook under the project root: one in the
# 'output' tree and one in the 'private_output' tree.
notebook_name = 'gwas_analysis'
outdir = os.path.join(ciepy.root, 'output', notebook_name)
private_outdir = os.path.join(ciepy.root, 'private_output', notebook_name)

# NOTE(review): cpy.makedir presumably creates the directory when it is
# missing -- confirm against cardipspy.
for directory in (outdir, private_outdir):
    cpy.makedir(directory)

In [4]:
# Load the GWAS catalog and keep only rows that have both a chromosome and
# a position.
gwas = pd.read_table(cpy.gwas_catalog, low_memory=False)
gwas = gwas.dropna(subset=['CHR_ID', 'CHR_POS'])

# Normalise the coordinates: chromosome becomes a 'chr'-prefixed string and
# position an integer. The int round-trip on CHR_ID requires every remaining
# value to be numeric.
chrom_names = 'chr' + gwas['CHR_ID'].astype(int).astype(str)
positions = gwas['CHR_POS'].astype(int)
gwas['CHR_ID'] = chrom_names
gwas['CHR_POS'] = positions

# 'chrN:pos' key in the catalog's own coordinates.
gwas['hg38_loc'] = chrom_names + ':' + positions.astype(str)

## Convert GWAS Coordinates to hg19

The latest GWAS catalog from EBI has coordinates in hg38, so I'll
use `liftOver` to get hg19 coordinates.

In [5]:
# One BED record per catalog row: chrom, pos - 1, pos (a 1 bp interval whose
# end coordinate is the catalog position).
bed_records = (gwas.CHR_ID + '\t' +
               (gwas.CHR_POS - 1).astype(str) + '\t' +
               gwas.CHR_POS.astype(str))
lines = '\n'.join(bed_records) + '\n'

# Sort the intervals and write them to disk; hg38_bt ends up pointing at
# the saved file.
hg38_bed = os.path.join(outdir, 'hg38_gwas_snps.bed')
hg38_bt = pbt.BedTool(lines, from_string=True).sort().saveas(hg38_bed)

In [6]:
# Output files: lifted-over intervals and the records liftOver could not map.
hg19_bed = os.path.join(outdir, 'hg19_gwas_snps.bed')
hg19_unmapped = os.path.join(outdir, 'hg19_gwas_snps_unmapped.txt')

# hg38 -> hg19 chain file from the project's input data.
chain = os.path.join(ciepy.root, 'output', 'input_data',
                     'hg38ToHg19.over.chain')

# Command line: <liftOver> <input bed> <chain> <output bed> <unmapped file>.
liftover_args = (cpy.liftOver, hg38_bed, chain, hg19_bed, hg19_unmapped)
c = '{} {} {} {} {}'.format(*liftover_args)

# check_call raises CalledProcessError on a non-zero exit status; on success
# it returns 0, which is what the cell displays.
subprocess.check_call(c, shell=True)

0

In [7]:
# Keep every second line of the unmapped file, starting with the second.
# NOTE(review): presumably the skipped lines are liftOver's '#' reason
# comments and the kept lines are the BED records -- confirm.
with open(hg19_unmapped) as f:
    unmapped_records = f.readlines()[1::2]
missing = pbt.BedTool(''.join(unmapped_records), from_string=True)

# hg38 intervals that were lifted over, and their hg19 counterparts.
hg19_mapped = pbt.BedTool(hg19_bed)
hg38_mapped = hg38_bt.subtract(missing)

In [11]:
# hg38 'chrom:pos' key for every interval that was lifted over.
old_loc = ['{}:{}'.format(r.chrom, r.end) for r in hg38_mapped]

# Matching hg19 coordinates, collected in file order.
new_loc = []
new_chrom = []
new_pos = []
for r in hg19_mapped:
    new_loc.append('{}:{}'.format(r.chrom, r.end))
    new_chrom.append(r.chrom)
    new_pos.append(r.end)

# hg38 and hg19 records are paired purely by position, so the two files must
# contain the same number of records in the same order.
assert len(old_loc) == len(new_loc), (
    'hg38/hg19 record counts differ: {} vs {}'.format(len(old_loc),
                                                      len(new_loc)))

new_info = pd.DataFrame({'hg19_loc': new_loc,
                         'hg19_chrom': new_chrom,
                         'hg19_pos': new_pos}, index=old_loc)

# Nothing upstream de-duplicates catalog rows, so an hg38 location that
# occurs k times is present k times in both `gwas` and `new_info`. Joining
# on a key that is non-unique on both sides would emit k * k rows for it.
# Keep a single mapping row per hg38 location so the join preserves the
# number of catalog rows.
new_info = new_info[~new_info.index.duplicated()]

gwas.index = gwas.hg38_loc
gwas = gwas.join(new_info)

## Get Null Set

I need to obtain a set of SNVs to calculate a null distribution with. 
These null SNVs should not be significant for an eQTL and should be
matched based on

* distance to TSS
* MAF
* LD

My matching is based on Grubert et al. I'll stratify MAF from 0 to 0.5 in 
0.05 intervals. For TSS, I'll define the distribution by the log distance 
from all SNPs to the TSS, binned into 10 breaks. For LD, I'll calculate the
log number of SNPs tagged by each variant within 1 Mb with $R^2 \geq 0.8$.