# Common SNPs per gene in ExAC

In [1]:
# url = 'ftp://ftp.broadinstitute.org/pub/ExAC_release/release0.3/ExAC.r0.3.sites.vep.vcf.gz'
# ! wget --timestamping --no-verbose --directory-prefix download {url}

In [2]:
import re
import numpy
import pandas
import vcf

## Helper functions

In [3]:
pattern = re.compile('ENSG[0-9]+')

def genes_in_csq(csq):
    """
    Collect the distinct Ensembl gene IDs mentioned in CSQ annotations.

    csq is an iterable of strings. Every substring matching `pattern`
    (``ENSG`` followed by digits) in any element is gathered into one set,
    so an ID seen several times appears once. An empty iterable yields an
    empty set.
    """
    matches_per_entry = (pattern.findall(entry) for entry in csq)
    return set().union(*matches_per_entry)

In [4]:
def get_allele_frequency(x):
    """
    Pick the allele frequency closest to 0.5.

    x is a list or array of allele frequencies. When several values are
    equally close to 0.5, the first of them is returned.
    """
    freqs = numpy.array(x)
    distance_from_half = numpy.abs(freqs - 0.5)
    nearest = distance_from_half.argmin()
    return freqs[nearest]

## Parse

In [5]:
# Options
# Location of the gzipped ExAC release 0.3 sites VCF read by the parser
path = 'download/ExAC.r0.3.sites.vep.vcf.gz'
# Frequency bound: a variant is kept only when its selected allele frequency
# lies within [maj_af_min, 1 - maj_af_min]
maj_af_min = 0.05

In [6]:
# rows collects one (chromosome, position, allele_freq, gene) tuple per
# variant-gene pair; bed_rows collects one interval per retained variant.
rows = []
bed_rows = []
for record in vcf.Reader(filename=path, prepend_chr=True):

    # Skip records whose FILTER field is non-empty and records that are not SNPs
    if record.FILTER or not record.is_snp:
        continue

    # Keep the variant only when the allele frequency nearest 0.5 falls
    # inside [maj_af_min, 1 - maj_af_min]
    allele_freq = get_allele_frequency(record.INFO['AF'])
    if not (maj_af_min <= allele_freq <= 1 - maj_af_min):
        continue

    chrom, pos = record.CHROM, record.POS

    # BED-style interval spanning (POS - 1, POS)
    bed_rows.append((chrom, pos - 1, pos))

    # One output row for every Ensembl gene ID found in the CSQ annotations;
    # a variant without any gene ID contributes no rows here
    rows.extend(
        (chrom, pos, allele_freq, gene)
        for gene in genes_in_csq(record.INFO.get('CSQ', []))
    )

exac_columns = ['chromosome', 'position', 'allele_freq', 'ensembl_gene_id']
exac_df = pandas.DataFrame(rows, columns=exac_columns)
bed_df = pandas.DataFrame(bed_rows, columns=['chrom', 'chromStart', 'chromEnd'])

In [7]:
# Preview the first three variant-gene rows
exac_df.head(3)

Unnamed: 0,chromosome,position,allele_freq,ensembl_gene_id
0,chr1,69270,0.681,ENSG00000186092
1,chr1,69511,0.894,ENSG00000186092
2,chr1,69761,0.113,ENSG00000186092


In [8]:
# Preview the first three BED intervals
bed_df.head(3)

Unnamed: 0,chrom,chromStart,chromEnd
0,chr1,69269,69270
1,chr1,69510,69511
2,chr1,69760,69761


In [9]:
# Number of variant-gene rows (a variant appears once per annotated gene)
len(exac_df)

97165

In [10]:
# Number of retained variants (one BED interval each)
len(bed_df)

82728

In [11]:
# Write the variant-gene table as tab-separated text, omitting the index column
exac_df.to_csv('data/exac-filtered.tsv', index=False, sep='\t')

In [12]:
# Write the intervals as tab-separated text with neither header row nor index column
bed_df.to_csv('data/exac-filtered.bed', index=False, header=False, sep='\t')

In [13]:
# Load the human gene cross-reference table from a pinned commit, keep only
# the Ensembl rows, and reduce it to an Entrez <-> Ensembl ID lookup
url = 'https://raw.githubusercontent.com/dhimmel/entrez-gene/6e133f9ef8ce51a4c5387e58a6cc97564a66cec8/data/xrefs-human.tsv'
entrez_map_df = (
    pandas.read_table(url)
    .query("resource == 'Ensembl'")
    [['GeneID', 'identifier']]
    .rename(columns={'GeneID': 'entrez_gene_id', 'identifier': 'ensembl_gene_id'})
)
entrez_map_df.head(2)

Unnamed: 0,entrez_gene_id,ensembl_gene_id
2,1,ENSG00000121410
7,2,ENSG00000175899


In [14]:
# Attach Entrez gene IDs by joining on the columns the two tables share
# (default inner join, so rows without a mapping are dropped)
exac_df = pandas.merge(exac_df, entrez_map_df)

In [15]:
def get_snps_per_gene(df):
    """
    Count the distinct (chromosome, position) pairs in df.

    Returns a one-element Series labelled 'snps' holding that count, so
    that applying it per group yields a 'snps' column.
    """
    unique_sites = df.drop_duplicates(subset=['chromosome', 'position'])
    n_snps = len(unique_sites)
    return pandas.Series({'snps': n_snps})

# Tally distinct SNP sites for each Entrez gene, then turn the group key
# back into an ordinary column
count_df = (
    exac_df
    .groupby('entrez_gene_id')
    .apply(get_snps_per_gene)
    .reset_index()
)
count_df.head(2)

Unnamed: 0,entrez_gene_id,snps
0,1,4
1,2,11


In [16]:
# Write the per-gene SNP counts as tab-separated text, omitting the index column
count_df.to_csv('data/exac-counts.tsv', index=False, sep='\t')