# Common SNPs per gene in ExAC

In [2]:
# url = 'ftp://ftp.broadinstitute.org/pub/ExAC_release/release0.3/ExAC.r0.3.sites.vep.vcf.gz'
# ! wget --timestamping --no-verbose --directory-prefix download {url}

2015-08-18 11:00:59 URL: ftp://ftp.broadinstitute.org/pub/ExAC_release/release0.3/ExAC.r0.3.sites.vep.vcf.gz [1102] -> "download/.listing" [1]
2015-08-18 11:26:00 URL: ftp://ftp.broadinstitute.org/pub/ExAC_release/release0.3/ExAC.r0.3.sites.vep.vcf.gz [3176043421] -> "download/ExAC.r0.3.sites.vep.vcf.gz" [1]


In [74]:
import csv
import gzip
import re

import numpy
import pandas

In [61]:
# Names given, in order, to the leading tab-separated fields of each VCF data
# line; exac_row_generator zips these with the split line to build a row dict.
columns = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']

# List of annotation field names. It is not referenced by any other code
# visible in this notebook.
# NOTE(review): these look like the '|'-separated VEP "CSQ" subfields stored in
# the INFO column -- confirm against the "##INFO=<ID=CSQ" header line of the VCF.
# NOTE(review): the variable name is presumably a misspelling of "unknown"; it
# is left unchanged so that any reference outside this view keeps working.
unkown = [
    'Allele',
    'Gene',
    'Feature',
    'Feature_type',
    'Consequence',
    'cDNA_position',
    'CDS_position',
    'Protein_position',
    'Amino_acids',
    'Codons',
    'Existing_variation',
    'ALLELE_NUM',
    'DISTANCE',
    'STRAND',
    'SYMBOL',
    'SYMBOL_SOURCE',
    'HGNC_ID',
    'BIOTYPE',
    'CANONICAL',
    'CCDS',
    'ENSP',
    'SWISSPROT',
    'TREMBL',
    'UNIPARC',
    'SIFT',
    'PolyPhen',
    'EXON',
    'INTRON',
    'DOMAINS',
    'HGVSc',
    'HGVSp',
    'GMAF',
    'AFR_MAF',
    'AMR_MAF',
    'ASN_MAF',
    'EUR_MAF',
    'AA_MAF',
    'EA_MAF',
    'CLIN_SIG',
    'SOMATIC',
    'PUBMED',
    'MOTIF_NAME',
    'MOTIF_POS',
    'HIGH_INF_POS',
    'MOTIF_SCORE_CHANGE',
    'LoF_info',
    'LoF_flags',
    'LoF_filter',
    'LoF',
]

In [63]:
def exac_row_generator(path):
    """Yield parsed records from a gzip-compressed ExAC VCF file.

    This is a deliberately minimal parser, not a general VCF reader. Lines
    starting with '#' (meta and header lines) are skipped. For every other
    line a 2-tuple is yielded:

    * ``top_dict`` -- the tab-separated fields of the line keyed by the
      module-level ``columns`` list. All values are strings and ``INFO`` is
      kept as the raw, unparsed string.
    * ``semicolon_dict`` -- the ``key=value`` entries found in the part of
      INFO that precedes the first '|' character. Each value is split on ','
      into a list of strings. Entries without an '=' (flags) are skipped.
      Note that the last entry is cut off at the first '|'.

    Parameters
    ----------
    path : str
        Path to the gzip-compressed VCF file.

    Yields
    ------
    tuple of (dict, dict)
        ``(top_dict, semicolon_dict)`` as described above.

    Raises
    ------
    KeyError
        If a data line has too few tab-separated fields to contain ``INFO``.
    """
    # The context manager closes the file even if the consumer stops iterating
    # early (generator closed / garbage collected) or an exception is raised.
    with gzip.open(path, 'rt') as read_file:
        for line in read_file:
            if line.startswith('#'):
                continue
            # Strip only the line terminator: a bare rstrip() would also eat
            # trailing tabs and thereby drop empty trailing columns.
            row = line.rstrip('\r\n').split('\t')
            top_dict = dict(zip(columns, row))
            # Only the text before the first '|' is parsed as key=value pairs.
            info_head = top_dict['INFO'].partition('|')[0]
            semicolon_dict = dict()
            for kvpair in info_head.split(';'):
                # Split on the first '=' only, so a value that itself contains
                # '=' is kept instead of being silently discarded.
                key, separator, value = kvpair.partition('=')
                if not separator:
                    # Flag-style entry (no value) or empty string: skip it.
                    continue
                semicolon_dict[key] = value.split(',')
            yield top_dict, semicolon_dict

In [70]:
def get_allele_frequency(x):
    """Return the element of ``x`` whose value lies closest to 0.5.

    ``x`` is a list or array of numbers. When several elements are equally
    close to 0.5, the first one is returned.
    """
    freqs = numpy.array(x)
    distance_from_half = numpy.abs(freqs - 0.5)
    closest_index = distance_from_half.argmin()
    return freqs[closest_index]

In [78]:
# Build a table with one row per (variant, Ensembl gene) pair for variants that
# pass the VCF filters and are common enough.

# A variant is kept when its selected allele frequency lies within
# [maj_af_min, 1 - maj_af_min].
maj_af_min = 0.05
path = 'download/ExAC.r0.3.sites.vep.vcf.gz'
rows = []
for top_dict, semicolon_dict in exac_row_generator(path):

    # Quality control: keep only records whose FILTER field is exactly PASS
    if top_dict['FILTER'] != 'PASS':
        continue

    # Frequency check: of the AF values listed for the site, use the one
    # closest to 0.5 and require it to fall inside the accepted range
    allele_freqs = [float(af) for af in semicolon_dict['AF']]
    allele_freq = get_allele_frequency(allele_freqs)
    if not (maj_af_min <= allele_freq <= 1 - maj_af_min):
        continue

    # Extract every distinct Ensembl gene ID mentioned anywhere in the raw INFO
    # string and append one row per gene. The IDs are sorted because iteration
    # order of a set of strings can differ between interpreter runs, which
    # would make the row order of the resulting table non-reproducible.
    for gene in sorted(set(re.findall(r'ENSG[0-9]+', top_dict['INFO']))):
        row = top_dict['CHROM'], top_dict['POS'], allele_freq, gene
        rows.append(row)

exac_df = pandas.DataFrame(rows, columns=['chromosome', 'position', 'allele_freq', 'ensembl_gene_id'])

In [79]:
# Preview the first three rows to sanity-check the parsed columns.
exac_df.head(3)

Unnamed: 0,chromosome,position,allele_freq,ensembl_gene_id
0,1,13417,0.067,ENSG00000223972
1,1,69270,0.681,ENSG00000186092
2,1,69511,0.894,ENSG00000186092


In [80]:
# Number of rows in exac_df (the filtering loop appends one row per
# variant-gene pair that it keeps).
len(exac_df)

109473

In [86]:
# Write the filtered table as a tab-separated file; index=False leaves out the
# DataFrame's row index.
# NOTE(review): assumes the data/ directory already exists -- confirm.
exac_df.to_csv('data/exac-filtered.tsv', index=False, sep='\t')

In [82]:
# Read Ensembl to Entrez gene mapping: download the cross-reference table
# (pinned to a specific commit), keep only the Ensembl rows and the two ID
# columns, and rename those columns to the names used in this notebook.
url = 'https://raw.githubusercontent.com/dhimmel/entrez-gene/6e133f9ef8ce51a4c5387e58a6cc97564a66cec8/data/xrefs-human.tsv'
column_names = {'GeneID': 'entrez_gene_id', 'identifier': 'ensembl_gene_id'}
entrez_map_df = (
    pandas.read_table(url)
    .query("resource == 'Ensembl'")
    .loc[:, ['GeneID', 'identifier']]
    .rename(columns=column_names)
)
entrez_map_df.head(2)

Unnamed: 0,entrez_gene_id,ensembl_gene_id
2,1,ENSG00000121410
7,2,ENSG00000175899


In [88]:
# Attach Entrez gene IDs. No `on=` is given, so pandas joins on the column
# names the two frames share, and the default inner join drops rows whose
# Ensembl gene ID has no entry in entrez_map_df. This overwrites exac_df.
exac_df = exac_df.merge(entrez_map_df)

In [97]:
def get_snps_per_gene(df):
    """Count the distinct (chromosome, position) pairs in ``df``.

    Returns a one-element pandas Series labelled 'snps' holding that count.
    """
    unique_sites = df.drop_duplicates(subset=['chromosome', 'position'])
    snp_count = len(unique_sites)
    return pandas.Series({'snps': snp_count})

# One row per Entrez gene with its number of distinct variant sites.
count_df = (
    exac_df
    .groupby('entrez_gene_id')
    .apply(get_snps_per_gene)
    .reset_index()
)
count_df.head(2)

Unnamed: 0,entrez_gene_id,snps
0,1,5
1,2,13


In [98]:
# Write the per-gene SNP counts as a tab-separated file; index=False leaves out
# the DataFrame's row index.
# NOTE(review): assumes the data/ directory already exists -- confirm.
count_df.to_csv('data/exac-counts.tsv', index=False, sep='\t')