In [1]:
import pandas

In [2]:
# Load EBI's GWAS Catalog (with ontology annotations) from the gzipped,
# tab-separated download kept in the local download/ directory
path = 'download/gwas_catalog_v1.0.1-downloaded_2015-06-08.tsv.gz'
ebi_df = pandas.read_csv(
    path,
    sep='\t',
    compression='gzip',
    low_memory=False,  # parse the file in one pass rather than in chunks
)

In [3]:
# Preview the first rows of the catalog to check that the columns parsed as expected
ebi_df.head()

Unnamed: 0,DATE ADDED TO CATALOG,PUBMEDID,FIRST AUTHOR,DATE,JOURNAL,LINK,STUDY,DISEASE/TRAIT,INITIAL SAMPLE DESCRIPTION,REPLICATION SAMPLE DESCRIPTION,...,RISK ALLELE FREQUENCY,P-VALUE,PVALUE_MLOG,P-VALUE (TEXT),OR or BETA,95% CI (TEXT),PLATFORM [SNPS PASSING QC],CNV,MAPPED_TRAIT,MAPPED_TRAIT_URI
0,,25231870,Perry JR,23-Jul-2014,Nature,http://europepmc.org/abstract/MED/25231870,Parent-of-origin-specific allelic associations...,Menarche (age at onset),"Up to 182,413 European ancestry women",,...,0.47,2e-12,11.69897000433602,,0.04,[0.03-0.05] (unit increase),"Illumina & Affymetrix [2,441,815] (imputed)",N,age at menarche,http://www.ebi.ac.uk/efo/EFO_0004703
1,,25231870,Perry JR,23-Jul-2014,Nature,http://europepmc.org/abstract/MED/25231870,Parent-of-origin-specific allelic associations...,Menarche (age at onset),"Up to 182,413 European ancestry women",,...,0.29,7e-20,19.154901959985743,,0.05,[0.038-0.062] (unit increase),"Illumina & Affymetrix [2,441,815] (imputed)",N,age at menarche,http://www.ebi.ac.uk/efo/EFO_0004703
2,,25231870,Perry JR,23-Jul-2014,Nature,http://europepmc.org/abstract/MED/25231870,Parent-of-origin-specific allelic associations...,Menarche (age at onset),"Up to 182,413 European ancestry women",,...,0.29,7e-20,19.154901959985743,,0.05,[0.038-0.062] (unit increase),"Illumina & Affymetrix [2,441,815] (imputed)",N,age at menarche,http://www.ebi.ac.uk/efo/EFO_0004703
3,,25231870,Perry JR,23-Jul-2014,Nature,http://europepmc.org/abstract/MED/25231870,Parent-of-origin-specific allelic associations...,Menarche (age at onset),"Up to 182,413 European ancestry women",,...,0.46,3e-08,7.522878745280337,,0.03,[0.02-0.04] (unit increase),"Illumina & Affymetrix [2,441,815] (imputed)",N,age at menarche,http://www.ebi.ac.uk/efo/EFO_0004703
4,,25231870,Perry JR,23-Jul-2014,Nature,http://europepmc.org/abstract/MED/25231870,Parent-of-origin-specific allelic associations...,Menarche (age at onset),"Up to 182,413 European ancestry women",,...,0.46,3e-08,7.522878745280337,,0.03,[0.02-0.04] (unit increase),"Illumina & Affymetrix [2,441,815] (imputed)",N,age at menarche,http://www.ebi.ac.uk/efo/EFO_0004703


In [4]:
# Create a uri_df (for cross-references)
# Each distinct, non-null MAPPED_TRAIT_URI is split into the ontology prefix
# ('resource') and the term identifier ('resource_id') taken from the text
# after its last '/', e.g. '.../EFO_0004703' -> ('EFO', '0004703').
# NOTE(review): only the text after the last '/' is parsed, so a cell holding
# several URIs would be keyed by its final term only -- confirm the catalog
# never does this.
rows = list()
# Iterate in sorted order: looping over a set of strings yields a different
# order on every interpreter run, which made the row order of uri_df (and of
# anything derived from it) irreproducible.
for uri in sorted(ebi_df['MAPPED_TRAIT_URI'].dropna().unique()):
    _, tail = uri.rsplit('/', 1)
    resource, resource_id = tail.split('_', 1)
    rows.append([uri, resource, resource_id])

uri_df = pandas.DataFrame(rows, columns=['MAPPED_TRAIT_URI', 'resource', 'resource_id'])

In [5]:
# Read DO Slim propagated cross-references
url = 'https://raw.githubusercontent.com/dhimmel/disease-ontology/72614ade9f1cc5a5317b8f6836e1e464b31d5587/data/xrefs-prop-slim.tsv'
doxref_df = pandas.read_table(url)

In [6]:
# Attach DO slim terms to the GWAS catalog with two inner joins.
# First join: parsed trait URIs against the DO cross-references, matching on
# whatever columns the two frames have in common; only the URI and the DO
# code/name are kept.
map_df = pandas.merge(uri_df, doxref_df, how='inner')
map_df = map_df.loc[:, ['MAPPED_TRAIT_URI', 'doid_code', 'doid_name']]
# Second join: catalog rows against that URI -> DO mapping; associations whose
# trait has no DO slim term are dropped.
ebi_df = pandas.merge(ebi_df, map_df, how='inner')
# Number of associations remaining after the join
len(ebi_df)

10342

In [7]:
# Show all ebi_df columns: the catalog's own columns followed by the
# doid_code and doid_name columns contributed by the DO slim merge
ebi_df.columns

Index(['DATE ADDED TO CATALOG', 'PUBMEDID', 'FIRST AUTHOR', 'DATE', 'JOURNAL',
       'LINK', 'STUDY', 'DISEASE/TRAIT', 'INITIAL SAMPLE DESCRIPTION',
       'REPLICATION SAMPLE DESCRIPTION', 'REGION', 'CHR_ID', 'CHR_POS',
       'REPORTED GENE(S)', 'MAPPED_GENE', 'UPSTREAM_GENE_ID',
       'DOWNSTREAM_GENE_ID', 'SNP_GENE_IDS', 'UPSTREAM_GENE_DISTANCE',
       'DOWNSTREAM_GENE_DISTANCE', 'STRONGEST SNP-RISK ALLELE', 'SNPS',
       'MERGED', 'SNP_ID_CURRENT', 'CONTEXT', 'INTERGENIC',
       'RISK ALLELE FREQUENCY', 'P-VALUE', 'PVALUE_MLOG', 'P-VALUE (TEXT)',
       'OR or BETA', '95% CI (TEXT)', 'PLATFORM [SNPS PASSING QC]', 'CNV',
       'MAPPED_TRAIT', 'MAPPED_TRAIT_URI', 'doid_code', 'doid_name'],
      dtype='object')

In [9]:
# Write all lead SNPs for SNAP input
def _to_rsid(snp_id):
    """Return the 'rs'-prefixed identifier for one SNP_ID_CURRENT value.

    Integer-valued floats are rendered without a trailing '.0', so a column
    that pandas parsed as float still yields 'rs123' rather than 'rs123.0'.
    Any other value is formatted unchanged.
    NOTE(review): the dtype of SNP_ID_CURRENT is not visible here -- confirm
    whether the float case actually occurs in this catalog release.
    """
    if isinstance(snp_id, float) and snp_id.is_integer():
        snp_id = int(snp_id)
    return 'rs{}'.format(snp_id)

# Distinct rs identifiers of every association that has a current SNP id
gwas_snps = set(_to_rsid(x) for x in ebi_df.SNP_ID_CURRENT if pandas.notnull(x))
with open('data/snap/do-slim-lead-SNPs.txt', 'w') as write_file:
    # Sort before writing: set iteration order is arbitrary, and an unsorted
    # dump would change the file on every run even when the SNPs are the same.
    write_file.write('\n'.join(sorted(gwas_snps)))
# Number of distinct lead SNPs written
len(gwas_snps)

5255