# Playing with the data from [Nelson et al.](https://dx.doi.org/10.1038/ng.3314)

Paper: *The support of human genetic evidence for approved drug indications*.

In [1]:
import pandas

In [3]:
# Download data supplements S11-S14 into ./download (disabled; uncomment the lines below to fetch)
# for i in range(11, 15):
#     url = 'http://www.nature.com/ng/journal/v47/n8/extref/ng.3314-S{}.txt'.format(i)
#     ! wget --timestamping --directory-prefix download {url}

## Disease-target pairs

In [11]:
# Load the disease-target table from supplement file S13. `header=0` together
# with `names` discards the file's own header row and substitutes the
# snake_case labels below.
columns = [
    'gene_symbol',
    'mesh_name',
    'mesh_category',
    'phase',
    'approved',
]
target_df = pandas.read_table(
    'download/ng.3314-S13.txt',
    header=0,
    names=columns,
)

In [12]:
# Preview the first five disease-target rows.
target_df.head(n=5)

Unnamed: 0,gene_symbol,mesh_name,mesh_category,phase,approved
0,A2M,alzheimer disease,Nervous System Diseases,No Development Reported,False
1,A2M,brain ischemia,Nervous System Diseases,Discontinued,False
2,A2M,coronary restenosis,Cardiovascular Diseases,Discontinued,False
3,A2M,peripheral vascular diseases,Cardiovascular Diseases,Discontinued,False
4,A2M,thrombosis,Cardiovascular Diseases,Discontinued,False


In [21]:
# Rows flagged `approved` whose `mesh_name` is multiple sclerosis.
target_df[target_df['approved'] & target_df['mesh_name'].eq('multiple sclerosis')]

Unnamed: 0,gene_symbol,mesh_name,mesh_category,phase,approved
3774,CNR1,multiple sclerosis,Nervous System Diseases,Launched,True
3813,CNR2,multiple sclerosis,Nervous System Diseases,Launched,True
8810,IFNAR2,multiple sclerosis,Nervous System Diseases,Launched,True
12801,NR3C1,multiple sclerosis,Nervous System Diseases,Launched,True


## Gene-disease associations

In [43]:
# Load the gene-disease association table from supplement file S12 and map a
# subset of its column headers to snake_case names. Headers not listed in the
# mapping keep their original spelling.
assoc_df = (
    pandas.read_table('download/ng.3314-S12.txt')
    .rename(columns={
        'Disease': 'gwasdb_name',
        'Link': 'link',
        'Source': 'source',
        'MSH': 'mesh_name',
        'MSH.Top': 'mesh_category',
        'Gene': 'gene_symbol',
        'GeneScore': 'score',
        'Rank': 'rank',
        'snp_id': 'lead_snp',
        'snp.ld': 'linked_snp',
        'SNP.Trait.Cnt': 'count',
    })
)

In [33]:
# List the column labels of the association table.
# NOTE(review): the saved output below still shows 'MSH.Top', which the rename
# cell maps to 'mesh_category' -- it appears to predate that mapping; re-run
# the notebook top to bottom to refresh it.
assoc_df.columns

Index(['gwasdb_name', 'link', 'source', 'mesh_name', 'MSH.Top', 'gene_symbol',
       'score', 'rank', 'pval2', 'lead_snp', 'pvalue', 'count', 'linked_snp',
       'r2', 'eqtl', 'rdb', 'Cat.rdb', 'eCat', 'AAEffect', 'AAScore',
       'Gene.Trait.Cnt', 'OrphID'],
      dtype='object')

In [35]:
# Transposed preview of the first five records, so the wide table reads
# vertically (one displayed column per record).
assoc_df.head().T

Unnamed: 0,0,1,2,3,4
gwasdb_name,#104300 ALZHEIMER DISEASE; AD,#104300 ALZHEIMER DISEASE; AD,#104300 ALZHEIMER DISEASE; AD,#104300 ALZHEIMER DISEASE; AD,#104300 ALZHEIMER DISEASE; AD
link,OMIM:104300,OMIM:104300,OMIM:104300,OMIM:104300,OMIM:104300
source,Omim,Omim,Omim,Omim,Omim
mesh_name,alzheimer disease,alzheimer disease,alzheimer disease,alzheimer disease,alzheimer disease
MSH.Top,Nervous System Diseases,Nervous System Diseases,Nervous System Diseases,Nervous System Diseases,Nervous System Diseases
gene_symbol,ABCA2,ANAPC2,ARVCF,C8G,C9orf139
score,6,1,2,0,1
rank,1,2,3,6,3
pval2,1e-08,1e-08,1e-08,1e-08,1e-08
lead_snp,rs908832,rs908832,rs165599,rs908832,rs908832


In [38]:
# Number of association rows contributed by each source.
assoc_df['source'].value_counts()

GWAS:A             6214
GWAS:B             5602
OMIM               2797
JohnsonOdonnell     905
GWASCentral         790
Omim                301
dbGaP                43
dtype: int64

In [40]:
# Keep only associations whose `source` is in the set below. Of the values
# listed by `source.value_counts()`, only 'OMIM' is absent from this set; the
# differently cased 'Omim' is kept.
# NOTE(review): presumably the 'OMIM' exclusion is deliberate -- confirm.
include_sources = {
    'GWAS:A',
    'GWAS:B',
    'JohnsonOdonnell',
    'GWASCentral',
    'Omim',
    'dbGaP',
}
assoc_df = assoc_df.loc[assoc_df['source'].isin(include_sources)]

In [44]:
# Associations for multiple sclerosis whose `rank` equals 1. The `rank` column
# needs bracket access because `assoc_df.rank` is the DataFrame method.
assoc_df[assoc_df['mesh_name'].eq('multiple sclerosis') & assoc_df['rank'].eq(1)]

Unnamed: 0,gwasdb_name,link,source,mesh_name,mesh_category,gene_symbol,score,rank,pval2,lead_snp,...,linked_snp,r2,eqtl,rdb,Cat.rdb,eCat,AAEffect,AAScore,Gene.Trait.Cnt,OrphID
8446,Multiple sclerosis,PUBMEDID:17660530,JohnsonOdonnell,multiple sclerosis,Nervous System Diseases,GPR149,1,1,5.39e-16,rs1356122,...,rs34792862,0.81,no,no,,9,,1.0,2,
8447,Multiple sclerosis,PUBMEDID:17660530,JohnsonOdonnell,multiple sclerosis,Nervous System Diseases,M6PR,2,1,4.89e-14,rs1805755,...,rs1805755,1.0,no,no,,9,,1.0,2,
8451,Multiple sclerosis,PUBMEDID:18997785,GWAS:A,multiple sclerosis,Nervous System Diseases,KIF1B,6,1,3e-10,rs10492972,...,rs10492972,1.0,yes,no,,2,,1.0,2,
8456,Multiple sclerosis,PUBMEDID:19525953,GWAS:A,multiple sclerosis,Nervous System Diseases,C1orf137,4,1,3e-10,rs2300747,...,rs6677309,0.922,no,yes,4,4,,1.0,8,
8458,Multiple sclerosis,PUBMEDID:19525953,GWAS:A,multiple sclerosis,Nervous System Diseases,CD6,4,1,3.79e-09,rs17824933,...,rs17824933,1.0,no,no,,9,,1.0,2,
8470,Multiple sclerosis,PUBMEDID:19525955,GWAS:A,multiple sclerosis,Nervous System Diseases,METTL21B,6,1,5e-11,rs703842,...,rs10877014,1.0,yes,no,,2,,1.0,4,
8474,Multiple sclerosis,PUBMEDID:19525955,GWAS:A,multiple sclerosis,Nervous System Diseases,SLC26A10,4,1,2.7e-10,rs10876994,...,rs1871417,0.704,yes,no,,2,,1.0,4,
8481,Multiple sclerosis,PUBMEDID:20159113,GWAS:A,multiple sclerosis,Nervous System Diseases,STAT3,4,1,3e-10,rs744166,...,rs744166,1.0,no,no,,9,,1.0,4,
8484,Multiple sclerosis,PUBMEDID:20453840,GWAS:A,multiple sclerosis,Nervous System Diseases,CBLB,4,1,2e-10,rs9657904,...,rs9657904,1.0,no,no,,9,,1.0,2,
8493,Multiple sclerosis,PUBMEDID:21833088,GWAS:A,multiple sclerosis,Nervous System Diseases,CD86,4,1,1e-11,rs9282641,...,rs9282641,1.0,no,no,,9,,1.0,3,
