# Processing gene-disease relationships from [DOAF](https://dx.doi.org/10.1371/journal.pone.0049686)

In [1]:
import pandas

In [2]:
# Load the gzipped, tab-delimited DOAF ID mappings.
# Source: http://doa.nubic.northwestern.edu/pages/download.php
# '#DOID' is read as a string so it can be concatenated with a text prefix below.
doaf_df = pandas.read_table(
    'download/IDMappings.txt.gz',
    compression='gzip',
    dtype={'#DOID': str},
)

# Rename the two identifier columns to the names used by the later cells.
column_renames = {'#DOID': 'doid_code', 'Gene ID': 'GeneID'}
doaf_df = doaf_df.rename(columns=column_renames)

# Prepend the 'DOID:' prefix to every disease identifier.
doaf_df['doid_code'] = 'DOID:' + doaf_df['doid_code']

In [3]:
# Preview the last rows of the DOAF table after renaming and prefixing.
doaf_df.tail()

Unnamed: 0,doid_code,GeneID,PubMed ID,GeneRIF Text
144230,DOID:9993,1636,20546161,A strong association was found between ACE gen...
144231,DOID:9993,1636,21289265,ACE DD genotype was associated with an approxi...
144232,DOID:9993,3033,21252247,We recommend that HADH sequence analysis is co...
144233,DOID:9993,150,21298412,Antecedent hypoglycaemia did not affect beta(2...
144234,DOID:9993,948,20947105,These results suggested that type I CD36 defic...


In [4]:
# Count how many rows of doaf_df support each (doid_code, GeneID) pair.
# GroupBy.size() counts rows per group directly, avoiding a Python-level
# lambda and a Series construction for every group; reset_index(name=...)
# turns the group keys back into columns and names the count column 'count'.
grouped_df = (
    doaf_df
    .groupby(['doid_code', 'GeneID'])
    .size()
    .reset_index(name='count')
)

In [5]:
# Load Disease Ontology term names from a commit-pinned copy of term-names.tsv.
url = 'https://raw.githubusercontent.com/dhimmel/disease-ontology/72614ade9f1cc5a5317b8f6836e1e464b31d5587/data/term-names.tsv'
do_df = pandas.read_table(url)

# Keep only rows whose `type` is 'name', retain the identifier and name
# columns, and rename them to match the column names used for merging.
do_df = (
    do_df
    .loc[do_df['type'] == 'name', ['doid', 'name']]
    .rename(columns={'doid': 'doid_code', 'name': 'doid_name'})
)

In [6]:
# Load human Entrez Gene records from a commit-pinned copy of genes-human.tsv.
url = 'https://raw.githubusercontent.com/dhimmel/entrez-gene/6e133f9ef8ce51a4c5387e58a6cc97564a66cec8/data/genes-human.tsv'
entrez_df = pandas.read_table(url)

# Restrict to protein-coding genes and keep only the ID and symbol columns.
is_protein_coding = entrez_df['type_of_gene'].eq('protein-coding')
entrez_df = entrez_df.loc[is_protein_coding, ['GeneID', 'Symbol']]

In [7]:
# Annotate the per-pair counts with gene symbols and disease names.
# Both merges use pandas' defaults: an inner join on the columns the two
# frames share. Pairs whose gene is absent from entrez_df or whose disease is
# absent from do_df are therefore dropped.
grouped_df = do_df.merge(entrez_df.merge(grouped_df))

# Order by disease, then by descending count, then by gene symbol.
# DataFrame.sort() was removed from pandas; sort_values() is its replacement
# and accepts the same column list and `ascending` flags.
grouped_df = grouped_df.sort_values(
    ['doid_code', 'count', 'Symbol'],
    ascending=[True, False, True],
)

In [8]:
# Preview the first rows of the annotated, sorted disease-gene table.
grouped_df.head()

Unnamed: 0,doid_code,doid_name,GeneID,Symbol,count
40612,DOID:0001816,angiosarcoma,302,ANXA2,1
40613,DOID:0001816,angiosarcoma,595,CCND1,1
40614,DOID:0001816,angiosarcoma,2324,FLT4,1
40615,DOID:0001816,angiosarcoma,3091,HIF1A,1
40616,DOID:0001816,angiosarcoma,3791,KDR,1


In [9]:
# Write the table as tab-separated text, omitting the DataFrame index.
grouped_df.to_csv('data/doaf.tsv', sep='\t', index=False)