In [1]:
import sys

import pandas

# local imports
sys.path.insert(0, '../')
import utils

## Read DO Slim

In [2]:
# Fixed commit of the dhimmel/disease-ontology repository to read from.
commit = '72614ade9f1cc5a5317b8f6836e1e464b31d5587'
# NOTE(review): utils.rawgit is a project helper; presumably it builds the raw-file URL — confirm.
url = utils.rawgit('dhimmel', 'disease-ontology', commit, 'data/slim-terms.tsv')
# Load the slim-terms table and keep only the identifier and name columns,
# relabelled with the doid_* names used by the rest of the notebook.
disease_df = (
    pandas.read_table(url)
    .rename(columns={'doid': 'doid_id', 'name': 'doid_name'})
    .loc[:, ['doid_id', 'doid_name']]
)
disease_df.head(n=2)

Unnamed: 0,doid_id,doid_name
0,DOID:2531,Hematologic cancer
1,DOID:1319,Brain cancer


## Read Entrez Gene

In [3]:
# Fixed commit of the dhimmel/entrez-gene repository to read from.
commit = '6e133f9ef8ce51a4c5387e58a6cc97564a66cec8'
url = utils.rawgit('dhimmel', 'entrez-gene', commit, 'data/genes-human.tsv')
# Restrict the gene table to protein-coding entries, then keep only the
# identifier and symbol under the column names shared across this notebook.
gene_df = (
    pandas.read_table(url)
    .query("type_of_gene == 'protein-coding'")
    .rename(columns={'GeneID': 'entrez_gene_id', 'Symbol': 'gene_symbol'})
    .loc[:, ['entrez_gene_id', 'gene_symbol']]
)
gene_df.head(n=2)

Unnamed: 0,entrez_gene_id,gene_symbol
0,1,A1BG
1,2,A2M


## Read datasets

In [4]:
# DISEASES: associations already carry doid_id / entrez_gene_id columns,
# so no renaming is needed after loading.
commit = 'e0089ef89a56348d7d4e0684a9c51c5747b16237'
url = utils.rawgit('dhimmel', 'diseases', commit, 'data/merged-slim.tsv')
# read_csv with a tab separator is the same parser call as read_table.
diseases_df = pandas.read_csv(url, sep='\t')
diseases_df.head(n=2)

Unnamed: 0,doid_id,doid_name,entrez_gene_id,gene_symbol,score_text,score_knowledge,score_cosmic,score_distild,score_integrated_no_distild,score_integrated
0,DOID:13223,uterine fibroid,60,ACTB,0.8,,,,0.8,0.8
1,DOID:13223,uterine fibroid,71,ACTG1,0.8,,,,0.8,0.8


In [5]:
# DOAF: load the table and relabel its disease and gene identifier columns
# to the doid_id / entrez_gene_id names used by the other resources.
commit = 'bbe1c326aa385416e36d02b144e89e2b99e700b6'
url = utils.rawgit('dhimmel', 'doaf', commit, 'data/doaf.tsv')
doaf_df = pandas.read_table(url).rename(
    columns={'doid_code': 'doid_id', 'GeneID': 'entrez_gene_id'},
)
doaf_df.head(n=3)

Unnamed: 0,doid_id,doid_name,entrez_gene_id,Symbol,count
0,DOID:0001816,angiosarcoma,302,ANXA2,1
1,DOID:0001816,angiosarcoma,595,CCND1,1
2,DOID:0001816,angiosarcoma,2324,FLT4,1


In [6]:
# DisGeNET: load the consolidated table and relabel its disease and gene
# identifier columns to the doid_id / entrez_gene_id names used elsewhere.
commit = 'fdc5f42f2da745cbf71d7b4cc5021de5685e4a11'
url = utils.rawgit('dhimmel', 'disgenet', commit, 'data/consolidated.tsv')
disgenet_df = pandas.read_table(url).rename(
    columns={'doid_code': 'doid_id', 'geneId': 'entrez_gene_id'},
)
disgenet_df.head(n=2)

Unnamed: 0,doid_id,doid_name,entrez_gene_id,geneSymbol,count,pubmeds_max,score_max,score_mean,associationType,source
0,DOID:0050156,idiopathic pulmonary fibrosis,729238,SFTPA2,1,1,0.620284,0.620284,Biomarker|GeneticVariation,BeFree|CLINVAR|CTD_human|UNIPROT
1,DOID:0050156,idiopathic pulmonary fibrosis,7015,TERT,1,10,0.422153,0.422153,Biomarker|GeneticVariation,BeFree|CLINVAR|CTD_human|GAD|LHGDN


In [7]:
# hetio GWAS: load the gene-association table from the gwas-catalog repository
# and relabel its identifier columns to doid_id / entrez_gene_id.
commit = '0617ea7ea8268f21f5ca1b8dbe487dd12671fc7b'
url = utils.rawgit('dhimmel', 'gwas-catalog', commit, 'data/gene-associations.tsv')
gwas_df = pandas.read_table(url).rename(
    columns={'doid_code': 'doid_id', 'gene': 'entrez_gene_id'},
)
gwas_df.head(n=2)

Unnamed: 0,doid_id,doid_name,locus,high_confidence,primary,status,entrez_gene_id,symbol
0,DOID:9970,obesity,0,1,1,HC-P,3953,LEPR
1,DOID:9970,obesity,14,1,1,HC-P,4094,MAF


## Filters

In [8]:
# Keep only the associations that clear each resource's cutoff.
# DISEASES: integrated score (excluding DistiLD) of at least 2.
diseases_df = diseases_df[diseases_df['score_integrated_no_distild'] >= 2]
# DOAF: count of at least 3 (bracket access, since `.count` is also a DataFrame method).
doaf_df = doaf_df[doaf_df['count'] >= 3]
# DisGeNET: maximum score of at least 0.06.
disgenet_df = disgenet_df[disgenet_df['score_max'] >= 0.06]
# GWAS: rows whose status is 'HC-P' (presumably high-confidence primary — TODO confirm).
gwas_df = gwas_df.query("status == 'HC-P'")

## Combine

In [9]:
# Tag every association with the resource it came from and that resource's license.
#
# The frames at this point are filtered selections of the loaded tables, and
# item assignment (df['col'] = ...) on such a selection makes pandas emit
# SettingWithCopyWarning. `.assign` returns a new frame with the extra columns,
# so no warning is raised and re-running the cell yields the same result.
diseases_df = diseases_df.assign(provenance='DISEASES', license='CC BY 4.0')
# DOAF gets an empty license string; `condense` discards '' when it
# consolidates the licenses of a disease–gene pair.
doaf_df = doaf_df.assign(provenance='DOAF', license='')
disgenet_df = disgenet_df.assign(provenance='DisGeNET', license='ODbL 1.0')
gwas_df = gwas_df.assign(provenance='GWAS Catalog', license='CC BY 4.0')

In [10]:
# Reduce each resource to the shared identifier and attribution columns and
# stack them into one long table (one row per resource-specific association).
dfs = [
    frame.loc[:, ['doid_id', 'entrez_gene_id', 'provenance', 'license']]
    for frame in (diseases_df, doaf_df, disgenet_df, gwas_df)
]
concat_df = pandas.concat(dfs)
# Inner-join against the gene table first, then the disease table; the keys
# below are the only columns the respective frames have in common.
concat_df = gene_df.merge(concat_df, on='entrez_gene_id')
concat_df = disease_df.merge(concat_df, on='doid_id')
# Number of retained associations contributed by each resource.
concat_df['provenance'].value_counts()

DisGeNET        7552
DISEASES        4990
DOAF            1649
GWAS Catalog    1284
Name: provenance, dtype: int64

In [11]:
def condense(df):
    """Consolidate multiple associations into a single Series.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for one disease–gene pair. Must provide a ``provenance`` column
        and a ``license`` column of strings.

    Returns
    -------
    pandas.Series
        Object-dtype Series with two entries:

        * ``sources`` -- the provenance values joined with ``|`` in row order.
        * ``license`` -- the single license shared by all rows, ignoring
          empty strings; ``None`` when no license remains or when the rows
          carry more than one distinct license.
    """
    licenses = set(df.license)
    # An empty string marks a resource without a recorded license; it should
    # not prevent the remaining license from being reported.
    licenses.discard('')
    # Only an unambiguous (exactly one) license is propagated.
    shared_license = next(iter(licenses)) if len(licenses) == 1 else None
    # Build the Series in one step with an explicit dtype: a bare
    # pandas.Series() warns on pandas 1.x and its default dtype is
    # version-dependent.
    return pandas.Series(
        {'sources': '|'.join(df.provenance), 'license': shared_license},
        dtype=object,
    )

# Collapse the long table to one row per disease–gene pair.
short_df = (
    concat_df
    .groupby(['doid_id', 'entrez_gene_id'])
    .apply(condense)
    .reset_index()
)
# Re-attach gene symbols and disease names, which the grouping dropped; the
# keys below are the only columns the respective frames have in common.
short_df = gene_df.merge(short_df, on='entrez_gene_id')
short_df = disease_df.merge(short_df, on='doid_id')
short_df.head()

Unnamed: 0,doid_id,doid_name,entrez_gene_id,gene_symbol,sources,license
0,DOID:2531,Hematologic cancer,25,ABL1,DISEASES|DisGeNET,
1,DOID:2531,Hematologic cancer,27,ABL2,DisGeNET,ODbL 1.0
2,DOID:2531,Hematologic cancer,54,ACP5,DISEASES,CC BY 4.0
3,DOID:2531,Hematologic cancer,113,ADCY7,DisGeNET,ODbL 1.0
4,DOID:2531,Hematologic cancer,142,PARP1,DISEASES|DisGeNET,


In [12]:
# Write the consolidated associations as a tab-separated file without the row index.
short_df.to_csv(path_or_buf='DaG-association.tsv', sep='\t', index=False)