In [1]:
import pandas

import hetio.graph

In [2]:
def rawgit(handle, repo, commit, *args):
    """Return the URL of a raw file in a GitHub repository.

    `handle`, `repo` and `commit` select the account, the repository and the
    revision. Any additional positional arguments are path components that
    are appended, in order, separated by '/'.
    """
    components = ['https://raw.githubusercontent.com', handle, repo, commit]
    components.extend(args)
    return '/'.join(components)

In [3]:
# Each metaedge tuple is (source kind, target kind, edge kind, direction).
# Every metaedge defined here uses the 'both' direction, so only the three
# varying fields are listed and the direction is appended.
metaedge_tuples = [
    (source, target, kind, 'both')
    for source, target, kind in [
        ('compound', 'disease', 'indication'),
        ('compound', 'gene', 'target'),
        ('gene', 'gene', 'interaction'),
        ('gene', 'pathway', 'participation'),
        ('disease', 'gene', 'association'),
        ('disease', 'symptom', 'causation'),
    ]
]

# Build the schema from the metaedges, then an empty graph that follows it.
metagraph = hetio.graph.MetaGraph.from_edge_tuples(metaedge_tuples)
graph = hetio.graph.Graph(metagraph)

## Gene Nodes

In [4]:
# Read the human gene table from the entrez-gene repository at a pinned commit.
commit = '6e133f9ef8ce51a4c5387e58a6cc97564a66cec8'
url = rawgit('dhimmel', 'entrez-gene', commit, 'data/genes-human.tsv')
gene_df = pandas.read_table(url)

# Restrict the table to protein-coding genes.
gene_df = gene_df.loc[gene_df['type_of_gene'] == 'protein-coding']
gene_df.head()

Unnamed: 0,tax_id,GeneID,Symbol,chromosome,map_location,type_of_gene,description
0,9606,1,A1BG,19,19q13.4,protein-coding,alpha-1-B glycoprotein
1,9606,2,A2M,12,12p13.31,protein-coding,alpha-2-macroglobulin
3,9606,9,NAT1,8,8p22,protein-coding,N-acetyltransferase 1 (arylamine N-acetyltrans...
4,9606,10,NAT2,8,8p22,protein-coding,N-acetyltransferase 2 (arylamine N-acetyltrans...
6,9606,12,SERPINA3,14,14q32.1,protein-coding,"serpin peptidase inhibitor, clade A (alpha-1 a..."


In [5]:
# Add one 'gene' node per row, identified by GeneID and named by Symbol.
# gene_df is already restricted to protein-coding genes in the cell that
# loads it, so no per-row type check is needed here.
for i, row in gene_df.iterrows():
    graph.add_node(kind='gene', identifier=row['GeneID'], name=row['Symbol'])

## Disease Nodes

In [6]:
# Read the slim-terms table from the disease-ontology repository at a pinned commit.
commit = '72614ade9f1cc5a5317b8f6836e1e464b31d5587'
url = rawgit('dhimmel', 'disease-ontology', commit, 'data/slim-terms.tsv')
disease_df = pandas.read_table(url)
# Preview the first rows of the table.
disease_df.head()

Unnamed: 0,doid,name,source,pathophysiology
0,DOID:2531,Hematologic cancer,DOcancerslim,neoplastic
1,DOID:1319,Brain cancer,DOcancerslim,neoplastic
2,DOID:1324,Lung cancer,DOcancerslim,neoplastic
3,DOID:263,Kidney cancer,DOcancerslim,neoplastic
4,DOID:1793,Pancreatic cancer,DOcancerslim,neoplastic


In [7]:
# Add one 'disease' node per row, identified by its doid.
# The name is read with row['name'], not row.name: on the Series yielded by
# iterrows(), the .name attribute is the row's index label rather than the
# value of the 'name' column.
for i, row in disease_df.iterrows():
    graph.add_node(kind='disease', identifier=row['doid'], name=row['name'])

## Compound Nodes

In [8]:
# Read the drugbank-slim table from the drugbank repository at a pinned commit.
commit = '3e87872db5fca5ac427ce27464ab945c0ceb4ec6'
url = rawgit('dhimmel', 'drugbank', commit, 'data/drugbank-slim.tsv')
compound_df = pandas.read_table(url)
# Preview the first rows of the table.
compound_df.head()

Unnamed: 0,drugbank_id,name,type,groups,atc_codes,categories,inchikey,inchi
0,DB00014,Goserelin,small molecule,approved,L02AE03,,InChIKey=BLCLNMBMMGCOAS-URPVMXJPSA-N,InChI=1S/C59H84N18O14/c1-31(2)22-40(49(82)68-3...
1,DB00035,Desmopressin,small molecule,approved,H01BA02,Antidiuretic Agents|Hemostatics|Renal Agents,InChIKey=NFLWUMRGJYTJIN-NXBWRCJVSA-N,InChI=1S/C46H64N14O12S2/c47-35(62)15-14-29-40(...
2,DB00050,Cetrorelix,small molecule,approved|investigational,H01CC02,Hormone Antagonists|Fertility Agents,InChIKey=SBNPWPIBESPSIF-MHWMIDJBSA-N,InChI=1S/C70H92ClN17O14/c1-39(2)31-52(61(94)82...
3,DB00091,Cyclosporine,small molecule,approved|investigational,L04AD01|S01XA18,Antirheumatic Agents|Dermatologic Agents|Immun...,InChIKey=PMATZTZNYRCHOR-IMVLJIQENA-N,InChI=1/C62H111N11O12/c1-25-27-28-40(15)52(75)...
4,DB00093,Felypressin,small molecule,approved,,Vasoconstrictor Agents|Renal Agents,InChIKey=SFKQVVDKFKYTNA-YVGXZPIDNA-N,InChI=1/C46H65N13O11S2/c47-18-8-7-14-29(40(64)...


In [9]:
# Add one 'compound' node per row, identified by its drugbank_id and carrying
# the InChIKey and InChI strings as node data.
# The name is read with row['name'], not row.name: on the Series yielded by
# iterrows(), the .name attribute is the row's index label rather than the
# value of the 'name' column.
for i, row in compound_df.iterrows():
    data = {'inchikey': row['inchikey'], 'inchi': row['inchi']}
    graph.add_node(kind='compound', identifier=row['drugbank_id'], name=row['name'], data=data)

## Symptom Nodes

In [10]:
# Read the symptoms table from the mesh repository at a pinned commit.
commit = 'a7036a37302973b15ab949aab4056d9bc062910e'
url = rawgit('dhimmel', 'mesh', commit, 'data/symptoms.tsv')
symptom_df = pandas.read_table(url)
# Preview the first rows of the table.
symptom_df.head()

Unnamed: 0,mesh_id,mesh_name,in_hsdn
0,D000006,"Abdomen, Acute",1
1,D000270,Adie Syndrome,0
2,D000326,Adrenoleukodystrophy,0
3,D000334,Aerophagy,1
4,D000370,Ageusia,1


In [11]:
# Add one 'symptom' node per row, identified by mesh_id and named by mesh_name.
for i, row in symptom_df.iterrows():
    graph.add_node(
        kind='symptom',
        identifier=row['mesh_id'],
        name=row['mesh_name'],
    )

## Pathway Nodes and Edges

In [12]:
# Read the pathways table from the pathways repository at a pinned commit.
commit = '032036f91a8395eabd0dab2d9d1ee3252ba140f8'
url = rawgit('dhimmel', 'pathways', commit, 'data/pathways.tsv')
pathway_df = pandas.read_table(url)

# Keep only pathways with more than one coding gene.
pathway_df = pathway_df.loc[pathway_df['n_coding_genes'] > 1]
pathway_df.tail(2)

Unnamed: 0,identifier,name,url,n_genes,n_coding_genes,source,genes,coding_genes
1617,WP1946,Cori Cycle(Homo sapiens),http://wikipathways.org/instance/WP1946,16,16,wikipathways,5223|5214|6514|5230|2875|5232|2821|6888|2539|6...,6513|5223|5214|3098|6514|5230|2875|5232|2821|6...
1618,WP2118,Arrhythmogenic Right Ventricular Cardiomyopath...,http://wikipathways.org/instance/WP2118,74,74,wikipathways,775|782|6546|3685|6445|93589|59283|781|1829|10...,51176|775|6444|782|3694|3728|6932|786|6546|71|...


In [13]:
# Add one 'pathway' node per row, then connect it to each of its coding genes.
for i, row in pathway_df.iterrows():
    pathway_id = row['identifier']
    # The name is read with row['name'], not row.name: on the Series yielded
    # by iterrows(), the .name attribute is the row's index label rather than
    # the value of the 'name' column.
    graph.add_node(kind='pathway', identifier=pathway_id, name=row['name'])

    # coding_genes is a '|'-delimited string; each entry is converted to int
    # to form the identifier of the gene node it refers to.
    for gene in row['coding_genes'].split('|'):
        gene = int(gene)
        source_id = 'gene', gene
        target_id = 'pathway', pathway_id
        graph.add_edge(source_id, target_id, 'participation', 'both')

## Disease-Gene Associations

In [14]:
# Read the gene-associations table from the gwas-catalog repository at a pinned commit.
commit = '0617ea7ea8268f21f5ca1b8dbe487dd12671fc7b'
url = rawgit('dhimmel', 'gwas-catalog', commit, 'data/gene-associations.tsv')
gwas_df = pandas.read_table(url)

# Keep only the associations whose status is 'HC-P'.
gwas_df = gwas_df.loc[gwas_df['status'] == 'HC-P']
gwas_df.tail(2)

Unnamed: 0,doid_code,doid_name,locus,high_confidence,primary,status,gene,symbol
5895,DOID:0050156,idiopathic pulmonary fibrosis,0,1,1,HC-P,54472,TOLLIP
5896,DOID:0050156,idiopathic pulmonary fibrosis,2,1,1,HC-P,7015,TERT


In [15]:
# Number of associations remaining after the status filter.
gwas_df.shape[0]

1284

In [16]:
# Add one disease-gene 'association' edge per retained row.
# Node ids are (kind, identifier) pairs.
for i, row in gwas_df.iterrows():
    graph.add_edge(
        ('disease', row['doid_code']),
        ('gene', row['gene']),
        'association',
        'both',
    )

## Drug Targets

In [17]:
# Read the proteins table from the drugbank repository at a pinned commit.
commit = '3e87872db5fca5ac427ce27464ab945c0ceb4ec6'
url = rawgit('dhimmel', 'drugbank', commit, 'data/proteins.tsv')

# merge() with no key arguments joins on the column the two frames share
# (drugbank_id), so only rows for compounds present in compound_df are kept.
drugbank_protein_df = pandas.read_table(url).merge(compound_df[['drugbank_id']])

# Keep only proteins whose gene is present in gene_df.
drugbank_protein_df = drugbank_protein_df.loc[
    drugbank_protein_df['entrez_gene_id'].isin(gene_df['GeneID'])
]
drugbank_protein_df.head(2)

Unnamed: 0,drugbank_id,category,uniprot_id,entrez_gene_id,organism,known_action,actions,pubmed_ids
0,DB00170,target,P00734,2147,Human,unknown,activator,10469489|2484931|2922761|6867080|7657295
1,DB00170,target,P00742,2159,Human,unknown,activator,17139284|17016423


In [18]:
# Select the rows whose category is 'target' and add one compound-gene
# 'target' edge for each of them.
target_df = drugbank_protein_df.loc[drugbank_protein_df['category'] == 'target']
for i, row in target_df.iterrows():
    graph.add_edge(
        ('compound', row['drugbank_id']),
        ('gene', row['entrez_gene_id']),
        'target',
        'both',
    )

## Summary

In [19]:
# Print the number of nodes of each kind.
metanode_to_nodes = graph.get_metanode_to_nodes()
for metanode in metanode_to_nodes:
    node_count = len(metanode_to_nodes[metanode])
    print(metanode, node_count)

symptom 438
disease 137
gene 20971
compound 1552
pathway 1615


In [21]:
# Print the number of edges of each kind, with inverted edges excluded.
metaedge_to_edges = graph.get_metaedge_to_edges(exclude_inverts=True)
for metaedge in metaedge_to_edges:
    edge_count = len(metaedge_to_edges[metaedge])
    print(metaedge, edge_count)

compound - target - gene 4603
gene - interaction - gene 0
gene - participation - pathway 79101
disease - causation - symptom 0
disease - association - gene 1284
compound - indication - disease 0
