# Source edge contribution to epilepsy predictions

In [1]:
import functools

import pandas
import requests

from neo4j.v1 import GraphDatabase

In [2]:
# Compound identifier/name lookup table, read from a commit-pinned copy of
# compounds.tsv. Only the two columns needed for the merge in the next cell
# are kept.
url = 'https://github.com/dhimmel/learn/raw/d2251a942813015d0362a90f179c961016336e77/summary/compounds.tsv'
compound_columns = ['compound_id', 'compound_name']
compound_df = pandas.read_table(url)
compound_df = compound_df[compound_columns]

In [3]:
# Read the local table of compounds, rename its `name` column so it lines up
# with `compound_df`, and merge so that every row gains its `compound_id`.
# The merge uses pandas' defaults (inner join on the shared column names).
windows_df = pandas.read_table('./data/windows.tsv')
windows_df = windows_df.rename(columns={'name': 'compound_name'})
top_compounds_df = windows_df.merge(compound_df)
top_compounds_df.head(2)

Unnamed: 0,compound_name,prediction,disease_pctl,phcodb,trials,category,min_pred,max_pred,freq_AIGD,freq_IGD,freq_UNKD,compound_id
0,Topiramate,0.603,1.0,DM,35,AIGD,0.46,0.603,1.0,0.0,0.0,DB00273
1,Ethotoin,0.589,0.9993,,0,AIGD,0.434,0.603,1.0,0.0,0.0,DB00754


In [4]:
# Metapath definitions, downloaded from a commit-pinned copy of metapaths.json.
url = 'https://github.com/dhimmel/learn/raw/d2251a942813015d0362a90f179c961016336e77/all-features/data/metapaths.json'
response = requests.get(url)
# Surface a failed download as an HTTP error here instead of as a JSON
# decoding error or a KeyError further down.
response.raise_for_status()
metapaths = response.json()
# Metapath abbreviation -> Cypher text of its DWPC query.
metapath_to_cypher = {m['abbreviation']: m['dwpc_query'] for m in metapaths}
# Metapath abbreviation -> middle piece of its first edge description once
# split on ' - ' (used by run_query as the edge label, e.g. 'binds').
metapath_to_first_metaedge = {m['abbreviation']: m['edges'][0].split(' - ')[1] for m in metapaths}

# Cypher skeleton used by get_dwpc_query to rewrite a metapath's DWPC query.
# `{head}` receives the query text that precedes its RETURN keyword, with
# `, n0, n1` appended; `{tail}` receives the query's original return
# expressions, which end up listed after the first-edge columns added here
# (endpoint identifiers and names of the first edge, plus its relationship type).
# NOTE(review): appending `, n0, n1` presumably extends a trailing WITH clause
# so both nodes stay in scope — the query text is not visible in this
# notebook; confirm against metapaths.json.
template = '''
{head}, n0, n1
RETURN
  n0.identifier AS n0_id,
  type(head(relationships(path))) AS e1_type,
  n1.identifier AS n1_id,
  n0.name AS n0_name,
  n1.name AS n1_name,
{tail}
'''

@functools.lru_cache()
def get_dwpc_query(metapath):
    """
    Build the first-edge specific DWPC query for a metapath abbreviation.

    The metapath's stored DWPC query is cut at its 'RETURN' keyword and the
    two pieces are spliced into `template`. Results are memoized per
    abbreviation. Raises KeyError for an unknown abbreviation and ValueError
    when the stored query does not contain exactly one 'RETURN'.
    """
    before_return, after_return = metapath_to_cypher[metapath].split('RETURN')
    return template.format(
        head=before_return.rstrip(),
        tail=after_return.lstrip(),
    )

In [5]:
# Connect to the Neo4j server at neo4j.het.io over Bolt and open a single
# session. run_query sends every query through this module-level `session`,
# which is closed in the last cell of the notebook.
driver = GraphDatabase.driver("bolt://neo4j.het.io")
session = driver.session()

In [6]:
def run_query(compound_id, disease_id, metapath, contribution=0):
    """
    Run the first-edge specific DWPC query for one compound-disease pair.

    compound_id and disease_id are passed to the query as its `source` and
    `target` parameters, together with a fixed `w` of 0.4. The query is sent
    through the module-level `session`.

    Returns a DataFrame holding the query's columns plus:
      - metapath: the metapath abbreviation (inserted as the first column)
      - prediction_contrib: `contribution` divided among the rows in
        proportion to each row's share of the summed DWPC
      - relationship: source name, first metaedge and neighbor name joined
        by em dashes
    """
    result = session.run(
        get_dwpc_query(metapath),
        {'source': compound_id, 'target': disease_id, 'w': 0.4},
    )
    records = (record.values() for record in result)
    result_df = pandas.DataFrame(records, columns=result.keys())
    result_df.insert(0, 'metapath', metapath)

    # Scale first, then normalize, so that the arithmetic order is
    # (contribution * DWPC) / total.
    scaled_dwpc = contribution * result_df.DWPC
    result_df['prediction_contrib'] = scaled_dwpc / result_df.DWPC.sum()

    edge_label = '—' + metapath_to_first_metaedge[metapath] + '—'
    result_df['relationship'] = result_df.n0_name + edge_label + result_df.n1_name
    return result_df

In [7]:
# Disease whose predictions are decomposed (presumably epilepsy, per the
# notebook title — confirm the identifier).
disease_id = 'DOID:1826'
# Location of the per-prediction info.json files, pinned to one commit.
info_url_template = 'https://github.com/dhimmel/het.io-rep-data/raw/{commit}/prediction-info/{compound_id}/{disease_webid}/info.json'
# One DataFrame per compound, in the row order of top_compounds_df.
leadoff_dfs = list()
for compound_id in top_compounds_df.compound_id:
    url = info_url_template.format(
        commit = '1a960f0e353586f8fe9f61b569919f24603d4344',
        compound_id = compound_id,
        disease_webid = disease_id.replace(':', '_'),
    )
    response = requests.get(url)
    # Fail on a missing or unavailable info.json instead of trying to parse
    # an error page as JSON.
    response.raise_for_status()
    info = response.json()
    # Split each metapath's contribution across the first edges of its paths.
    dfs = [
        run_query(compound_id, disease_id, metapath, contribution)
        for metapath, contribution in info['metapath_contribution'].items()
    ]
    # Total per first edge across metapaths. The summed columns are selected
    # explicitly: relying on sum() to discard the string columns only works on
    # older pandas (from 2.0 on, they would be concatenated instead).
    leadoff_df = (
        pandas.concat(dfs)
        .groupby(['relationship', 'n1_id'])[['PC', 'prediction_contrib']]
        .sum()
        .reset_index()
        .sort_values('prediction_contrib', ascending=False)
    )
    leadoff_dfs.append(leadoff_df)

In [58]:
# Highest-contributing source edges for the compound at position 5 of
# `leadoff_dfs` (`head()` returns five rows by default).
leadoff_dfs[5].head()

Unnamed: 0,relationship,n1_id,PC,prediction_contrib
132,Lorazepam—includes—Benzodiazepines,N0000007542,4,0.266422
133,Lorazepam—palliates—panic disorder,DOID:594,2,0.111977
2,Lorazepam—binds—GABRG2,2566,208,0.082644
0,Lorazepam—binds—GABRA1,2554,243,0.071962
142,Lorazepam—resembles—Clonazepam,DB01068,7,0.067994


In [59]:
total_df = pandas.concat(leadoff_dfs)
# Replace the compound name (the text before the first em dash) with the
# generic label 'Compound' so identical edges pool across compounds.
generic_relationship = total_df.relationship.map(lambda x: 'Compound—' + x.split('—', 1)[1])
# Sum only the numeric columns, selected explicitly. On pandas >= 2.0 a bare
# sum() would also try to add up the string `relationship` column (which then
# collides with the group key on reset_index) and the mixed-type `n1_id`.
total_df = (
    total_df
    .groupby(generic_relationship)[['PC', 'prediction_contrib']]
    .sum()
    .reset_index()
    .sort_values('prediction_contrib', ascending=False)
)
total_df.head(5)

Unnamed: 0,relationship,PC,prediction_contrib
1437,Compound—includes—Decreased Central Nervous Sy...,238,6.340446
1429,Compound—includes—Benzodiazepines,52,3.84467
104,Compound—binds—GABRA1,12385,2.819158
1519,Compound—resembles—Diazepam,402,2.708444
1438,Compound—includes—General Anesthesia,6,2.45672


In [60]:
# Write the combined per-edge contributions as a tab-separated file (without
# the index), then display the number of rows in the table.
total_df.to_csv('data/source-edges-combined.tsv', sep='\t', index=False)
len(total_df)

1667

In [61]:
# Release the Neo4j resources opened earlier in the notebook: first the
# session, then the driver that created it (otherwise the driver's
# connections stay open until the kernel exits).
session.close()
driver.close()