# Prominent paths connecting Compounds to epilepsy

In [1]:
import math

import pandas
from neo4j import GraphDatabase
from tqdm.notebook import tqdm
import hetnetpy.readwrite
import hetnetpy.neo4j

from src.database_utils import get_db_connection

In [2]:
# Identifier matched against dj_hetmech_app_node.identifier in the query below
epilepsy_id = 'DOID:1826'

# Fetch path-count statistics (dwpc, p_value, metapath_id) for every node pair that
# involves epilepsy and has a Compound at one end.
# - The inner subquery finds pathcount rows where epilepsy is the source or the target
#   and at least one endpoint is a Compound, and attaches both node names.
# - The outer join then pulls the pathcount rows for those same pairs in either
#   orientation (source/target or target/source).
# - There is no LIMIT clause, so every matching row is returned, ordered by p_value.
# NOTE(review): epilepsy_id is interpolated directly into the SQL text; that is fine for
# this hard-coded constant, but switch to query parameters if it ever comes from user input.
query = f'''\
SELECT
    outer_pc.dwpc as dwpc,
    outer_pc.p_value as p_value,
    outer_pc.metapath_id as metapath_id,
    top_ids.source_name as source_name,
    top_ids.target_name as target_name
FROM (
    SELECT dwpc, p_value, metapath_id, source_id, target_id, n1.name AS source_name, n2.name AS target_name
    FROM dj_hetmech_app_pathcount pc
    JOIN dj_hetmech_app_node join_node
     ON pc.target_id=join_node.id OR pc.source_id=join_node.id
    JOIN dj_hetmech_app_node n1
     ON pc.source_id = n1.id
    JOIN dj_hetmech_app_node n2
     ON pc.target_id = n2.id
    WHERE join_node.identifier='{epilepsy_id}' AND (n1.metanode_id = 'Compound' OR n2.metanode_id = 'Compound')
    ORDER BY pc.p_value
) AS top_ids
JOIN dj_hetmech_app_pathcount outer_pc
ON (top_ids.source_id = outer_pc.source_id AND
    top_ids.target_id = outer_pc.target_id) OR
    (top_ids.source_id = outer_pc.target_id AND
    top_ids.target_id = outer_pc.source_id)
ORDER BY outer_pc.p_value;
'''

# Run the query and load the full result set into a DataFrame
with get_db_connection() as connection:
    top_metapaths = pandas.read_sql(query, connection)

In [3]:
pair_columns = ['source_name', 'metapath_id']
top_metapaths = (
    top_metapaths
    .sort_values(by=pair_columns)
    # Keep a single row for each (source_name, metapath_id) pair
    .drop_duplicates(subset=pair_columns)
    .sort_values(by='p_value')
    # Discard rows that contain any NaN
    .dropna()
)

# math.log10(0) raises, so p-values of exactly zero are replaced by the smallest
# nonzero p-value present before taking the logarithm
is_zero = top_metapaths['p_value'] == 0
min_p_value = top_metapaths.loc[~is_zero, 'p_value'].min()
top_metapaths.loc[is_zero, 'p_value'] = min_p_value
print(top_metapaths['p_value'].min())

# Larger neg_log_p_value means a more significant metapath
top_metapaths['neg_log_p_value'] = top_metapaths['p_value'].apply(lambda p: -math.log10(p))
top_metapaths.head()

3.1318111315557476e-17


Unnamed: 0,dwpc,p_value,metapath_id,source_name,target_name,neg_log_p_value
0,3.509434,3.1318110000000004e-17,CcSEcCtD,Nitrazepam,epilepsy syndrome,16.504204
19,3.296422,5.733828e-17,CcSEcCtD,Bromazepam,epilepsy syndrome,16.241555
31,3.579689,7.03284e-17,CcSEcCtD,Lorazepam,epilepsy syndrome,16.152869
52,3.369589,7.210640000000001e-17,CcSEcCtD,Phenobarbital,epilepsy syndrome,16.142026
68,3.346266,2.518406e-16,CcSEcCtD,Ezogabine,epilepsy syndrome,15.598874


In [4]:
# Load the Hetionet v1.0 metagraph definition; the URL is pinned to a specific commit of
# the hetio/hetionet repository so the metapath abbreviations stay reproducible
url = 'https://github.com/hetio/hetionet/raw/76550e6c93fbe92124edc71725e8c7dd4ca8b1f5/hetnet/json/hetionet-v1.0-metagraph.json'
metagraph = hetnetpy.readwrite.read_metagraph(url)

In [5]:
def get_paths_for_metapath(metagraph, row, driver=None, damping_exponent=.5):
    '''
    Return a DataFrame with one row per path for a given source, target, and metapath

    Each row holds the fields returned by the PDP query for one path, plus four
    columns added here: `metapath`, `metapath_importance`, `path_importance`
    (metapath_importance * percent_of_DWPC), and `source`.

    Parameters
    ----------
    metagraph : a hetnetpy.hetnet.Metagraph instance to interpret metapath abbreviations
    row : a row from a pandas dataframe with information about the given metapath, source, and target;
        must provide 'metapath_id', 'source_name', 'target_name', and 'neg_log_p_value'
    driver : optional neo4j driver to reuse across calls. When omitted, a driver for
        bolt://neo4j.het.io is created for this call and closed before returning.
        A driver passed in by the caller is never closed here.
    damping_exponent : value passed to the query as the `w` parameter

    Returns
    -------
    pandas.DataFrame, empty if the query returns no paths
    '''
    metapath_data = metagraph.metapath_from_abbrev(row['metapath_id'])

    query = hetnetpy.neo4j.construct_pdp_query(metapath_data, path_style='string', property='name')

    params = {
        'source': row['source_name'],
        'target': row['target_name'],
        'w': damping_exponent
    }

    # Only close the driver if it was created here; this function runs once per
    # metapath row, so leaving drivers open would leak one connection pool per call
    owns_driver = driver is None
    if owns_driver:
        driver = GraphDatabase.driver("bolt://neo4j.het.io")
    try:
        with driver.session() as session:
            metapath_result = session.run(query, params).data()
    finally:
        if owns_driver:
            driver.close()

    # Annotate each path record in place so the resulting columns keep this order
    for path in metapath_result:
        path['metapath'] = row['metapath_id']
        path['metapath_importance'] = row['neg_log_p_value']
        path['path_importance'] = path['metapath_importance'] * path['percent_of_DWPC']
        path['source'] = row['source_name']

    return pandas.DataFrame(metapath_result)

In [6]:
%%time
# For row in top_metapaths
result_list = []
for index, row in tqdm(top_metapaths.iterrows(), total=len(top_metapaths.index)):
    metapath_df = get_paths_for_metapath(metagraph, row)
    result_list.append(metapath_df)
result_df = pandas.concat(result_list, ignore_index=True)

  0%|          | 0/26281 [00:00<?, ?it/s]

CPU times: user 8min 47s, sys: 37.7 s, total: 9min 24s
Wall time: 2h 47min 25s


In [7]:
# Group paths by source and list each source's most important paths first;
# each key maps to whether that column is sorted ascending
sort_order = {'source': True, 'path_importance': False, 'metapath': True}
result_df = result_df.sort_values(by=list(sort_order), ascending=list(sort_order.values()))
result_df.head()

Unnamed: 0,path,PDP,percent_of_DWPC,metapath,metapath_importance,path_importance,source
1181715,Abacavir–ADK–epilepsy syndrome,0.0118,100.0,CbGaD,1.968488,196.848818,Abacavir
1133896,Abacavir–ADH6–telencephalon–epilepsy syndrome,0.000303,31.412493,CbGdAlD,2.186818,68.693397,Abacavir
1486191,Abacavir–Cladribine–FOSB–epilepsy syndrome,0.000369,70.246011,CrCuGaD,0.93622,65.765728,Abacavir
1590278,Abacavir–ADK–CASP2–epilepsy syndrome,0.000138,64.897153,CbGr>GaD,0.724127,46.993812,Abacavir
1133897,Abacavir–ADH6–medulla oblongata–epilepsy syndrome,0.000155,16.09054,CbGdAlD,2.186818,35.187079,Abacavir


In [8]:
# Save the ranked paths as a tab-separated file without the index, with floats written
# to five significant digits (pandas infers xz compression from the file extension)
result_df.to_csv(
    'data/epilepsy_paths.tsv.xz',
    sep='\t',
    index=False,
    float_format="%.5g",
)