# Investigate query runtime, join indexes, and complexity estimation

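This notebook times each DWPC query under every possible join index (and once without a join hint), then compares the observed runtimes with the estimated join complexities.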

In [1]:
import json
import random
import time
import bz2

import numpy
import pandas
import py2neo

import hetio.neo4j
import hetio.readwrite

## Set up and execute neo4j queries

In [2]:
# Notebook parameters: how many metapaths to sample, the port of the
# neo4j server, and how many positive pairs to sample
args = dict(
    n_metapaths=75,
    port=7500,
    n_positives=150,
)

In [3]:
# Read the hetnet metagraph from a pinned commit of the integrate repository
commit = '1229536c6d2146c4cae97f045cf8cbdd272420f6'
url_template = 'https://github.com/dhimmel/integrate/raw/{}/data/metagraph.json'
url = url_template.format(commit)
metagraph = hetio.readwrite.read_metagraph(url)

In [4]:
# Load the metapath records from the all-features analysis
with open('../../all-features/data/metapaths.json') as read_file:
    metapaths = json.load(read_file)

# Draw a reproducible random subset of the records
random.seed(0)
metapaths = random.sample(metapaths, args['n_metapaths'])

# Index the sampled records by abbreviation and convert each one to a
# metapath object of the metagraph (same order as the sample)
abbrev_to_item = dict()
sampled_metapaths = list()
for item in metapaths:
    abbrev = item['abbreviation']
    abbrev_to_item[abbrev] = item
    sampled_metapaths.append(metagraph.metapath_from_abbrev(abbrev))
metapaths = sampled_metapaths

In [5]:
# Tabulate the estimated complexity of every join index of every sampled
# metapath, flagging the indexes recorded as optimal and as midpoint
index_columns = ['metapath', 'join_index', 'complexity', 'optimal_index', 'midpoint_index']
rows = [
    (abbrev, i, complexity, i == item['optimal_join_index'], i == item['midpoint_index'])
    for abbrev, item in abbrev_to_item.items()
    for i, complexity in enumerate(item['join_complexities'])
]

index_df = pandas.DataFrame(rows, columns=index_columns)
index_df.head(2)

Unnamed: 0,metapath,join_index,complexity,optimal_index,midpoint_index
0,CbGbCuGuD,0,2.318345,False,False
1,CbGbCuGuD,1,2.584564,False,False


In [6]:
# Build one DWPC query per (metapath, join hint) combination
rows = list()
for metapath in metapaths:
    # Hints to try: False plus every index from 0 through len(metapath).
    # False is the un-hinted case (later cells label it 'no_hint').
    join_hints = [False]
    join_hints.extend(range(len(metapath) + 1))
    # Randomize the order in which the hints of a metapath are queried
    random.shuffle(join_hints)
    for join_hint in join_hints:
        cypher = hetio.neo4j.construct_dwpc_query(metapath, join_hint=join_hint)
        rows.append((str(metapath), cypher, join_hint))
query_df = pandas.DataFrame(rows, columns=['metapath', 'query', 'join_index'])
query_df.head(2)

Unnamed: 0,metapath,query,join_index
0,CrCdGuCpD,MATCH path = (n0:Compound)-[:RESEMBLES_CrC]-(n...,2
1,CrCdGuCpD,MATCH path = (n0:Compound)-[:RESEMBLES_CrC]-(n...,4


In [7]:
# Connect to the local neo4j server on the configured port
uri_template = 'http://localhost:{}/db/data/'
uri = uri_template.format(args['port'])
neo = py2neo.Graph(uri)

In [8]:
# Retrieve the network positives: every compound-disease pair joined by a
# TREATS_CtD relationship
query = '''
MATCH (c:Compound)-[:TREATS_CtD]->(d:Disease)
RETURN d.name AS disease, c.name AS compound
'''
results = neo.cypher.execute(query)
all_pairs_df = pandas.DataFrame(results.records, columns=results.columns)
# Keep a reproducible random subset of the positives
pair_df = all_pairs_df.sample(args['n_positives'], random_state=0)

In [9]:
# Cross join pairs and queries: give both frames the same constant helper
# column, merge on it, and drop it from the result
for frame in (pair_df, query_df):
    frame['dummy'] = 1
query_df = pair_df.merge(query_df).drop('dummy', axis=1)
query_df.head(2)

Unnamed: 0,disease,compound,metapath,query,join_index
0,type 2 diabetes mellitus,Losartan,CrCdGuCpD,MATCH path = (n0:Compound)-[:RESEMBLES_CrC]-(n...,2
1,type 2 diabetes mellitus,Losartan,CrCdGuCpD,MATCH path = (n0:Compound)-[:RESEMBLES_CrC]-(n...,4


In [10]:
# Number of queries to run: one row per combination of sampled pair and
# (metapath, join index) query
len(query_df)

66600

In [11]:
%%time

# Perform queries: run every query once against neo4j and record its
# first result record together with the elapsed time
rows = list()
n_queries = len(query_df)
for query_row in query_df.itertuples(index=False):
    # perf_counter is a monotonic, high-resolution clock. time.time follows
    # the system clock, which can be adjusted during a long run and would
    # then yield wrong (even negative) durations.
    start = time.perf_counter()
    result = neo.cypher.execute(query_row.query, source=query_row.compound, target=query_row.disease, w=0.4)
    seconds = time.perf_counter() - start
    # Query columns, then the values of the first returned record, then the timing
    rows.append(list(query_row) + list(result.records[0]) + [seconds])
    print('Completed {:.4%}'.format(len(rows) / n_queries), end='\r')

# Column names of the returned values are taken from the last result
result_df = pandas.DataFrame(rows, columns=list(query_df.columns) + list(result.columns) + ['seconds'])

CPU times: user 2min 25s, sys: 15.7 s, total: 2min 40s
Wall time: 1d 3h 36min 32s


In [12]:
# Number of result rows collected by the query loop
len(rows)

66600

In [13]:
# Preview the first two rows of the query results
result_df.head(2)

Unnamed: 0,disease,compound,metapath,query,join_index,PC,DWPC,seconds
0,type 2 diabetes mellitus,Losartan,CrCdGuCpD,MATCH path = (n0:Compound)-[:RESEMBLES_CrC]-(n...,2,0,0.0,0.076564
1,type 2 diabetes mellitus,Losartan,CrCdGuCpD,MATCH path = (n0:Compound)-[:RESEMBLES_CrC]-(n...,4,0,0.0,0.08081


## Analyze query results

In [14]:
# Attach the estimated complexity of each join index to the timed queries.
# NOTE(review): the merge keys are the columns shared by both frames
# (metapath, join_index). Because False == 0 in Python, the un-hinted rows
# (join_index False) appear to match join index 0 of index_df and so carry
# its complexity and flags; confirm this is intended.
complexity_df = result_df.merge(index_df)
complexity_df = complexity_df.sort_values(['disease', 'compound', 'metapath', 'join_index'])
# The query text is no longer needed once the results are merged
complexity_df = complexity_df.drop('query', axis=1)
complexity_df['seconds'] = complexity_df['seconds'].astype(float)
# Base-10 logarithm of the runtime
complexity_df['log_seconds'] = numpy.log10(complexity_df['seconds'])
complexity_df.head(3)

Unnamed: 0,disease,compound,metapath,join_index,PC,DWPC,seconds,complexity,optimal_index,midpoint_index,log_seconds
19950,Barrett's esophagus,Esomeprazole,CbG<rGr>GaD,0,0,0.0,0.001178,3.913263,False,False,-2.928933
19951,Barrett's esophagus,Esomeprazole,CbG<rGr>GaD,False,0,0.0,0.001522,3.913263,False,False,-2.817635
20175,Barrett's esophagus,Esomeprazole,CbG<rGr>GaD,1,0,0.0,0.198892,4.171191,False,False,-0.701383


In [15]:
# Save the merged results as a bzip2-compressed, tab-separated text file
with bz2.open('data/query-results.tsv.bz2', mode='wt') as tsv_file:
    complexity_df.to_csv(tsv_file, sep='\t', float_format='%.5g', index=False)

In [16]:
# Row count after merging the results with the complexity estimates
len(complexity_df)

66600

In [27]:
# Summarize by query
def summary(df):
    """Summarize the join-index timings of a single query.

    ``df`` holds the rows of one group: one row per join index plus one row
    whose ``join_index`` is ``False`` (the run without a join hint). It must
    provide the columns ``join_index``, ``seconds``, ``complexity``,
    ``optimal_index`` and ``midpoint_index``.

    Returns a float Series with these labels:
      optimal_is_best, midpoint_is_best -- 1.0 when the fastest hinted run is
          the row flagged ``optimal_index`` / ``midpoint_index``, else 0.0
      rank_of_fastest -- 1-based position of the fastest hinted run when the
          hinted runs are ordered by increasing ``complexity``
      min_seconds -- runtime of the fastest hinted run
      delay_of_optimal, delay_of_midpoint, delay_of_nohint -- seconds by
          which the respective run exceeds ``min_seconds``

    Raises IndexError when the no-hint, optimal, or midpoint row is missing.
    """
    # Identity test on purpose: 0 == False, so an equality test would also
    # match join index 0
    no_hint = df.join_index.map(lambda x: x is False).astype(bool)
    no_hint_seconds = df[no_hint].seconds.iloc[0]
    # `~` is the documented operator for inverting a boolean mask
    hinted = df[~no_hint]

    # Position of the fastest hinted run (first one on ties)
    fastest = numpy.argmin(hinted.seconds.values)
    min_seconds = hinted.seconds.min()

    # Rank 1 is the join index with the lowest estimated complexity
    by_complexity = hinted.sort_values('complexity')
    rank_of_fastest = 1 + numpy.argmin(by_complexity.seconds.values)

    labels = [
        'optimal_is_best',
        'midpoint_is_best',
        'rank_of_fastest',
        'min_seconds',
        'delay_of_optimal',
        'delay_of_midpoint',
        'delay_of_nohint',
    ]
    values = [
        hinted.optimal_index.iloc[fastest],
        hinted.midpoint_index.iloc[fastest],
        rank_of_fastest,
        min_seconds,
        hinted.loc[hinted.optimal_index, 'seconds'].iloc[0] - min_seconds,
        hinted.loc[hinted.midpoint_index, 'seconds'].iloc[0] - min_seconds,
        no_hint_seconds - min_seconds,
    ]
    # Build the Series in one step with an explicit dtype instead of growing
    # an empty, dtype-less Series label by label
    return pandas.Series(values, index=labels, dtype=float)

# Apply the per-query summary to every (disease, compound, metapath) group
query_keys = ['disease', 'compound', 'metapath']
grouped_queries = complexity_df.groupby(query_keys)
query_summary_df = grouped_queries.apply(summary).reset_index()
# The indicator and rank columns hold whole numbers; store them as integers
for column in ('optimal_is_best', 'midpoint_is_best', 'rank_of_fastest'):
    query_summary_df[column] = query_summary_df[column].astype(int)
query_summary_df.head()

Unnamed: 0,disease,compound,metapath,optimal_is_best,midpoint_is_best,rank_of_fastest,min_seconds,delay_of_optimal,delay_of_midpoint,delay_of_nohint
0,Barrett's esophagus,Esomeprazole,CbG<rGr>GaD,0,0,4,0.001178,5.8e-05,0.000185,0.000344
1,Barrett's esophagus,Esomeprazole,CbGaDlAlD,0,1,2,0.002339,0.000865,0.0,0.020284
2,Barrett's esophagus,Esomeprazole,CbGbCuGuD,0,1,2,0.015859,0.085478,0.0,0.076385
3,Barrett's esophagus,Esomeprazole,CbGcG<rGaD,0,0,2,0.003073,0.004003,0.000934,0.008068
4,Barrett's esophagus,Esomeprazole,CbGcGbCpD,0,1,3,0.001308,0.001692,0.0,5.1e-05


In [28]:
# Fraction of queries in which each complexity rank was the fastest
# (rank 1 is the join index with the lowest estimated complexity; higher is harder)
query_summary_df.rank_of_fastest.value_counts(normalize=True)

1    0.292622
2    0.275378
3    0.169867
4    0.146933
5    0.115200
Name: rank_of_fastest, dtype: float64

In [29]:
# Mean of each numeric column; for the 0/1 indicator columns this is the
# fraction of queries in which the indicator is 1.
# numeric_only skips the text columns (disease, compound, metapath), which
# cannot be averaged and make a plain mean() raise on recent pandas.
query_summary_df.mean(numeric_only=True)

optimal_is_best      0.292356
midpoint_is_best     0.403289
rank_of_fastest      2.516711
min_seconds          0.222316
delay_of_optimal     0.117095
delay_of_midpoint    0.006333
delay_of_nohint      0.434539
dtype: float64

In [30]:
# Average the per-query summaries within each metapath. numeric_only leaves
# out the text columns (disease, compound), which cannot be averaged and
# make a plain groupby mean() raise on recent pandas.
metapath_summary_df = query_summary_df.groupby('metapath').mean(numeric_only=True).reset_index()
metapath_summary_df.head(2)

Unnamed: 0,metapath,optimal_is_best,midpoint_is_best,rank_of_fastest,min_seconds,delay_of_optimal,delay_of_midpoint,delay_of_nohint
0,CbG<rGr>GaD,0.04,0.706667,2.273333,0.056948,0.343988,0.005611,0.33951
1,CbGaDlAlD,0.24,0.753333,1.773333,0.009452,0.002628,0.000652,0.08313


In [43]:
def summary(df):
    """Extract the runtime of the optimal, midpoint, and un-hinted join.

    ``df`` holds the rows of one group with the columns ``join_index``,
    ``optimal_index``, ``midpoint_index`` and ``seconds``; the un-hinted row
    is the one whose ``join_index`` equals ``'no_hint'``.

    Returns a float Series labelled ``optimal_time``, ``midpoint_time`` and
    ``no_hint_time`` holding the ``seconds`` of the respective row.

    Raises IndexError when the optimal, midpoint, or no-hint row is missing.
    """
    is_no_hint = df.join_index == 'no_hint'
    # Look up the flagged rows among the hinted runs only, so that a no-hint
    # row carrying a True flag can never be taken for the flagged join index
    hinted = df[~is_no_hint]
    values = [
        hinted.loc[hinted.optimal_index, 'seconds'].iloc[0],
        hinted.loc[hinted.midpoint_index, 'seconds'].iloc[0],
        df.loc[is_no_hint, 'seconds'].iloc[0],
    ]
    labels = ['optimal_time', 'midpoint_time', 'no_hint_time']
    return pandas.Series(values, index=labels, dtype=float)

# Label the un-hinted runs, average the runtime of every join index within
# each metapath, then reduce each metapath to one summary row
labelled_df = complexity_df.replace({'join_index': {False: 'no_hint'}})
group_keys = ['metapath', 'join_index', 'optimal_index', 'midpoint_index']
mean_seconds_df = labelled_df.groupby(group_keys).seconds.mean().reset_index()
metapath_choice_df = mean_seconds_df.groupby('metapath').apply(summary).reset_index()
metapath_choice_df.head()

Unnamed: 0,metapath,optimal_time,midpoint_time,no_hint_time
0,CbG<rGr>GaD,0.400937,0.062559,0.397101
1,CbGaDlAlD,0.01208,0.010104,0.092532
2,CbGbCuGuD,0.140511,0.010958,0.140407
3,CbGcG<rGaD,0.013109,0.02864,0.012834
4,CbGcGbCpD,0.002972,0.002147,0.003165


In [44]:
# Mean runtime of each join choice across metapaths. numeric_only skips the
# text column (metapath), which makes a plain mean() raise on recent pandas.
metapath_choice_df.mean(numeric_only=True)

optimal_time     0.339411
midpoint_time    0.228649
no_hint_time     0.656398
dtype: float64

In [45]:
# Write each summary table to its own tab-separated file
summary_outputs = [
    (query_summary_df, 'data/query-summaries.tsv'),
    (metapath_summary_df, 'data/metapath-summaries.tsv'),
    (metapath_choice_df, 'data/index-choice-by-metapath.tsv'),
]
for summary_df, path in summary_outputs:
    summary_df.to_csv(path, sep='\t', index=False, float_format='%.4g')

In [None]:
# import seaborn
# %matplotlib inline
# ax = seaborn.lmplot(x='complexity', y='log_seconds', data=complexity_df, lowess=True, row='metapath', hue='join_index')