# Predicting DWPC Query runtime ahead of time

In [19]:
import json

import matplotlib.pyplot
import pandas
import numpy
import seaborn
import mpld3

%matplotlib inline

In [20]:
# Load the metapath records (a JSON array of objects) from disk.
path = 'data/all-features/metapaths.json'
# JSON text is UTF-8 by specification; state the encoding explicitly so the
# read does not depend on the platform's locale default.
with open(path, encoding='utf-8') as fp:
    metapaths = json.load(fp)

In [21]:
# Read the tab-separated AUROC results table into a DataFrame.
auroc_df = pandas.read_csv('data/all-features/auroc.tsv', sep='\t')
# Preview the first two rows to check the columns that were loaded.
auroc_df.head(2)

Unnamed: 0,metapath,nonzero,seconds_per_query,auroc,auroc_permuted,delta_auroc,pval_auroc,length
0,CbGaD,0.312,0.0145,0.715,0.58,0.135,3e-06,2
1,CbGdD,0.149,0.0136,0.512,0.515,-0.00332,0.921,2


In [22]:
# Complexity measures to pull out of every metapath record.
cols = ['complexity_max', 'complexity_mean', 'midpoint_complexity_mean', 'midpoint_complexity_max']

# One row per record: the abbreviation first, then each measure in `cols` order.
rows = [[item[key] for key in ['abbreviation'] + cols] for item in metapaths]

# Attach the measures to the AUROC table. The merge uses the columns the two
# frames share; the abbreviation is stored under the name 'metapath'.
complexity_df = auroc_df.merge(
    pandas.DataFrame(rows, columns=['metapath'] + cols)
)

# Add the base-10 logarithm of the per-query runtime as a new column.
complexity_df['log10_seconds_per_query'] = numpy.log10(complexity_df.seconds_per_query)

In [23]:
# Preview the first two rows of the merged table.
complexity_df.iloc[:2]

Unnamed: 0,metapath,nonzero,seconds_per_query,auroc,auroc_permuted,delta_auroc,pval_auroc,length,complexity_max,complexity_mean,midpoint_complexity_mean,midpoint_complexity_max,log10_seconds_per_query
0,CbGaD,0.312,0.0145,0.715,0.58,0.135,3e-06,2,2.252853,1.395909,1.395909,2.252853,-1.838632
1,CbGdD,0.149,0.0136,0.512,0.515,-0.00332,0.921,2,2.167317,1.348874,1.348874,2.167317,-1.866461


## join on midpoint_complexity_max

In [24]:
# Scatter of log10 query runtime against midpoint_complexity_max with a LOWESS
# trend line; mpld3 renders it with hover tooltips naming each metapath.
fig, ax = matplotlib.pyplot.subplots(figsize=(10, 7))
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments. ci=None is the documented way to request no confidence band.
seaborn.regplot(
    x='midpoint_complexity_max', y='log10_seconds_per_query', data=complexity_df,
    lowess=True, ci=None, scatter_kws={'alpha': 0.5}, line_kws={'color': 'black'},
    ax=ax,
)
# First collection on the axes, taken to be the scatter points drawn by regplot.
points = ax.collections[0]
# NOTE(review): labels follow complexity_df row order; regplot drops rows with
# missing x/y by default, so this assumes no such rows exist - TODO confirm.
labels = complexity_df.metapath.tolist()
tooltip = mpld3.plugins.PointLabelTooltip(points, labels)
mpld3.plugins.connect(fig, tooltip)
mpld3.display(fig)

## join on midpoint_complexity_mean

In [25]:
# Scatter of log10 query runtime against midpoint_complexity_mean with a LOWESS
# trend line; mpld3 renders it with hover tooltips naming each metapath.
fig, ax = matplotlib.pyplot.subplots(figsize=(10, 7))
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments. ci=None is the documented way to request no confidence band.
seaborn.regplot(
    x='midpoint_complexity_mean', y='log10_seconds_per_query', data=complexity_df,
    lowess=True, ci=None, scatter_kws={'alpha': 0.5}, line_kws={'color': 'black'},
    ax=ax,
)
# First collection on the axes, taken to be the scatter points drawn by regplot.
points = ax.collections[0]
# NOTE(review): labels follow complexity_df row order; regplot drops rows with
# missing x/y by default, so this assumes no such rows exist - TODO confirm.
labels = complexity_df.metapath.tolist()
tooltip = mpld3.plugins.PointLabelTooltip(points, labels)
mpld3.plugins.connect(fig, tooltip)
mpld3.display(fig)

## join on complexity_mean

In [26]:
# Scatter of log10 query runtime against complexity_mean with a LOWESS trend
# line; mpld3 renders it with hover tooltips naming each metapath.
fig, ax = matplotlib.pyplot.subplots(figsize=(10, 7))
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments. ci=None is the documented way to request no confidence band.
seaborn.regplot(
    x='complexity_mean', y='log10_seconds_per_query', data=complexity_df,
    lowess=True, ci=None, scatter_kws={'alpha': 0.5}, line_kws={'color': 'black'},
    ax=ax,
)
# First collection on the axes, taken to be the scatter points drawn by regplot.
points = ax.collections[0]
# NOTE(review): labels follow complexity_df row order; regplot drops rows with
# missing x/y by default, so this assumes no such rows exist - TODO confirm.
labels = complexity_df.metapath.tolist()
tooltip = mpld3.plugins.PointLabelTooltip(points, labels)
mpld3.plugins.connect(fig, tooltip)
mpld3.display(fig)

## join on complexity_max

In [27]:
# Scatter of log10 query runtime against complexity_max with a LOWESS trend
# line; mpld3 renders it with hover tooltips naming each metapath.
fig, ax = matplotlib.pyplot.subplots(figsize=(10, 7))
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments. ci=None is spelled out so this call matches the notebook's other
# regplot cells; a LOWESS fit draws no confidence band either way.
seaborn.regplot(
    x='complexity_max', y='log10_seconds_per_query', data=complexity_df,
    lowess=True, ci=None, scatter_kws={'alpha': 0.5}, line_kws={'color': 'black'},
    ax=ax,
)
# First collection on the axes, taken to be the scatter points drawn by regplot.
points = ax.collections[0]
# NOTE(review): labels follow complexity_df row order; regplot drops rows with
# missing x/y by default, so this assumes no such rows exist - TODO confirm.
labels = complexity_df.metapath.tolist()
tooltip = mpld3.plugins.PointLabelTooltip(points, labels)
mpld3.plugins.connect(fig, tooltip)
mpld3.display(fig)