In [1]:
%load_ext autoreload
%autoreload 2

import time

import numpy as np
import pandas as pd
import pytest
import tqdm

import hetmech.degree_weight
import hetmech.hetmat

In [2]:
# Three hetmats so that there is no cache sharing
hetmat = hetmech.hetmat.HetMat('../data/hetionet-v1.0.hetmat/')
hetmat.path_counts_cache = hetmech.hetmat.PathCountPriorityCache(hetmat, allocate_GB=5)

hetmat_nocache = hetmech.hetmat.HetMat('../data/hetionet-v1.0.hetmat/')

hetmat_rec = hetmech.hetmat.HetMat('../data/hetionet-v1.0.hetmat/')
hetmat_rec.path_counts_cache = hetmech.hetmat.PathCountPriorityCache(hetmat_rec, allocate_GB=5)

hetmat_rec_nocache = hetmech.hetmat.HetMat('../data/hetionet-v1.0.hetmat/')

hetmat_chain = hetmech.hetmat.HetMat('../data/hetionet-v1.0.hetmat/')
hetmat_rec.path_counts_cache = hetmech.hetmat.PathCountPriorityCache(hetmat_chain, allocate_GB=5)

hetmat_chain_nocache = hetmech.hetmat.HetMat('../data/hetionet-v1.0.hetmat/')

In [3]:
# Rephetio metapaths
metapaths = list()
for metapath in hetmat.metagraph.extract_metapaths('Compound', 'Disease', max_length=4):
    if hetmech.degree_weight.categorize(metapath) in {'long_repeat', 'other'}:
        continue
    metapaths.append(metapath)
len(metapaths)

1172

In [4]:
def equal_outputs(metapath, dense_threshold=0, dtype=np.float64):
    """Compute DWWC using all three functions, and ensure that they give the same output."""
    time1 = time.time()
    
    # Original DWWC method with cache
    row_original, col_original, dwwc_original = hetmech.degree_weight.dwwc(
        hetmat, metapath, dense_threshold=dense_threshold, dtype=dtype)
    time2 = time.time()
    
    # Original DWWC method no cache
    row_original, col_original, dwwc_original = hetmech.degree_weight.dwwc(
        hetmat_nocache, metapath, dense_threshold=dense_threshold, dtype=dtype)
    time3 = time.time()
    
    # Recursive with cache
    row_rec, col_rec, dwwc_rec = hetmech.degree_weight.dwwc_recursive(
        hetmat_rec, metapath, dense_threshold=dense_threshold, dtype=dtype)
    time4 = time.time()
    
    # Recursive no cache
    row_rec, col_rec, dwwc_rec = hetmech.degree_weight.dwwc_recursive(
        hetmat_rec_nocache, metapath, dense_threshold=dense_threshold, dtype=dtype)
    time5 = time.time()
    
    # Chain ordering with cache
    row_chain, col_chain, dwwc_chain = hetmech.degree_weight.dwwc_chain(
        hetmat_chain, metapath, dense_threshold=dense_threshold, dtype=dtype)
    time6 = time.time()
    
    # Chain ordering no cache
    row_chain, col_chain, dwwc_chain = hetmech.degree_weight.dwwc_chain(
        hetmat_chain_nocache, metapath, dense_threshold=dense_threshold, dtype=dtype)
    time7 = time.time()
    
    # Metapath \ abbrev \ dwwc \ dwwc (nocache) \ recursive \ recursive (nocache) \ chain ordering \ chain ordering (nocache)
    times = [metapath, str(metapath), time2 - time1, time3 - time2, time4 - time3, time5 - time4, time6 - time5, time7 - time6]
    
    # Ensure identical outputs
    try:
        assert (row_rec == row_original) and (row_chain == row_original) and \
            (col_rec == col_original) and (col_chain == col_original)
        assert abs(dwwc_rec - dwwc_original).max() == pytest.approx(0, abs=1e-7)
        assert abs(dwwc_chain - dwwc_original).max() == pytest.approx(0, abs=1e-7)
    except AssertionError:
        print(metapath)
    return times

In [5]:
all_times = []

for metapath in tqdm.tqdm(metapaths):
    times = equal_outputs(metapath, dense_threshold=1)
    all_times.append(times)

100%|██████████| 1172/1172 [12:11<00:00,  1.60it/s]


In [6]:
df = pd.DataFrame(all_times, columns=['metapath', 'abbrev', 'dwwc', 'dwwc-nocache', 'recursive', 'recursive-nocache', 'chain', 'chain-nocache'])

In [7]:
df.to_csv('../data/rephetio-DWWCs-hetmat-runtime.tsv', sep='\t')

In [8]:
df.head()

Unnamed: 0,metapath,abbrev,dwwc,dwwc-nocache,recursive,recursive-nocache,chain,chain-nocache
0,(Compound - treats - Disease),CtD,0.024447,0.009796,0.014815,0.010053,0.005186,0.009526
1,(Compound - palliates - Disease),CpD,0.005504,0.004712,0.005456,0.004842,0.004865,0.004966
2,"(Compound - binds - Gene, Gene - upregulates -...",CbGuD,0.036345,0.041072,0.047547,0.032535,0.017528,0.030719
3,"(Compound - binds - Gene, Gene - associates - ...",CbGaD,0.022552,0.019919,0.021518,0.020633,0.020253,0.019373
4,"(Compound - binds - Gene, Gene - downregulates...",CbGdD,0.021729,0.019922,0.019714,0.01902,0.018035,0.020231


In [9]:
df.mean()

dwwc                 0.196255
dwwc-nocache         0.193026
recursive            0.030355
recursive-nocache    0.078141
chain                0.062251
chain-nocache        0.061979
dtype: float64

In [10]:
df.sum()

dwwc                 230.010670
dwwc-nocache         226.226659
recursive             35.575594
recursive-nocache     91.581720
chain                 72.958479
chain-nocache         72.639084
dtype: float64

In [11]:
from scipy import stats

In [12]:
stats.ttest_rel(df['dwwc-nocache'], df['dwwc'])

Ttest_relResult(statistic=-7.717528950433064, pvalue=2.5291152148560213e-14)

In [13]:
stats.ttest_rel(df['chain-nocache'], df['chain'])

Ttest_relResult(statistic=-2.4569034904213938, pvalue=0.014158274961216835)

## Example of typically long computations: Cache and no cache

In [14]:
%%timeit

_ = hetmech.degree_weight.dwwc(hetmat, 'CbGeAeGaD', dense_threshold=1)

1.82 ms ± 220 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [15]:
%%timeit

_ = hetmech.degree_weight.dwwc(hetmat_nocache, 'CbGeAeGaD', dense_threshold=1)

2.52 s ± 3.34 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
%%timeit

_ = hetmech.degree_weight.dwwc_recursive(hetmat, 'CbGeAeGaD', dense_threshold=1)

1.29 ms ± 9.04 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [17]:
%%timeit

_ = hetmech.degree_weight.dwwc_recursive(hetmat_nocache, 'CbGeAeGaD', dense_threshold=1)

342 ms ± 2.18 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
%%timeit

_ = hetmech.degree_weight.dwwc_chain(hetmat, 'CbGeAeGaD', dense_threshold=1)

274 ms ± 1.37 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
%%timeit

_ = hetmech.degree_weight.dwwc_chain(hetmat_nocache, 'CbGeAeGaD', dense_threshold=1)

274 ms ± 1.54 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Find the fastest method over all metapaths <= length 4

Manually stopped after 19 hours due to excessive expected runtime. It appears that `dwwc_chain` remains the fastest way to compute bulk metapaths.

In [None]:
hetmat_chain = hetmech.hetmat.HetMat('../data/hetionet-v1.0.hetmat/')
hetmat_rec = hetmech.hetmat.HetMat('../data/hetionet-v1.0.hetmat/')
hetmat_rec.path_counts_cache = hetmech.hetmat.PathCountPriorityCache(hetmat, allocate_GB=10)

times = [] 
for metapath in tqdm.tqdm(hetmat.metagraph.extract_all_metapaths(max_length=4)):
    recursive_cached = False
    if hetmat_rec.path_counts_cache.get(metapath, 'dwwc', 0.5):
        recursive_cached = True
        
    time1 = time.time() 
    hetmech.degree_weight.dwwc_chain(hetmat_chain, metapath, dense_threshold=1)
    time2 = time.time()
    hetmech.degree_weight.dwwc_recursive(hetmat_rec, metapath, dense_threshold=1)
    time3 = time.time()
    
    # metapath, chain time, recursive time, rec-was-cached
    times.append([metapath, time2 - time1, time3 - time2, recursive_cached])

In [21]:
dwwc_df = pd.DataFrame(times, columns=['metapath', 'dwwc-chain', 'dwwc-recursive', 'recursive_cached'])
dwwc_df.to_csv('../data/all-paths-DWWC-times.tsv', sep='\t')

In [22]:
dwwc_df.head()

Unnamed: 0,metapath,dwwc-chain,dwwc-recursive,recursive_cached
0,(Disease - treats - Compound),0.011341,0.012895,False
1,(Disease - presents - Symptom),0.006949,0.007117,False
2,(Disease - upregulates - Gene),0.019622,0.019295,False
3,(Disease - associates - Gene),0.008441,0.008628,False
4,(Disease - downregulates - Gene),0.00777,0.008748,False


In [29]:
dwwc_df.sum()

dwwc-chain          14552.158924
dwwc-recursive      55674.636784
recursive_cached      193.000000
dtype: float64