In [1]:
%load_ext autoreload
%autoreload 2

import time

import numpy as np
import pandas as pd
import pytest
import tqdm

import hetmech.degree_weight
import hetmech.hetmat
import hetmech.hetmat.caching

In [2]:
# Three hetmats so that there is no cache sharing
hetmat = hetmech.hetmat.HetMat('../data/hetionet-v1.0.hetmat/')
hetmat.path_counts_cache = hetmech.hetmat.caching.PathCountPriorityCache(hetmat, allocate_GB=5)

hetmat_nocache = hetmech.hetmat.HetMat('../data/hetionet-v1.0.hetmat/')

hetmat_rec = hetmech.hetmat.HetMat('../data/hetionet-v1.0.hetmat/')
hetmat_rec.path_counts_cache = hetmech.hetmat.caching.PathCountPriorityCache(hetmat_rec, allocate_GB=5)

hetmat_rec_nocache = hetmech.hetmat.HetMat('../data/hetionet-v1.0.hetmat/')

hetmat_chain = hetmech.hetmat.HetMat('../data/hetionet-v1.0.hetmat/')
hetmat_rec.path_counts_cache = hetmech.hetmat.caching.PathCountPriorityCache(hetmat_chain, allocate_GB=5)

hetmat_chain_nocache = hetmech.hetmat.HetMat('../data/hetionet-v1.0.hetmat/')

In [3]:
# Rephetio metapaths
metapaths = list()
for metapath in hetmat.metagraph.extract_metapaths('Compound', 'Disease', max_length=4):
    if hetmech.degree_weight.categorize(metapath) in {'long_repeat', 'other'}:
        continue
    metapaths.append(metapath)
len(metapaths)

1172

In [4]:
def equal_outputs(metapath, dense_threshold=0, dtype=np.float64):
    """Compute DWWC using all three functions, and ensure that they give the same output."""
    time1 = time.time()
    
    # Original DWWC method with cache
    row_original, col_original, dwwc_original = hetmech.degree_weight.dwwc(
        hetmat, metapath, dense_threshold=dense_threshold, dtype=dtype)
    time2 = time.time()
    
    # Original DWWC method no cache
    row_original, col_original, dwwc_original = hetmech.degree_weight.dwwc(
        hetmat_nocache, metapath, dense_threshold=dense_threshold, dtype=dtype)
    time3 = time.time()
    
    # Recursive with cache
    row_rec, col_rec, dwwc_rec = hetmech.degree_weight.dwwc_recursive(
        hetmat_rec, metapath, dense_threshold=dense_threshold, dtype=dtype)
    time4 = time.time()
    
    # Recursive no cache
    row_rec, col_rec, dwwc_rec = hetmech.degree_weight.dwwc_recursive(
        hetmat_rec_nocache, metapath, dense_threshold=dense_threshold, dtype=dtype)
    time5 = time.time()
    
    # Chain ordering with cache
    row_chain, col_chain, dwwc_chain = hetmech.degree_weight.dwwc_chain(
        hetmat_chain, metapath, dense_threshold=dense_threshold, dtype=dtype)
    time6 = time.time()
    
    # Chain ordering no cache
    row_chain, col_chain, dwwc_chain = hetmech.degree_weight.dwwc_chain(
        hetmat_chain_nocache, metapath, dense_threshold=dense_threshold, dtype=dtype)
    time7 = time.time()
    
    # Metapath \ abbrev \ dwwc \ dwwc (nocache) \ recursive \ recursive (nocache) \ chain ordering \ chain ordering (nocache)
    times = [metapath, str(metapath), time2 - time1, time3 - time2, time4 - time3, time5 - time4, time6 - time5, time7 - time6]
    
    # Ensure identical outputs
    try:
        assert (row_rec == row_original) and (row_chain == row_original) and \
            (col_rec == col_original) and (col_chain == col_original)
        assert abs(dwwc_rec - dwwc_original).max() == pytest.approx(0, abs=1e-7)
        assert abs(dwwc_chain - dwwc_original).max() == pytest.approx(0, abs=1e-7)
    except AssertionError:
        print(metapath)
    return times

In [5]:
all_times = []

for metapath in tqdm.tqdm(metapaths):
    times = equal_outputs(metapath, dense_threshold=1)
    all_times.append(times)

100%|██████████| 1172/1172 [11:26<00:00,  1.71it/s]


In [6]:
df = pd.DataFrame(all_times, columns=['metapath', 'abbrev', 'dwwc', 'dwwc-nocache', 'recursive', 'recursive-nocache', 'chain', 'chain-nocache'])

In [7]:
df.to_csv('../data/rephetio-DWWCs-hetmat-runtime.tsv', sep='\t', index=False, float_format='%.5g')

In [8]:
df.head()

Unnamed: 0,metapath,abbrev,dwwc,dwwc-nocache,recursive,recursive-nocache,chain,chain-nocache
0,(Compound - treats - Disease),CtD,0.012113,0.010203,0.015093,0.009698,0.005038,0.009103
1,(Compound - palliates - Disease),CpD,0.005205,0.004499,0.00495,0.005305,0.004784,0.004607
2,"(Compound - binds - Gene, Gene - upregulates -...",CbGuD,0.02866,0.027946,0.044667,0.031032,0.015608,0.029849
3,"(Compound - binds - Gene, Gene - associates - ...",CbGaD,0.019062,0.018711,0.020574,0.020411,0.019298,0.019199
4,"(Compound - binds - Gene, Gene - downregulates...",CbGdD,0.022728,0.017297,0.018483,0.01762,0.019113,0.016835


In [9]:
df.mean()

dwwc                 0.175063
dwwc-nocache         0.169251
recursive            0.030078
recursive-nocache    0.077846
chain                0.065509
chain-nocache        0.065507
dtype: float64

In [10]:
df.sum()

dwwc                 205.173711
dwwc-nocache         198.362262
recursive             35.251375
recursive-nocache     91.235952
chain                 76.777119
chain-nocache         76.773913
dtype: float64

In [11]:
from scipy import stats

In [12]:
stats.ttest_rel(df['dwwc-nocache'], df['dwwc'])

Ttest_relResult(statistic=-36.79049850804164, pvalue=1.461763495606208e-197)

In [13]:
stats.ttest_rel(df['chain-nocache'], df['chain'])

Ttest_relResult(statistic=-0.0363549819336121, pvalue=0.9710055058688127)

## Example of typically long computations: Cache and no cache

In [14]:
%%timeit

_ = hetmech.degree_weight.dwwc(hetmat, 'CbGeAeGaD', dense_threshold=1)

1.61 ms ± 6.92 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [15]:
%%timeit

_ = hetmech.degree_weight.dwwc(hetmat_nocache, 'CbGeAeGaD', dense_threshold=1)

2.32 s ± 11.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
%%timeit

_ = hetmech.degree_weight.dwwc_recursive(hetmat, 'CbGeAeGaD', dense_threshold=1)

1.62 ms ± 15.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [17]:
%%timeit

_ = hetmech.degree_weight.dwwc_recursive(hetmat_nocache, 'CbGeAeGaD', dense_threshold=1)

357 ms ± 11.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
%%timeit

_ = hetmech.degree_weight.dwwc_chain(hetmat, 'CbGeAeGaD', dense_threshold=1)

1.72 ms ± 112 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [19]:
%%timeit

_ = hetmech.degree_weight.dwwc_chain(hetmat_nocache, 'CbGeAeGaD', dense_threshold=1)

324 ms ± 2.14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Find the fastest method over all metapaths <= length 4

Manually stopped after 19 hours due to excessive expected runtime. It appears that `dwwc_chain` remains the fastest way to compute bulk metapaths.

In [20]:
hetmat_chain = hetmech.hetmat.HetMat('../data/hetionet-v1.0.hetmat/')
hetmat_rec = hetmech.hetmat.HetMat('../data/hetionet-v1.0.hetmat/')
hetmat_rec.path_counts_cache = hetmech.hetmat.caching.PathCountPriorityCache(hetmat_rec, allocate_GB=10)

times = [] 
for metapath in tqdm.tqdm(hetmat.metagraph.extract_all_metapaths(max_length=4)):
    recursive_cached = False
    if hetmat_rec.path_counts_cache.get(metapath, 'dwwc', 0.5):
        recursive_cached = True
        
    time1 = time.time() 
    hetmech.degree_weight.dwwc_chain(hetmat_chain, metapath, dense_threshold=1)
    time2 = time.time()
    hetmech.degree_weight.dwwc_recursive(hetmat_rec, metapath, dense_threshold=1)
    time3 = time.time()
    
    # metapath, chain time, recursive time, rec-was-cached
    times.append([metapath, time2 - time1, time3 - time2, recursive_cached])

100%|██████████| 38894/38894 [35:00:58<00:00,  3.24s/it]   


In [21]:
dwwc_df = pd.DataFrame(times, columns=['metapath', 'dwwc-chain', 'dwwc-recursive', 'recursive_cached'])
dwwc_df.to_csv('../explore/all-paths-DWWC-times.tsv', sep='\t', index=False, float_format='%.5g')

In [22]:
dwwc_df.head()

Unnamed: 0,metapath,dwwc-chain,dwwc-recursive,recursive_cached
0,(Anatomy - upregulates - Gene),0.036298,0.037633,False
1,(Anatomy - downregulates - Gene),0.017841,0.017111,False
2,(Anatomy - localizes - Disease),0.00854,0.009807,False
3,(Anatomy - expresses - Gene),0.051133,0.040807,False
4,"(Anatomy - upregulates - Gene, Gene - upregula...",0.035236,0.032447,False


In [23]:
dwwc_df.sum()

dwwc-chain          31940.143505
dwwc-recursive      94078.729804
recursive_cached      166.000000
dtype: float64