In [1]:
%load_ext autoreload
%autoreload 2

import collections
import time

import numpy as np
import pandas as pd
import pytest
import tqdm

import hetmech.degree_weight
import hetmech.hetmat

In [2]:
# Four independent HetMat instances (one per DWWC implementation benchmarked below)
# so that there is no cache sharing between the methods being compared
hetmat, hetmat_rec, hetmat_chain, hetmat_chain_nomem = (
    hetmech.hetmat.HetMat('../data/hetionet-v1.0.hetmat/') for _ in range(4)
)

In [3]:
# Rephetio metapaths: Compound-to-Disease metapaths up to length 4, leaving out
# any that categorize() labels as 'long_repeat' or 'other'
metapaths = [
    metapath
    for metapath in hetmat.metagraph.extract_metapaths('Compound', 'Disease', max_length=4)
    if hetmech.degree_weight.categorize(metapath) not in {'long_repeat', 'other'}
]
len(metapaths)

1172

In [4]:
def equal_outputs(metapath, dense_threshold=0, dtype=np.float64):
    """
    Compute DWWC for `metapath` with all four implementations, time each one,
    and check that they produce the same output.

    The implementations run in the order recursive, chain, chain_nomem,
    original, each against its own module-level HetMat instance
    (`hetmat_rec`, `hetmat_chain`, `hetmat_chain_nomem`, `hetmat`).

    Parameters
    ----------
    metapath
        Metapath passed unchanged to every DWWC function.
    dense_threshold
        Forwarded unchanged to every DWWC function.
    dtype
        Forwarded unchanged to every DWWC function.

    Returns
    -------
    list of float
        Elapsed seconds, ordered [recursive, chain, chain_nomem, original].

    Notes
    -----
    A disagreement between implementations does not raise: the metapath is
    printed and the timings are still returned, so that a long benchmark
    loop is not interrupted by a single mismatch.
    """
    # Maximum allowed element-wise absolute difference from the original DWWC.
    # An absolute tolerance is required: a *relative* tolerance around an
    # expected value of 0 is itself 0, which would leave only pytest's much
    # stricter default absolute tolerance in effect.
    tolerance = 1e-7

    # perf_counter is monotonic and high-resolution, unlike the wall clock
    start = time.perf_counter()

    # Recursive
    row_rec, col_rec, dwwc_rec = hetmech.degree_weight.dwwc_recursive(
        hetmat_rec, metapath, dense_threshold=dense_threshold, dtype=dtype)
    rec_done = time.perf_counter()

    # Chain ordering I
    row_chain, col_chain, dwwc_chain = hetmech.degree_weight.dwwc_chain(
        hetmat_chain, metapath, dense_threshold=dense_threshold, dtype=dtype)
    chain_done = time.perf_counter()

    # Chain ordering II
    row_chain_nomem, col_chain_nomem, dwwc_chain_nomem = hetmech.degree_weight.dwwc_chain_nomem(
        hetmat_chain_nomem, metapath, dense_threshold=dense_threshold, dtype=dtype)
    chain_nomem_done = time.perf_counter()

    # Original DWWC method (the reference the others are compared against)
    row_original, col_original, dwwc_original = hetmech.degree_weight.dwwc(
        hetmat, metapath, dense_threshold=dense_threshold, dtype=dtype)
    original_done = time.perf_counter()

    # Timings are taken before the comparisons so the checks are not measured
    times = [
        rec_done - start,
        chain_done - rec_done,
        chain_nomem_done - chain_done,
        original_done - chain_nomem_done,
    ]

    candidates = [
        (row_rec, col_rec, dwwc_rec),
        (row_chain, col_chain, dwwc_chain),
        (row_chain_nomem, col_chain_nomem, dwwc_chain_nomem),
    ]

    # Ensure identical labels and (near-)identical matrices for every method
    try:
        for rows, cols, matrix in candidates:
            assert (rows == row_original) and (cols == col_original)
            assert abs(matrix - dwwc_original).max() == pytest.approx(0, abs=tolerance)
    except AssertionError:
        # Best-effort report: flag the metapath but keep the benchmark going
        print(metapath)
    return times

In [5]:
# Benchmark and cross-check every selected metapath; each entry holds the
# four per-method timings returned by equal_outputs
all_times = [
    equal_outputs(metapath, dense_threshold=1)
    for metapath in tqdm.tqdm(metapaths)
]

100%|██████████| 1172/1172 [07:33<00:00,  2.58it/s]


In [6]:
# One row per metapath; the column order mirrors the order of the timings list
# returned by equal_outputs
df = pd.DataFrame(all_times, columns=['recursive', 'chain', 'chain_nomem', 'original'])

In [7]:
# Preview the first few per-metapath timings (seconds)
df.head()

Unnamed: 0,recursive,chain,chain_nomem,original
0,0.012709,0.013652,0.010368,0.008819
1,0.004764,0.00465,0.004457,0.004288
2,0.011034,0.010399,0.010551,0.010422
3,0.013813,0.011681,0.011337,0.010604
4,0.030713,0.029939,0.030824,0.029727


In [8]:
# Mean time per metapath for each method (seconds)
df.mean()

recursive      0.077462
chain          0.059921
chain_nomem    0.059876
original       0.186503
dtype: float64

In [9]:
# Total time over all benchmarked metapaths for each method (seconds)
df.sum()

recursive       90.785415
chain           70.227411
chain_nomem     70.174735
original       218.581609
dtype: float64

In [10]:
from scipy import stats

In [11]:
# Paired t-test: every metapath was timed with both methods, so the two samples
# are related rather than independent
stats.ttest_rel(df['recursive'], df['original'])

Ttest_relResult(statistic=-13.032249141497845, pvalue=2.3695538411337612e-36)

In [12]:
# Paired t-test of the chain-ordered method against the original DWWC timings
stats.ttest_rel(df['chain'], df['original'])

Ttest_relResult(statistic=-14.941800042886044, pvalue=2.458548903130906e-46)

In [13]:
# Paired t-test between the two chain-ordered variants
stats.ttest_rel(df['chain'], df['chain_nomem'])

Ttest_relResult(statistic=0.5015696846074758, pvalue=0.616064529574426)

## Example of typically long computations

In [14]:
%%timeit

# Baseline: original dwwc on a single length-4 metapath, dense_threshold=0.
# NOTE(review): this and the following %%timeit cells all reuse `hetmat`, whereas
# the comparison above used a separate HetMat per method to avoid cache sharing —
# confirm that caching does not skew these numbers.
_ = hetmech.degree_weight.dwwc(hetmat, 'CbGeAeGaD', dense_threshold=0)

2.76 s ± 29.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
%%timeit

# Original dwwc on the same metapath, now with dense_threshold=1
_ = hetmech.degree_weight.dwwc(hetmat, 'CbGeAeGaD', dense_threshold=1)

2.54 s ± 6.59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
%%timeit

# dwwc_recursive on the same metapath, dense_threshold=0
_ = hetmech.degree_weight.dwwc_recursive(hetmat, 'CbGeAeGaD', dense_threshold=0)

2.08 s ± 14.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
%%timeit

# dwwc_recursive on the same metapath, dense_threshold=1
_ = hetmech.degree_weight.dwwc_recursive(hetmat, 'CbGeAeGaD', dense_threshold=1)

357 ms ± 3.91 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
%%timeit

# dwwc_chain on the same metapath, dense_threshold=0
_ = hetmech.degree_weight.dwwc_chain(hetmat, 'CbGeAeGaD', dense_threshold=0)

2.08 s ± 16.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
%%timeit

# dwwc_chain on the same metapath, dense_threshold=1
_ = hetmech.degree_weight.dwwc_chain(hetmat, 'CbGeAeGaD', dense_threshold=1)

293 ms ± 1.29 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
%%timeit

# dwwc_chain_nomem on the same metapath, dense_threshold=0
_ = hetmech.degree_weight.dwwc_chain_nomem(hetmat, 'CbGeAeGaD', dense_threshold=0)

2.05 s ± 12.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
%%timeit

# dwwc_chain_nomem on the same metapath, dense_threshold=1
_ = hetmech.degree_weight.dwwc_chain_nomem(hetmat, 'CbGeAeGaD', dense_threshold=1)

290 ms ± 3.01 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Use the fastest method over all metapaths <= length 4

In [22]:
# Time dwwc_chain_nomem on every metapath up to length 4 in the metagraph.
# NOTE: the recorded run of this cell took roughly 11 hours (see the tqdm output).
times = []
for metapath in tqdm.tqdm(hetmat.metagraph.extract_all_metapaths(max_length=4)):
    # perf_counter is monotonic, so a run this long is unaffected by any
    # system clock adjustment that happens while it executes
    start_time = time.perf_counter()
    hetmech.degree_weight.dwwc_chain_nomem(hetmat, metapath, dense_threshold=1)
    times.append([metapath, time.perf_counter() - start_time])

100%|██████████| 38894/38894 [10:59:45<00:00,  1.02s/it]


In [23]:
# One row per metapath with the elapsed time of its dwwc_chain_nomem call
dwwc_df = pd.DataFrame(times, columns=['metapath', 'dwwc-time'])