In [1]:
%load_ext autoreload
%autoreload 2

import collections
import time

import numpy as np
import pandas as pd
import pytest
import tqdm

import hetmech.degree_weight
import hetmech.hetmat

In [2]:
# Four separate HetMat instances opened on the same directory, one per DWWC
# implementation being compared, so that there is no cache sharing
hetmat = hetmech.hetmat.HetMat('../data/hetionet-v1.0.hetmat/')
hetmat_rec = hetmech.hetmat.HetMat('../data/hetionet-v1.0.hetmat/')
hetmat_chain = hetmech.hetmat.HetMat('../data/hetionet-v1.0.hetmat/')
hetmat_chain_nomem = hetmech.hetmat.HetMat('../data/hetionet-v1.0.hetmat/')

In [3]:
# Rephetio metapaths: Compound-to-Disease metapaths extracted with max_length=4,
# dropping any that hetmech categorizes as 'long_repeat' or 'other'
metapaths = [
    metapath
    for metapath in hetmat.metagraph.extract_metapaths('Compound', 'Disease', max_length=4)
    if hetmech.degree_weight.categorize(metapath) not in {'long_repeat', 'other'}
]
len(metapaths)

1172

In [4]:
def equal_outputs(metapath, dense_threshold=0, dtype=np.float64):
    """
    Compute DWWC for one metapath with all four implementations, timing each,
    and check that the three alternatives agree with the original `dwwc`.

    Each implementation runs against its own module-level HetMat instance
    (`hetmat_rec`, `hetmat_chain`, `hetmat_chain_nomem`, `hetmat`).

    Parameters
    ----------
    metapath
        Metapath passed unchanged to every DWWC implementation.
    dense_threshold
        Forwarded to every implementation as `dense_threshold`.
    dtype
        Forwarded to every implementation as `dtype`.

    Returns
    -------
    list of float
        Elapsed seconds per implementation, in the order
        [recursive, chain, chain_nomem, original].

    Notes
    -----
    A disagreement does not raise: the offending metapath is printed and the
    timings are still returned.
    """
    # The original `dwwc` goes last; it is the reference the others are compared to.
    implementations = [
        (hetmech.degree_weight.dwwc_recursive, hetmat_rec),
        (hetmech.degree_weight.dwwc_chain, hetmat_chain),
        (hetmech.degree_weight.dwwc_chain_nomem, hetmat_chain_nomem),
        (hetmech.degree_weight.dwwc, hetmat),
    ]

    times = []
    outputs = []
    for compute, graph in implementations:
        # perf_counter is a monotonic, high-resolution clock, unlike time.time(),
        # which can jump if the system clock is adjusted mid-benchmark.
        start = time.perf_counter()
        outputs.append(compute(graph, metapath, dense_threshold=dense_threshold, dtype=dtype))
        times.append(time.perf_counter() - start)

    row_original, col_original, dwwc_original = outputs[-1]

    # Ensure identical outputs
    try:
        for rows, cols, matrix in outputs[:-1]:
            assert (rows == row_original) and (cols == col_original)
            # Use an absolute tolerance for every method: a relative tolerance
            # around an expected value of 0 scales to 0 and is not meaningful.
            assert abs(matrix - dwwc_original).max() == pytest.approx(0, abs=1e-7)
    except AssertionError:
        print(metapath)
    return times

In [5]:
# One list of four per-method timings for every metapath, with a progress bar
all_times = [
    equal_outputs(metapath, dense_threshold=1)
    for metapath in tqdm.tqdm(metapaths)
]

100%|██████████| 1172/1172 [07:38<00:00,  2.55it/s]


In [6]:
# One row per metapath; columns follow the order of the list returned by equal_outputs
df = pd.DataFrame(
    data=all_times,
    columns=['recursive', 'chain', 'chain_nomem', 'original'],
)

In [7]:
# Preview the first few rows of per-metapath timings (seconds per method)
df.head()

Unnamed: 0,recursive,chain,chain_nomem,original
0,0.014322,0.010649,0.009829,0.008086
1,0.004778,0.004608,0.004798,0.004352
2,0.011282,0.011545,0.010835,0.010757
3,0.013559,0.01231,0.01092,0.010578
4,0.029455,0.029159,0.028978,0.027812


In [8]:
# Mean time in seconds per metapath for each method
df.mean()

recursive      0.080130
chain          0.061009
chain_nomem    0.060359
original       0.186873
dtype: float64

In [9]:
# Total time in seconds across all metapaths for each method
df.sum()

recursive       93.912910
chain           71.502365
chain_nomem     70.740806
original       219.015376
dtype: float64

In [10]:
from scipy import stats

In [11]:
# Paired t-test on per-metapath timings: recursive vs. original.
# The statistic's sign follows mean(first - second), so negative means the first is faster on average.
stats.ttest_rel(df['recursive'], df['original'])

Ttest_relResult(statistic=-12.846961209523942, pvalue=1.9544230003882432e-35)

In [12]:
# Paired t-test on per-metapath timings: chain vs. original
stats.ttest_rel(df['chain'], df['original'])

Ttest_relResult(statistic=-14.917203945476476, pvalue=3.3515738364617213e-46)

In [13]:
# Paired t-test on per-metapath timings: chain vs. chain_nomem
stats.ttest_rel(df['chain'], df['chain_nomem'])

Ttest_relResult(statistic=6.358996893462328, pvalue=2.9050783090352536e-10)

## Example of typically long computations

In [14]:
%%timeit

# Baseline timing: original dwwc on the CbGeAeGaD metapath with dense_threshold=0
_ = hetmech.degree_weight.dwwc(hetmat, 'CbGeAeGaD', dense_threshold=0)

2.87 s ± 135 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
%%timeit

# Original dwwc on CbGeAeGaD with dense_threshold=1, for comparison with the dense_threshold=0 run
_ = hetmech.degree_weight.dwwc(hetmat, 'CbGeAeGaD', dense_threshold=1)

2.6 s ± 77.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
%%timeit

# dwwc_recursive on CbGeAeGaD with dense_threshold=0.
# NOTE(review): runs on the shared `hetmat`, not the dedicated `hetmat_rec` -- confirm
# that any caching on the HetMat instance does not skew this timing.
_ = hetmech.degree_weight.dwwc_recursive(hetmat, 'CbGeAeGaD', dense_threshold=0)

2.06 s ± 8.11 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
%%timeit

# dwwc_recursive on CbGeAeGaD with dense_threshold=1.
# NOTE(review): runs on the shared `hetmat`, not the dedicated `hetmat_rec` -- confirm
# that any caching on the HetMat instance does not skew this timing.
_ = hetmech.degree_weight.dwwc_recursive(hetmat, 'CbGeAeGaD', dense_threshold=1)

357 ms ± 4.24 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
%%timeit

# dwwc_chain on CbGeAeGaD with dense_threshold=0.
# NOTE(review): runs on the shared `hetmat`, not the dedicated `hetmat_chain` -- confirm
# that any caching on the HetMat instance does not skew this timing.
_ = hetmech.degree_weight.dwwc_chain(hetmat, 'CbGeAeGaD', dense_threshold=0)

2.08 s ± 13.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
%%timeit

# dwwc_chain on CbGeAeGaD with dense_threshold=1.
# NOTE(review): runs on the shared `hetmat`, not the dedicated `hetmat_chain` -- confirm
# that any caching on the HetMat instance does not skew this timing.
_ = hetmech.degree_weight.dwwc_chain(hetmat, 'CbGeAeGaD', dense_threshold=1)

292 ms ± 706 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
%%timeit

# dwwc_chain_nomem on CbGeAeGaD with dense_threshold=0.
# NOTE(review): runs on the shared `hetmat`, not the dedicated `hetmat_chain_nomem` --
# confirm that any caching on the HetMat instance does not skew this timing.
_ = hetmech.degree_weight.dwwc_chain_nomem(hetmat, 'CbGeAeGaD', dense_threshold=0)

2.12 s ± 91.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
%%timeit

# dwwc_chain_nomem on CbGeAeGaD with dense_threshold=1.
# NOTE(review): runs on the shared `hetmat`, not the dedicated `hetmat_chain_nomem` --
# confirm that any caching on the HetMat instance does not skew this timing.
_ = hetmech.degree_weight.dwwc_chain_nomem(hetmat, 'CbGeAeGaD', dense_threshold=1)

292 ms ± 2.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
