# Compute DWPCs for all metapaths

In [1]:
import numpy
import pandas
import pathlib
import scipy.sparse
import tqdm

import hetmech.degree_weight
from hetmech.hetmat import HetMat
import hetmech.hetmat.caching
import hetmech.matrix
import hetmech.degree_group
import pipeline

In [2]:
# Directory of the Hetionet v1.0 HetMat, given relative to this notebook.
hetmat_directory = '../../data/hetionet-v1.0.hetmat/'
hetmat = HetMat(hetmat_directory)

# Point the package-level default DWWC method at the recursive implementation.
dwwc_method = hetmech.degree_weight.dwwc_recursive
hetmech.degree_weight.default_dwwc_method = dwwc_method

Extract all metapaths starting from `Disease` with length up to 2. The cells below save the path count (PC) and DWPC matrices to disk for the actual (unpermuted) hetnet.

In [3]:
# Metapaths are enumerated outward from a single source metanode, up to a
# maximum length; the count is displayed as the cell output.
source_metanode = 'Disease'
longest_metapath = 2
metapaths = hetmat.metagraph.extract_metapaths(source_metanode, max_length=longest_metapath)
len(metapaths)

88

### Compute path counts

In [4]:
%%time
# Attach a fresh 16 GB priority cache to the HetMat before the loop.
path_counts_cache = hetmech.hetmat.caching.PathCountPriorityCache(hetmat, allocate_GB=16)
hetmat.path_counts_cache = path_counts_cache
for metapath in tqdm.tqdm(metapaths):
    # damping=0 with an unsigned integer dtype is used for this "path counts" step
    # (presumably no degree weighting is applied at damping 0 -- confirm in hetmech).
    # The call is made for its saving side effect; the returned matrix is discarded.
    row, col, dwpc_matrix = pipeline.compute_save_dwpc(
        hetmat,
        metapath,
        damping=0,
        dtype='uint64',
    )
    del dwpc_matrix

100%|██████████| 88/88 [00:00<00:00, 97.33it/s]

CPU times: user 844 ms, sys: 45.8 ms, total: 890 ms
Wall time: 916 ms





### Compute DWPCs

In [5]:
%%time
# Start from a fresh 16 GB priority cache for the damping=0.5 pass.
path_counts_cache = hetmech.hetmat.caching.PathCountPriorityCache(hetmat, allocate_GB=16)
hetmat.path_counts_cache = path_counts_cache

# Mean DWPC per metapath, keyed by (metapath, 'dwpc', damping); the DGP cell
# further down looks these values up by the same key.
mean_dwpcs = {}
for metapath in tqdm.tqdm(metapaths):
    row, col, dwpc_matrix = pipeline.compute_save_dwpc(hetmat, metapath, damping=0.5)
    matrix_mean = dwpc_matrix.mean()
    mean_dwpcs[(metapath, 'dwpc', 0.5)] = matrix_mean
    # Only the mean is kept in memory; drop the matrix reference before the next metapath.
    del dwpc_matrix

100%|██████████| 88/88 [00:01<00:00, 58.15it/s]

CPU times: user 1.32 s, sys: 88.8 ms, total: 1.41 s
Wall time: 1.52 s





In [6]:
# Detach the path count cache from the unpermuted HetMat now that its DWPCs are computed
# (presumably so the cached matrices can be garbage collected -- confirm against hetmech).
hetmat.path_counts_cache = None

### Generate degree-grouped permutation (DGP) files
One file per metapath per permutation.

In [7]:
# Write one degree-grouped TSV per (permutation, metapath) pair, skipping pairs
# whose output file already exists so the cell can be re-run cheaply.
for name, permat in tqdm.tqdm(hetmat.permutations.items()):
    permat.path_counts_cache = hetmech.hetmat.caching.PathCountPriorityCache(permat, allocate_GB=16)
    try:
        for metapath in metapaths:
            path = permat.directory.joinpath('degree-grouped-path-counts', 'dwpc-0.5', f'{metapath}.tsv')
            if path.exists():
                # Already produced by an earlier run.
                continue
            # The mean DWPC of the unpermuted network (stored in mean_dwpcs by the
            # DWPC cell) is passed in alongside the permuted network.
            degree_grouped_df = hetmech.degree_group.single_permutation_degree_group(
                permat, metapath, dwpc_mean=mean_dwpcs[(metapath, 'dwpc', 0.5)], damping=0.5)
            path.parent.mkdir(parents=True, exist_ok=True)
            degree_grouped_df.to_csv(path, sep='\t')
    finally:
        # Detach the cache even when a metapath fails or the run is interrupted,
        # so a 16 GB cache is never left attached to this permutation.
        permat.path_counts_cache = None

100%|██████████| 25/25 [00:00<00:00, 278.45it/s]


### Create multi-permutation DGP summary metrics.
One file per metapath

In [8]:
# All multi-permutation summaries share one output directory; only the file name varies.
dgp_summary_directory = hetmat.directory.joinpath(
    'adjusted-path-counts', 'dwpc-0.5', 'degree-grouped-permutations')
for metapath in metapaths:
    path = dgp_summary_directory.joinpath(f'{metapath}.tsv')
    if path.exists():
        # Summary already written by an earlier run; nothing to do.
        continue
    degree_stats_df = hetmech.degree_group.summarize_degree_grouped_permutations(
        hetmat, metapath, damping=0.5)
    path.parent.mkdir(parents=True, exist_ok=True)
    degree_stats_df.to_csv(path, sep='\t')

### Combine DWPC with DGP.
One file per metapath

In [9]:
%%time
# Every combined table is written to the same directory; only the file name varies.
adjusted_dwpc_directory = hetmat.directory.joinpath(
    'adjusted-path-counts', 'dwpc-0.5', 'adjusted-dwpcs')
for metapath in tqdm.tqdm(metapaths):
    # No existence check here: each run recomputes and rewrites every file.
    full_metapath_df = pipeline.combine_dwpc_dgp(hetmat, metapath, 0.5)
    path = adjusted_dwpc_directory.joinpath(f'{metapath}.tsv')
    path.parent.mkdir(parents=True, exist_ok=True)
    full_metapath_df.to_csv(path, sep='\t', index=False)

100%|██████████| 88/88 [24:40<00:00, 16.83s/it]

CPU times: user 23min 40s, sys: 47.6 s, total: 24min 28s
Wall time: 24min 40s



