# Compute DWPCs for all metapaths

In [1]:
import itertools

import numpy
import pandas
import pathlib
import scipy.sparse
import tqdm

import hetmech.degree_weight
from hetmech.hetmat import HetMat
import hetmech.hetmat.caching
import hetmech.matrix
import hetmech.degree_group
import hetmech.pipeline

In [2]:
# Construct the HetMat for the Hetionet v1.0 data directory (relative path).
hetmat = HetMat('../../data/hetionet-v1.0.hetmat/')
# Make the recursive DWWC implementation the module-wide default method.
hetmech.degree_weight.default_dwwc_method = hetmech.degree_weight.dwwc_recursive

In [3]:
# Metapaths extracted for 'Disease' with max_length=2
metapaths = hetmat.metagraph.extract_metapaths('Disease', max_length=2)
# NOTE(review): the extracted list above is immediately replaced by this
# hard-coded pair, so only these two metapaths are processed by the cells
# below. Presumably a temporary restriction for testing -- confirm before
# running on the full set.
metapaths = ['CbGaD', 'GpBPpGiG']
len(metapaths)

2

### Compute path counts

Note: this should probably be made smarter, so that it does not read and then re-write the inverse of a metapath that already exists on disk.

In [4]:
%%time
# Attach a path-count cache (16 GB allocation) to the hetmat for this loop.
hetmat.path_counts_cache = hetmech.hetmat.caching.PathCountPriorityCache(hetmat, allocate_GB=16)
for metapath in tqdm.tqdm(metapaths):
    # damping=0 with an integer dtype -- presumably this yields plain
    # (unweighted) path counts; confirm against hetmech.degree_weight.dwpc.
    row_ids, col_ids, pc_matrix = hetmech.degree_weight.dwpc(hetmat, metapath, damping=0, dense_threshold=0.7, dtype='uint64')
    path = hetmat.get_path_counts_path(metapath, 'dwpc', 0, None)
    # Only write the matrix when nothing exists at the target path yet.
    if not path.exists():
        hetmech.hetmat.save_matrix(pc_matrix, path)
    # Drop the reference so the matrix can be freed before the next metapath.
    del pc_matrix

100%|██████████| 2/2 [05:14<00:00, 157.31s/it]

CPU times: user 5min 10s, sys: 3.32 s, total: 5min 13s
Wall time: 5min 14s





In [5]:
# Print the cache's statistics, then detach the cache from the hetmat.
print(hetmat.path_counts_cache.get_stats())
hetmat.path_counts_cache = None

PathCountPriorityCache containing 2 items
  total gets: 2
  cache hits: memory = 0, disk = 2, absent = 0
  2.06 GB in use of 16.00 GB allocated


### Compute DWPCs

In [6]:
%%time
# Attach a fresh path-count cache (16 GB allocation) to the hetmat for this loop.
hetmat.path_counts_cache = hetmech.hetmat.caching.PathCountPriorityCache(hetmat, allocate_GB=16)
# Mean of each DWPC matrix, keyed by (metapath, 'dwpc', damping); read again
# later when the permutation degree-group metrics are generated.
mean_dwpcs = dict()
for metapath in tqdm.tqdm(metapaths):
    row_ids, col_ids, dwpc_matrix = hetmech.degree_weight.dwpc(hetmat, metapath, damping=0.5, dense_threshold=0.7, dtype='float64')
    mean_dwpcs[(metapath, 'dwpc', 0.5)] = dwpc_matrix.mean()
    path = hetmat.get_path_counts_path(metapath, 'dwpc', 0.5, None)
    # Only write the matrix when nothing exists at the target path yet.
    if not path.exists():
        hetmech.hetmat.save_matrix(dwpc_matrix, path)
    # Drop the reference so the matrix can be freed before the next metapath.
    del dwpc_matrix

100%|██████████| 2/2 [05:22<00:00, 161.04s/it]

CPU times: user 5min 11s, sys: 8.44 s, total: 5min 19s
Wall time: 5min 22s





In [7]:
# Print the cache's statistics, then detach the cache from the hetmat.
print(hetmat.path_counts_cache.get_stats())
hetmat.path_counts_cache = None

PathCountPriorityCache containing 2 items
  total gets: 2
  cache hits: memory = 0, disk = 2, absent = 0
  2.06 GB in use of 16.00 GB allocated


### Generate running DGP metrics

In [8]:
# Running degree-grouped metrics are accumulated across permutations in one
# temporary pickle per metapath. Those pickles are only removed at the very end
# of this cell, so an interrupted earlier run leaves partial sums on disk.
# Remove any such leftovers first; otherwise this run would be added on top of
# them and the already-processed permutations would be counted twice.
for metapath in metapaths:
    stale_path = hetmat.get_running_degree_group_path(metapath, 'dwpc', 0.5, extension='.pkl')
    if stale_path.exists():
        stale_path.unlink()

for name, permat in tqdm.tqdm(hetmat.permutations.items()):
    # Each permuted hetmat gets its own cache while its metapaths are processed.
    permat.path_counts_cache = hetmech.hetmat.caching.PathCountPriorityCache(permat, allocate_GB=16)
    for metapath in metapaths:
        degree_grouped_df = hetmech.degree_group.single_permutation_degree_group(
            permat, metapath, dwpc_mean=mean_dwpcs[(metapath, 'dwpc', 0.5)], damping=0.5)
        # The accumulator lives under the unpermuted hetmat, not the permutation.
        path = hetmat.get_running_degree_group_path(metapath, 'dwpc', 0.5, extension='.pkl')
        if path.exists():
            # Add this permutation's values onto the running totals.
            running_df = pandas.read_pickle(path)
            running_df += degree_grouped_df
        else:
            # First permutation for this metapath: start the running totals.
            running_df = degree_grouped_df
        running_df.to_pickle(path)
    # Detach the cache before moving on to the next permutation.
    permat.path_counts_cache = None

# Replace .pkl files with .tsv.gz files.
for metapath in metapaths:
    old_path = hetmat.get_running_degree_group_path(metapath, 'dwpc', 0.5, extension='.pkl')
    df = pandas.read_pickle(old_path)
    new_path = hetmat.get_running_degree_group_path(metapath, 'dwpc', 0.5, extension='.tsv.gz')
    df.to_csv(new_path, sep='\t', compression='gzip')
    old_path.unlink()

100%|██████████| 200/200 [1:59:09<00:00, 35.75s/it]


### Combine DWPC with DGP & calculate p-values

In [6]:
def grouper(iterable, group_size):
    """
    Lazily split an iterable into consecutive chunks of at most group_size items.

    Each yielded chunk is itself a lazy iterator that draws from the same
    underlying iterator, so a chunk should be fully consumed before the next
    one is requested. The final chunk may be shorter than group_size, and an
    exhausted input yields no chunks at all.
    Derived from https://stackoverflow.com/a/8998040/4651668
    """
    source = iter(iterable)
    exhausted = object()  # unique marker: cannot collide with any real item
    while True:
        window = itertools.islice(source, group_size)
        # Pull one item eagerly to find out whether anything is left.
        first = next(window, exhausted)
        if first is exhausted:
            return
        # Put the eagerly-read item back in front of the rest of the window.
        yield itertools.chain((first,), window)

def grouped_tsv_writer(row_generator, path, group_size=20_000, sep='\t', index=False, **kwargs):
    """
    Stream an iterable of row dictionaries to a delimited text file in chunks.

    Rows are gathered into groups of up to group_size, and each group is
    converted to a DataFrame and written with DataFrame.to_csv. The first
    group creates or overwrites the file and writes the header; later groups
    are appended without a header. Any 'header' or 'mode' passed in kwargs is
    overridden; all other kwargs are forwarded to to_csv. If row_generator
    yields nothing, no file is written.
    """
    for chunk_number, rows in enumerate(grouper(row_generator, group_size=group_size)):
        is_first_chunk = chunk_number == 0
        kwargs['header'] = is_first_chunk
        kwargs['mode'] = 'w' if is_first_chunk else 'a'
        frame = pandas.DataFrame.from_records(rows)
        frame.to_csv(path, sep=sep, index=index, **kwargs)

In [None]:
%%time
for metapath in tqdm.tqdm(metapaths):
    # Rows for this metapath from combining DWPCs with the degree-grouped
    # permutation metrics; called with ignore_zeros=True and max_p_value=0.05.
    dwpcs_rows = hetmech.pipeline.combine_dwpc_dgp(hetmat, metapath, damping=0.5, ignore_zeros=True, max_p_value=0.05)
    path = hetmat.directory.joinpath('adjusted-path-counts', 'dwpc-0.5',
                                     'adjusted-dwpcs', f'{metapath}-filtered.tsv.gz')
    # This output location is assembled by hand here, and to_csv does not
    # create missing directories, so make sure the parent exists first.
    path.parent.mkdir(parents=True, exist_ok=True)
    grouped_tsv_writer(dwpcs_rows, path, float_format='%.7g', compression='gzip')

 50%|█████     | 1/2 [00:00<00:00,  1.31it/s]