# Compute DWPCs for all metapaths

In [1]:
import pathlib
import shutil
import zipfile

import pandas
import tqdm

import hetmech.degree_weight
from hetmech.hetmat import HetMat
import hetmech.hetmat.caching
import hetmech.degree_group
import hetmech.pipeline
from hetmech.hetmat.archive import create_archive, create_archive_by_globs

In [2]:
# Open the on-disk HetMat for Hetionet v1.0 (path is relative to this notebook's working directory).
hetmat = HetMat('../../data/hetionet-v1.0.hetmat/')
# Set hetmech.degree_weight's module-level default DWWC method to the recursive implementation.
hetmech.degree_weight.default_dwwc_method = hetmech.degree_weight.dwwc_recursive

In [3]:
# set DWPC damping exponent (passed as `damping` to the DWPC, degree-group, and
# archive steps in the cells below)
damping = 0.5

# if danger is True, delete existing path-counts, adjusted-path-counts, and archives
# before recomputing; the deletions below are all guarded by `if danger:`
danger = True

In [4]:
# All metapaths up to length 3
# NOTE(review): exclude_inverts=True presumably keeps only one of each
# metapath / inverse-metapath pair — confirm against the metagraph API.
metapaths = hetmat.metagraph.extract_all_metapaths(max_length=3, exclude_inverts=True)
# Show how many metapaths were extracted (cell output)
len(metapaths)

2205

In [5]:
# # Overwrite metapaths for testing purposes
# metapaths = ['DrD', 'SpDpS']
# metapaths = list(map(hetmat.metagraph.get_metapath, metapaths))
# metapaths

In [6]:
if danger:
    # danger: delete existing path-counts and adjusted-path-counts. Potentially not necessary
    # Both directories are cleared the same way: remove every entry directly beneath them.
    for pattern in ('path-counts/*', 'adjusted-path-counts/*'):
        for delete_dir in hetmat.directory.glob(pattern):
            shutil.rmtree(delete_dir)

### Compute path counts

Note: this step could be made smarter so that it does not read an existing on-disk metapath only to write out its inverse.

In [7]:
# Attach a priority cache (16 GB allocation) to the hetmat for the duration of this pass.
hetmat.path_counts_cache = hetmech.hetmat.caching.PathCountPriorityCache(hetmat, allocate_GB=16)
for metapath in tqdm.tqdm(metapaths):
    # damping=0 with an unsigned integer dtype: per the section heading, this pass
    # produces path counts rather than degree-weighted values.
    row_ids, col_ids, pc_matrix = hetmech.degree_weight.dwpc(
        hetmat,
        metapath,
        damping=0,
        dense_threshold=1,
        dtype='uint64',
    )
    path = hetmat.get_path_counts_path(metapath, 'dwpc', 0, None)
    # Only write when nothing is on disk at that path yet.
    needs_saving = not path.exists()
    if needs_saving:
        hetmech.hetmat.save_matrix(pc_matrix, path)
    # Drop the reference to the matrix before the next iteration.
    del pc_matrix

100%|██████████| 2205/2205 [6:49:11<00:00, 11.13s/it]


In [8]:
# Report the cache's usage statistics for the pass above.
print(hetmat.path_counts_cache.get_stats())
# Detach the cache from the hetmat (presumably so its memory can be reclaimed — confirm).
hetmat.path_counts_cache = None

PathCountPriorityCache containing 139 items
  total gets: 4,907
  cache hits: memory = 2,378, disk = 324, absent = 2,205
  15.45 GB in use of 16.00 GB allocated


### Compute DWPCs

In [9]:
# Attach a fresh priority cache (16 GB allocation) for the DWPC pass.
hetmat.path_counts_cache = hetmech.hetmat.caching.PathCountPriorityCache(hetmat, allocate_GB=16)
# Maps (metapath, 'dwpc', damping) -> mean of that metapath's DWPC matrix.
mean_dwpcs = {}
for metapath in tqdm.tqdm(metapaths):
    row_ids, col_ids, dwpc_matrix = hetmech.degree_weight.dwpc(
        hetmat,
        metapath,
        damping=damping,
        dense_threshold=0.7,
        dtype='float64',
    )
    path = hetmat.get_path_counts_path(metapath, 'dwpc', damping, None)
    # Record the matrix mean under the same key the degree-group step looks up.
    mean_dwpcs[(metapath, 'dwpc', damping)] = dwpc_matrix.mean()
    # Only write when nothing is on disk at that path yet.
    needs_saving = not path.exists()
    if needs_saving:
        hetmech.hetmat.save_matrix(dwpc_matrix, path)
    # Drop the reference to the matrix before the next iteration.
    del dwpc_matrix

100%|██████████| 2205/2205 [6:46:10<00:00, 11.05s/it]


In [10]:
# Report the cache's usage statistics for the DWPC pass above.
print(hetmat.path_counts_cache.get_stats())
# Detach the cache from the hetmat (presumably so its memory can be reclaimed — confirm).
hetmat.path_counts_cache = None

PathCountPriorityCache containing 139 items
  total gets: 4,907
  cache hits: memory = 2,378, disk = 324, absent = 2,205
  15.45 GB in use of 16.00 GB allocated


### Generate running DGP metrics

In [None]:
# Accumulate degree-grouped statistics across all permuted hetmats, one running
# DataFrame per metapath, persisted as a pickle between permutations.
# NOTE(review): the sums are accumulated in place on disk, so if this loop is
# interrupted and re-run, any leftover .pkl files are added to again.
for name, permat in tqdm.tqdm(hetmat.permutations.items()):
    # Each permutation gets its own cache for the duration of its metapath loop.
    permat.path_counts_cache = hetmech.hetmat.caching.PathCountPriorityCache(permat, allocate_GB=16)
    for metapath in metapaths:
        degree_grouped_df = hetmech.degree_group.single_permutation_degree_group(
            permat,
            metapath,
            dwpc_mean=mean_dwpcs[(metapath, 'dwpc', damping)],
            damping=damping,
        )
        path = hetmat.get_running_degree_group_path(metapath, 'dwpc', damping, extension='.pkl')
        path.parent.mkdir(parents=True, exist_ok=True)
        if not path.exists():
            # First permutation for this metapath: start the running total.
            running_df = degree_grouped_df
        else:
            # Later permutations: add onto the total saved so far.
            running_df = pandas.read_pickle(path)
            running_df += degree_grouped_df
        running_df.to_pickle(path)
    permat.path_counts_cache = None

# Replace .pkl files with .tsv.gz files.
for metapath in metapaths:
    old_path = hetmat.get_running_degree_group_path(metapath, 'dwpc', damping, extension='.pkl')
    new_path = hetmat.get_running_degree_group_path(metapath, 'dwpc', damping, extension='.tsv.gz')
    df = pandas.read_pickle(old_path)
    df.to_csv(new_path, sep='\t', compression='gzip')
    # The pickle is only an intermediate; remove it once the TSV is written.
    old_path.unlink()

 50%|████▉     | 99/200 [157:22:17<160:33:02, 5722.60s/it]

### Combine DWPC with DGP & calculate p-values

In [None]:
# Output directory for the filtered, adjusted DWPC tables (the same for every metapath).
adjusted_dwpc_dir = hetmat.directory.joinpath('adjusted-path-counts', f'dwpc-{damping}', 'adjusted-dwpcs')
for metapath in tqdm.tqdm(metapaths):
    dwpcs_rows = hetmech.pipeline.combine_dwpc_dgp(
        hetmat,
        metapath,
        damping=damping,
        ignore_zeros=True,
        max_p_value=0.01,
    )
    path = adjusted_dwpc_dir.joinpath(f'{metapath}-filtered.tsv.gz')
    path.parent.mkdir(parents=True, exist_ok=True)
    # Write the rows as a gzip-compressed TSV, formatting floats with '%.7g'.
    hetmech.pipeline.grouped_tsv_writer(dwpcs_rows, path, float_format='%.7g', compression='gzip')

## Create archive

In [None]:
# Group the metapaths by length; the archive step builds one set of archives per length.
length_to_metapaths = dict()
for metapath in metapaths:
    # Normalise each entry through the metagraph before grouping.
    metapath = hetmat.metagraph.get_metapath(metapath)
    length_to_metapaths.setdefault(len(metapath), []).append(metapath)
# Number of metapaths for each length
# A bare expression is only displayed when it is the last statement of a cell,
# so print the counts explicitly rather than computing and discarding them.
print({k: len(v) for k, v in length_to_metapaths.items()})
length_to_metapaths

In [None]:
# Display the full list of metapaths as this cell's output.
metapaths

In [None]:
# Configure archive options
# split_size is forwarded to create_archive / create_archive_by_globs below.
# NOTE(review): its unit is defined by those helpers — confirm before changing.
split_size = 50

In [None]:
# Directory that receives the zip archives (relative to the working directory).
archive_dir = pathlib.Path('archives')
if danger and archive_dir.exists():
    # Delete existing archives
    shutil.rmtree(archive_dir)
# Ensure the destination directory exists whether or not `danger` is set; previously
# it was only created inside the `danger` branch, leaving the destination paths
# below pointing into a missing directory when danger is False.
archive_dir.mkdir(exist_ok=True)

# Separate archives for each metapath length
for length, metapath_group in length_to_metapaths.items():
    # Create degree grouped archive
    # Source paths are given relative to the hetmat directory, which is passed
    # as root_directory.
    degree_grouped_paths = [
        hetmat.get_running_degree_group_path(metapath, 'dwpc', damping, extension='.tsv.gz').relative_to(hetmat.directory)
        for metapath in metapath_group
    ]
    create_archive(
        destination_path=f'archives/degree-grouped-perms_length-{length}_damping-{damping}.zip',
        root_directory=hetmat.directory,
        source_paths=degree_grouped_paths,
        # ZIP_STORED: add the files to the archive without further compression
        compression=zipfile.ZIP_STORED,
        split_size=split_size,
    )
    # Create PC archive
    # NOTE(review): the trailing '.*' presumably matches whichever file extension
    # the matrix was saved with (the path is requested with extension None) — confirm.
    pc_globs = [
        str(hetmat.get_path_counts_path(metapath, 'dwpc', 0, None).relative_to(hetmat.directory)) + '.*'
        for metapath in metapath_group
    ]
    create_archive_by_globs(
        # The file name hard-codes 'damping-0.0' for the damping=0 path counts.
        destination_path=f'archives/dwpcs_length-{length}_damping-0.0.zip',
        root_directory=hetmat.directory,
        include_globs=pc_globs,
        compression=zipfile.ZIP_STORED,
        split_size=split_size,
    )
    # Create DWPC archive
    dwpc_globs = [
        str(hetmat.get_path_counts_path(metapath, 'dwpc', damping, None).relative_to(hetmat.directory)) + '.*'
        for metapath in metapath_group
    ]
    create_archive_by_globs(
        destination_path=f'archives/dwpcs_length-{length}_damping-{damping}.zip',
        root_directory=hetmat.directory,
        include_globs=dwpc_globs,
        compression=zipfile.ZIP_STORED,
        split_size=split_size,
    )