# Compute DWPCs for all metapaths

In [1]:
import collections
import pathlib
import shutil
import tracemalloc
import zipfile

import numpy
import pandas
import tqdm

import hetmech.degree_weight
from hetmech.hetmat import HetMat
import hetmech.hetmat.caching
import hetmech.degree_group
import hetmech.pipeline
from hetmech.hetmat.archive import create_archive, create_archive_by_globs

In [2]:
# Open the HetMat stored at this path (relative to the notebook's working directory).
hetmat = HetMat('../../data/hetionet-v1.0.hetmat/')
# Make the recursive DWWC implementation the default method in hetmech.degree_weight.
hetmech.degree_weight.default_dwwc_method = hetmech.degree_weight.dwwc_recursive

In [3]:
# set DWPC damping exponent
damping = 0.5

# if danger is True, delete existing path-counts, adjusted-path-counts, and archives
danger = False
# if clear_adjusted_pcs is True, delete existing adjusted-path-counts even when danger is False
clear_adjusted_pcs = True

In [4]:
# All metapaths up to length 3
# NOTE(review): exclude_inverts=True presumably keeps a single orientation of each
# metapath / inverse-metapath pair — confirm against the metagraph implementation.
metapaths = hetmat.metagraph.extract_all_metapaths(max_length=3, exclude_inverts=True)
# Display how many metapaths were extracted.
len(metapaths)

2205

In [5]:
# # Overwrite metapaths for testing purposes
# metapaths = ['DrD', 'SpDpS']
# metapaths = list(map(hetmat.metagraph.get_metapath, metapaths))
# metapaths

In [6]:
# Summary statistics per metapath (keyed by metapath); populated by the
# path-count and DWPC loops further down.
metapath_to_stats = {}

In [7]:
# Work out which output trees have to be emptied before recomputing:
# path counts only in danger mode, adjusted path counts in danger mode or
# whenever clear_adjusted_pcs is set.
stale_globs = []
if danger:
    stale_globs.append('path-counts/*')
if danger or clear_adjusted_pcs:
    stale_globs.append('adjusted-path-counts/*')

# Remove every matching entry (each is expected to be a directory tree).
for stale_glob in stale_globs:
    for delete_dir in hetmat.directory.glob(stale_glob):
        shutil.rmtree(delete_dir)

### Compute path counts

Note: we probably want to make this step smarter, so that it does not read and then re-write the inverse of a metapath that already exists on disk.

In [8]:
# Compute raw path counts (DWPC with damping=0) for every metapath, save any
# matrix that is not on disk yet, and record summary statistics per metapath.
hetmat.path_counts_cache = hetmech.hetmat.caching.PathCountPriorityCache(hetmat, allocate_GB=16)
for metapath in tqdm.tqdm(metapaths):
    row_ids, col_ids, pc_matrix = hetmech.degree_weight.dwpc(
        hetmat,
        metapath,
        damping=0,
        dense_threshold=1,
        dtype='uint64',
    )
    path = hetmat.get_path_counts_path(metapath, 'dwpc', 0, None)
    if not path.exists():
        hetmech.hetmat.save_matrix(pc_matrix, path)
    # Number of (row, column) cells in the matrix; reused for the density below.
    n_pairs = numpy.prod(pc_matrix.shape)
    stats = collections.OrderedDict()
    stats['metapath'] = str(metapath)
    stats['length'] = len(metapath)
    stats['n_pairs'] = n_pairs
    stats['pc_density'] = pc_matrix.count_nonzero() / n_pairs
    stats['pc_mean'] = pc_matrix.mean()
    stats['pc_max'] = pc_matrix.max()
    metapath_to_stats[metapath] = stats
    # Drop the reference so the matrix can be freed before the next iteration.
    del pc_matrix

100%|██████████| 2205/2205 [6:42:33<00:00, 10.95s/it]  


In [9]:
# Report the cache statistics for the path-count pass, then detach the cache from the hetmat.
print(hetmat.path_counts_cache.get_stats())
hetmat.path_counts_cache = None

PathCountPriorityCache containing 103 items
  total gets: 2,205
  cache hits: memory = 0, disk = 2,205, absent = 0
  15.33 GB in use of 16.00 GB allocated


### Compute DWPCs

In [10]:
# Compute the damped DWPC matrix for every metapath, save any matrix that is
# not on disk yet, and record its mean alongside the path-count statistics.
hetmat.path_counts_cache = hetmech.hetmat.caching.PathCountPriorityCache(hetmat, allocate_GB=16)
raw_mean_key = f'dwpc-{damping}_raw_mean'
for metapath in tqdm.tqdm(metapaths):
    row_ids, col_ids, dwpc_matrix = hetmech.degree_weight.dwpc(
        hetmat,
        metapath,
        damping=damping,
        dense_threshold=0.7,
        dtype='float64',
    )
    path = hetmat.get_path_counts_path(metapath, 'dwpc', damping, None)
    if not path.exists():
        hetmech.hetmat.save_matrix(dwpc_matrix, path)
    # The stats entry for this metapath was created by the path-count loop.
    stats = metapath_to_stats[metapath]
    stats[raw_mean_key] = dwpc_matrix.mean()
    # Drop the reference so the matrix can be freed before the next iteration.
    del dwpc_matrix

100%|██████████| 2205/2205 [5:53:43<00:00,  9.62s/it]  


In [11]:
# Report the cache statistics for the DWPC pass, then detach the cache from the hetmat.
print(hetmat.path_counts_cache.get_stats())
hetmat.path_counts_cache = None

PathCountPriorityCache containing 103 items
  total gets: 2,205
  cache hits: memory = 0, disk = 2,205, absent = 0
  15.33 GB in use of 16.00 GB allocated


In [12]:
# Build a table from the collected statistics, one row per metapath.
metapath_df = pandas.DataFrame(list(metapath_to_stats.values()))
# Write it as a tab-separated file without the index; floats use 6 significant digits.
metapath_df.to_csv('metapath-dwpc-stats.tsv', sep='\t', index=False, float_format='%.6g')
# Preview the first two rows.
metapath_df.head(2)

Unnamed: 0,metapath,length,n_pairs,pc_density,pc_mean,pc_max,dwpc-0.5_raw_mean
0,AlD,1,55074,0.065403,0.065403,1,0.003746
1,AdG,1,8419890,0.012143,0.012143,1,7.8e-05


### Generate running DGP metrics

In [13]:
# Attempt to diagnose memory leak in https://github.com/greenelab/hetmech/issues/141
import tracemalloc
tracemalloc.start()
snapshots = [tracemalloc.take_snapshot()]

In [14]:
# For each permuted hetmat, compute the degree-grouped summary of every metapath
# and add it to a running total pickled at the path returned by
# hetmat.get_running_degree_group_path. A tracemalloc snapshot is taken after
# each permutation.
raw_mean_key = f'dwpc-{damping}_raw_mean'
for name, permat in tqdm.tqdm(hetmat.permutations.items()):
    permat.path_counts_cache = hetmech.hetmat.caching.PathCountPriorityCache(permat, allocate_GB=16)
    for metapath in metapaths:
        dwpc_mean = metapath_to_stats[metapath][raw_mean_key]
        degree_grouped_df = hetmech.degree_group.single_permutation_degree_group(
            permat,
            metapath,
            dwpc_mean=dwpc_mean,
            damping=damping,
        )
        path = hetmat.get_running_degree_group_path(metapath, 'dwpc', damping, extension='.pkl')
        path.parent.mkdir(parents=True, exist_ok=True)
        if not path.exists():
            # No running total yet: this permutation's table starts it.
            running_df = degree_grouped_df
        else:
            # Add this permutation's table to the total saved so far.
            running_df = pandas.read_pickle(path)
            running_df += degree_grouped_df
        running_df.to_pickle(path)
    permat.clear_caches()
    snapshots.append(tracemalloc.take_snapshot())

100%|██████████| 200/200 [360:43:33<00:00, 6493.07s/it]    


In [15]:
# https://docs.python.org/3/library/tracemalloc.html
malloc_stats = snapshots[-1].compare_to(snapshots[0], 'lineno')
for malloc_stat in malloc_stats[:6]:
    print(malloc_stat)

/home/dhimmel/anaconda3/envs/hetmech/lib/python3.6/pathlib.py:69: size=5133 KiB (+5133 KiB), count=201 (+201), average=25.5 KiB
/home/dhimmel/anaconda3/envs/hetmech/lib/python3.6/site-packages/pandas/io/pickle.py:166: size=153 KiB (+153 KiB), count=361 (+361), average=435 B
/home/dhimmel/anaconda3/envs/hetmech/lib/python3.6/pathlib.py:666: size=109 KiB (+109 KiB), count=1001 (+1001), average=112 B
/home/dhimmel/Documents/greene/hetmech/hetmech/degree_group.py:58: size=99.5 KiB (+99.5 KiB), count=186 (+186), average=548 B
/home/dhimmel/anaconda3/envs/hetmech/lib/python3.6/site-packages/pandas/core/indexes/multi.py:2683: size=97.6 KiB (+97.6 KiB), count=194 (+194), average=515 B
/home/dhimmel/anaconda3/envs/hetmech/lib/python3.6/pathlib.py:52: size=74.8 KiB (+74.8 KiB), count=1197 (+1197), average=64 B


In [16]:
# Replace .pkl files with .tsv.gz files.
for metapath in metapaths:
    old_path = hetmat.get_running_degree_group_path(metapath, 'dwpc', damping, extension='.pkl')
    df = pandas.read_pickle(old_path)
    new_path = hetmat.get_running_degree_group_path(metapath, 'dwpc', damping, extension='.tsv.gz')
    df.to_csv(new_path, sep='\t', compression='gzip')
    old_path.unlink()

### Combine DWPC with DGP & calculate p-values

In [None]:
# Directory that receives one filtered, adjusted-DWPC table per metapath.
adjusted_dwpc_dir = hetmat.directory.joinpath(
    'adjusted-path-counts', f'dwpc-{damping}', 'adjusted-dwpcs')
for metapath in tqdm.tqdm(metapaths):
    # Combine DWPCs with the degree-grouped permutation summary for this metapath;
    # rows are passed straight to the TSV writer below.
    dwpcs_rows = hetmech.pipeline.combine_dwpc_dgp(
        hetmat,
        metapath,
        damping=damping,
        ignore_zeros=True,
        max_p_value=0.01,
    )
    path = adjusted_dwpc_dir.joinpath(f'{metapath}-filtered.tsv.gz')
    path.parent.mkdir(parents=True, exist_ok=True)
    hetmech.pipeline.grouped_tsv_writer(dwpcs_rows, path, float_format='%.7g', compression='gzip')

## Create archive

In [17]:
# Bucket the metapaths by their length so each length can be archived separately.
length_to_metapaths = dict()
for metapath in metapaths:
    metapath = hetmat.metagraph.get_metapath(metapath)
    length = len(metapath)
    if length not in length_to_metapaths:
        length_to_metapaths[length] = []
    length_to_metapaths[length].append(metapath)
# Number of metapaths for each length
{length: len(group) for length, group in length_to_metapaths.items()}

{1: 24, 2: 242, 3: 1939}

In [18]:
# Configure archive options
split_size = None

In [19]:
# Directory (relative to the notebook's working directory) that receives all archives.
archive_dir = pathlib.Path('archives')
if danger and archive_dir.exists():
    # Delete existing archives
    shutil.rmtree(archive_dir)
# Always ensure the output directory exists. It used to be created only when
# danger was True, leaving a fresh run with danger = False without a directory
# for the archives written below.
archive_dir.mkdir(parents=True, exist_ok=True)

# Separate archives for each metapath length
for length, metapath_group in length_to_metapaths.items():
    # Create degree grouped archive
    # Paths are made relative to the hetmat directory, which is passed as root_directory.
    degree_grouped_paths = [
        hetmat.get_running_degree_group_path(metapath, 'dwpc', damping, extension='.tsv.gz').relative_to(hetmat.directory)
        for metapath in metapath_group
    ]
    create_archive(
        destination_path=f'archives/degree-grouped-perms_length-{length}_damping-{damping}.zip',
        root_directory=hetmat.directory,
        source_paths=degree_grouped_paths,
        compression=zipfile.ZIP_STORED,  # members are stored without zip compression
        split_size=split_size,
    )
    # Create PC archive
    # Path counts are the damping=0 matrices; '.*' matches whatever extension
    # the matrix file was saved with.
    pc_globs = [
        str(hetmat.get_path_counts_path(metapath, 'dwpc', 0, None).relative_to(hetmat.directory)) + '.*'
        for metapath in metapath_group
    ]
    create_archive_by_globs(
        destination_path=f'archives/dwpcs_length-{length}_damping-0.0.zip',
        root_directory=hetmat.directory,
        include_globs=pc_globs,
        compression=zipfile.ZIP_STORED,
        split_size=split_size,
    )
    # Create DWPC archive
    # Same as above, but for the matrices computed with the configured damping.
    dwpc_globs = [
        str(hetmat.get_path_counts_path(metapath, 'dwpc', damping, None).relative_to(hetmat.directory)) + '.*'
        for metapath in metapath_group
    ]
    create_archive_by_globs(
        destination_path=f'archives/dwpcs_length-{length}_damping-{damping}.zip',
        root_directory=hetmat.directory,
        include_globs=dwpc_globs,
        compression=zipfile.ZIP_STORED,
        split_size=split_size,
    )

In [21]:
archive_dir = pathlib.Path('archives')

# Concatenate the per-archive zip-info tables into one table per archive family.
# Each pair is (glob for the per-archive tables, path of the combined table).
zip_info_jobs = [
    ('dwpcs_*zip-info.tsv', 'archives/dwpcs.zip-info.tsv'),
    ('degree-grouped-perms_*zip-info.tsv', 'archives/degree-grouped-perms.zip-info.tsv'),
]
for info_glob, combined_path in zip_info_jobs:
    # Sorted so the rows of the combined table come out in a stable order.
    dfs = [pandas.read_table(path) for path in sorted(archive_dir.glob(info_glob))]
    info_df = pandas.concat(dfs)
    info_df.to_csv(combined_path, sep='\t', index=False)

In [22]:
# Move metapath-dwpc-stats.tsv to archives
# shutil.move replaces the bare `mv` shell command, which is not valid Python
# and only runs through IPython's automagic shell aliases.
shutil.move('metapath-dwpc-stats.tsv', 'archives/metapath-dwpc-stats.tsv')