# Compute DWPCs for all metapaths

In [1]:
import pandas
import random
import tqdm

import hetmech.degree_weight
from hetmech.hetmat import HetMat
import hetmech.degree_group

In [2]:
# Open the on-disk HetMat stored at the relative path '../data/hetionet-v1.0.hetmat/'.
hetmat = HetMat('../data/hetionet-v1.0.hetmat/')
# Override the module-level default DWWC method with dwwc_recursive.
# NOTE(review): presumably the dwpc calls below pick this default up -- confirm in hetmech.degree_weight.
hetmech.degree_weight.default_dwwc_method = hetmech.degree_weight.dwwc_recursive

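Extract all metapaths up to length 3 for the actual (unpermuted) hetnet. The path count (PC) and DWPC matrices for these metapaths are computed and saved to disk in the sections below.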

In [3]:
# Enumerate the metapaths of the metagraph up to length 3.
# NOTE(review): exclude_inverts=True presumably keeps a single orientation of each
# metapath/inverse pair -- confirm against the metagraph API.
metapaths = hetmat.metagraph.extract_all_metapaths(3, exclude_inverts=True)
# Debug switch: uncomment the two lines below to run the notebook on a seeded
# random sample of 2 metapaths instead of the full list.
# random.seed(1)
# metapaths = random.sample(metapaths, k=2)
# Display how many metapaths will be processed.
len(metapaths)

2205

In [4]:
# Build one [metapath string, category] record per metapath, then tabulate them.
# No column names are supplied, so the frame carries pandas' default integer labels.
categories = [
    [str(mp), hetmech.degree_weight.categorize(mp)]
    for mp in metapaths
]
df = pandas.DataFrame(data=categories)

In [7]:
import hetio

In [9]:
# Build the URL of the small disease-gene example graph from the hetio repository.
# The commit hash pins the file to one revision so the fixture cannot change underneath us.
url = 'https://github.com/dhimmel/hetio/raw/{}/{}'.format(
        '9dc747b8fc4e23ef3437829ffde4d047f2e1bdde',
        'test/data/disease-gene-example-graph.json',
    )
# Read the graph from that URL (requires network access).
graph = hetio.readwrite.read_graph(url)
# Resolve the abbreviation 'GiGiGiG' to a metapath object on the example graph's metagraph.
metapath = graph.metagraph.metapath_from_abbrev('GiGiGiG')

In [13]:
# Compute the DWPC matrix for the example metapath on the example graph.
# The first two returned values are discarded; no damping is passed, so the
# function's default applies.
_, _, mat = hetmech.degree_weight.dwpc(graph, metapath, dense_threshold=1)

In [12]:
# Convert the matrix to a dense array so the full result is shown as the cell output.
# NOTE(review): the entries on the order of 1e-17 in the recorded output look like
# floating-point residue rather than genuine non-zero values -- confirm.
mat.toarray()

array([[ 0.00000000e+00,  0.00000000e+00, -1.38777878e-17,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.25000000e-01,  0.00000000e+00,
         0.00000000e+00],
       [-1.38777878e-17,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.25000000e-01,  0.00000000e+00,
         0.00000000e+00],
       [ 0.00000000e+00,  1.25000000e-01,  0.00000000e+00,
         1.25000000e-01,  0.00000000e+00,  0.00000000e+00,
         1.25000000e-01],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.25000000e-01,  0.00000000e+00,
         0.0000000

### Compute path counts

In [5]:
%%time
hetmat.path_counts_cache = hetmech.hetmat.PathCountPriorityCache(hetmat, allocate_GB=16)
for metapath in tqdm.tqdm(metapaths):
    row, col, dwpc_matrix = hetmech.degree_weight.dwpc(hetmat, metapath, damping=0, dense_threshold=1, dtype='uint64')
    path = hetmat.get_path_counts_path(metapath, 'dwpc', 0.0, None)
    hetmech.hetmat.save_matrix(dwpc_matrix, path)
    del dwpc_matrix

  4%|▍         | 94/2205 [00:50<18:43,  1.88it/s]

KeyboardInterrupt: 

### Compute DWPCs

In [None]:
%%time
hetmat.path_counts_cache = hetmech.hetmat.PathCountPriorityCache(hetmat, allocate_GB=16)
mean_dwpcs = dict()
for metapath in tqdm.tqdm(metapaths):
    # Compute degree-weighted path counts
    row, col, dwpc_matrix = hetmech.degree_weight.dwpc(hetmat, metapath, damping=0.5, dense_threshold=1)
    mean_dwpcs[(metapath, 'dwpc', 0.5)] = dwpc_matrix.mean()
    path = hetmat.get_path_counts_path(metapath, 'dwpc', 0.5, None)
    hetmech.hetmat.save_matrix(dwpc_matrix, path)
    del dwpc_matrix


  0%|          | 0/2205 [00:00<?, ?it/s][A
  0%|          | 1/2205 [00:00<06:45,  5.44it/s][A
  0%|          | 6/2205 [00:00<01:54, 19.28it/s][A
Exception in thread Thread-4:
Traceback (most recent call last):
  File "/home/michael/.conda/envs/hetmech/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/michael/.conda/envs/hetmech/lib/python3.6/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/home/michael/.conda/envs/hetmech/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

  1%|          | 23/2205 [00:05<09:00,  4.04it/s]

In [None]:
# Detach the path-count cache from the actual hetnet now that its matrices are computed.
# NOTE(review): presumably this lets the cached matrices be garbage-collected -- confirm.
hetmat.path_counts_cache = None

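Use the mean DWPC of each metapath on the actual hetnet as the scaling factor (the means are held in `mean_dwpcs`, computed above). Then compute the degree-grouped statistics (DGP) for every metapath on every permutation.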

In [None]:
# For every permuted hetnet, compute the DWPC matrix of each metapath (damping 0.5),
# summarise it per (source_degree, target_degree) group, and write one TSV per
# metapath into the permutation's own directory.
#
# Requires `mean_dwpcs` to hold an entry for every metapath, i.e. the
# "Compute DWPCs" cell must have run to completion; otherwise the lookup below
# raises KeyError.
for permat in hetmat.permutations.values():
    permat.path_counts_cache = hetmech.hetmat.PathCountPriorityCache(permat, allocate_GB=16)
    try:
        for metapath in metapaths:
            _, _, matrix = hetmech.degree_weight.dwpc(permat, metapath, damping=0.5, dense_threshold=0.7)
            source_deg_to_ind, target_deg_to_ind = hetmech.degree_group.metapath_to_degree_dicts(permat, metapath)
            # Scale by the mean DWPC of the same metapath on the actual (unpermuted) hetnet.
            scaler = mean_dwpcs[(metapath, 'dwpc', 0.5)]
            row_generator = hetmech.degree_group.generate_degree_group_stats(
                source_deg_to_ind, target_deg_to_ind, matrix, scale=True, scaler=scaler)
            degree_stats_df = (
                pandas.DataFrame(row_generator)
                .set_index(['source_degree', 'target_degree'])
            )
            path = permat.directory.joinpath('degree-grouped-path-counts', 'dwpc-0.5', f'{metapath}.tsv')
            path.parent.mkdir(parents=True, exist_ok=True)
            degree_stats_df.to_csv(path, sep='\t')
    finally:
        # Always detach the cache, even when the run fails or is interrupted, so an
        # aborted permutation does not keep its cache attached while others are processed.
        permat.path_counts_cache = None

Combine the per-permutation DGP statistics of each metapath into multi-permutation summary metrics.

In [None]:
# For each metapath, add up the per-permutation degree-grouped statistics, derive
# summary metrics from the totals, and write the result into the actual hetnet's directory.
for metapath in metapaths:
    degree_stats_df = None
    for permat in hetmat.permutations.values():
        path = permat.directory.joinpath('degree-grouped-path-counts', 'dwpc-0.5', f'{metapath}.tsv')
        # Named `perm_stats_df` rather than `df` so this loop does not overwrite the
        # metapath-category DataFrame `df` created earlier in the notebook.
        perm_stats_df = (
            pandas.read_table(path)
            .set_index(['source_degree', 'target_degree'])
            # Each file contributes 1, so the summed column counts the permutations combined.
            .assign(n_perms=1)
        )
        if degree_stats_df is None:
            degree_stats_df = perm_stats_df
        else:
            # Element-wise sum aligned on (source_degree, target_degree). A degree pair
            # missing from any one file would turn into NaN in the total.
            degree_stats_df += perm_stats_df
    # Drop the raw accumulators once the summary metrics are computed. A chained,
    # non-mutating drop is used instead of inplace=True.
    degree_stats_df = (
        hetmech.degree_group.compute_summary_metrics(degree_stats_df)
        .drop(columns=['sum', 'sum_of_squares'])
    )
    path = hetmat.directory.joinpath('adjusted-path-counts', 'dwpc-0.5', 'degree-grouped-permutations', f'{metapath}.tsv')
    path.parent.mkdir(parents=True, exist_ok=True)
    degree_stats_df.to_csv(path, sep='\t')