# Compute DWPCs for all metapaths

In [19]:
# Enable IPython's autoreload extension so edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [92]:
import pathlib
import random
import zipfile

import numpy
import pandas
import scipy.sparse
import tqdm

import hetmech.degree_group
import hetmech.degree_weight
from hetmech.hetmat import HetMat

In [2]:
# Load the HetMat stored at ../data/hetionet-v1.0.hetmat/ (the unpermuted hetnet).
hetmat = HetMat('../data/hetionet-v1.0.hetmat/')
# Make dwwc_recursive the module-wide default DWWC implementation for every later dwpc call.
hetmech.degree_weight.default_dwwc_method = hetmech.degree_weight.dwwc_recursive

All metapaths up to length 2 starting from Disease, saving path count (PC) and DWPC matrices on disk -- actual (unpermuted) hetnet.

In [3]:
# Metapaths extracted for 'Disease' with max_length=2
# (presumably Disease is the source metanode -- confirm against extract_metapaths).
metapaths = hetmat.metagraph.extract_metapaths('Disease', max_length=2)
# Debugging aid: uncomment the next two lines to run on a small, reproducible random subset.
# random.seed(1)
# metapaths = random.sample(metapaths, k=2)
len(metapaths)

88

In [4]:
def compute_save_dwpc(graph, metapath, damping=0.5, dense_threshold=1, dtype='float64', approx_ok=True):
    """
    Return ``(row, col, dwpc_matrix)`` for `metapath`, reusing a matrix on disk when possible.

    The function first looks for a previously saved matrix, for the metapath
    itself and then for its inverse, under either supported file extension.
    If none can be read, the DWPC matrix is computed with
    ``hetmech.degree_weight.dwpc`` and written to disk with
    ``hetmech.hetmat.save_matrix``.

    Parameters
    ----------
    graph : object providing ``get_path_counts_path`` and ``read_path_counts``
        (a HetMat in this notebook).
    metapath : metapath object exposing ``.inverse``.
    damping : damping exponent forwarded to the read / compute calls.
    dense_threshold, dtype : forwarded unchanged to ``hetmech.degree_weight.dwpc``.
    approx_ok : accepted for interface compatibility; not used by this function.

    Returns
    -------
    tuple ``(row, col, dwpc_matrix)`` oriented for `metapath`.
    """
    extensions = ('.sparse.npz', '.npy')
    for inverted in (False, True):
        mp = metapath.inverse if inverted else metapath
        # Always start from the extension-less base path of the orientation being
        # checked, so suffixes never pile up (e.g. 'x.sparse.npz.npy').
        base_path = graph.get_path_counts_path(mp, 'dwpc', damping, None)
        candidates = (pathlib.Path(f'{base_path}{ext}') for ext in extensions)
        if not any(candidate.exists() for candidate in candidates):
            continue
        try:
            row, col, dwpc_matrix = graph.read_path_counts(mp, 'dwpc', damping)
        except zipfile.BadZipFile:
            # Corrupt archive on disk: try the other orientation, then recompute.
            continue
        if inverted:
            # NOTE(review): assumes read_path_counts returns the matrix oriented for the
            # metapath it was asked for, so the inverse result is transposed back here.
            return col, row, dwpc_matrix.T
        return row, col, dwpc_matrix
    row, col, dwpc_matrix = hetmech.degree_weight.dwpc(graph, metapath, damping=damping,
                                                       dense_threshold=dense_threshold, dtype=dtype)
    # Save through `graph` (not the notebook-global hetmat) so the function works for any graph.
    path = graph.get_path_counts_path(metapath, 'dwpc', damping, None)
    hetmech.hetmat.save_matrix(dwpc_matrix, path)
    return row, col, dwpc_matrix

### Compute path counts

In [5]:
%%time
# Attach a fresh PathCountPriorityCache (allocate_GB=16) to the hetmat for this pass.
hetmat.path_counts_cache = hetmech.hetmat.PathCountPriorityCache(hetmat, allocate_GB=16)
for metapath in tqdm.tqdm(metapaths):
    # damping=0 with an unsigned integer dtype: per the section heading these are plain path counts.
    row, col, dwpc_matrix = compute_save_dwpc(hetmat, metapath, damping=0, dtype='uint64')
    # The returned matrix is not used in this cell; release it before the next iteration.
    del dwpc_matrix

100%|██████████| 88/88 [00:00<00:00, 120.05it/s]

CPU times: user 692 ms, sys: 39.4 ms, total: 731 ms
Wall time: 736 ms





### Compute DWPCs

In [6]:
%%time
# Start from a fresh cache for the damping=0.5 pass.
hetmat.path_counts_cache = hetmech.hetmat.PathCountPriorityCache(hetmat, allocate_GB=16)
# Maps (metapath, 'dwpc', 0.5) -> mean of that metapath's DWPC matrix on the unpermuted hetnet.
# These means are looked up later as the `scaler` for the permuted-network statistics.
mean_dwpcs = dict()
for metapath in tqdm.tqdm(metapaths):
    # Compute degree-weighted path counts
    row, col, dwpc_matrix = compute_save_dwpc(hetmat, metapath, damping=0.5)
    mean_dwpcs[(metapath, 'dwpc', 0.5)] = dwpc_matrix.mean()
    # Only the mean is kept; release the matrix before the next iteration.
    del dwpc_matrix

100%|██████████| 88/88 [00:01<00:00, 80.80it/s]

CPU times: user 1.04 s, sys: 47.2 ms, total: 1.09 s
Wall time: 1.09 s





In [7]:
# Detach the cache from the hetmat now that the passes above are finished.
hetmat.path_counts_cache = None

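Use the mean DWPC of each metapath on the unpermuted hetnet (computed above) as the scaling factor. Then compute degree-grouped permutation (DGP) statistics for every metapath on every permuted hetnet.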

In [8]:
# For every permuted hetnet, compute the DWPC matrix of each metapath and write
# per-(source_degree, target_degree) statistics to a TSV inside the permutation's directory.
for name, permat in tqdm.tqdm(hetmat.permutations.items()):
    # Each permutation gets its own cache, attached here and detached at the end of the iteration.
    permat.path_counts_cache = hetmech.hetmat.PathCountPriorityCache(permat, allocate_GB=16)
    for metapath in metapaths:
        # Computed directly with dwpc (not compute_save_dwpc), so the matrix itself is not saved by this cell.
        _, _, matrix = hetmech.degree_weight.dwpc(permat, metapath, damping=0.5, dense_threshold=0.7)
        source_deg_to_ind, target_deg_to_ind = hetmech.degree_group.metapath_to_degree_dicts(permat, metapath)
        # Scale by the mean DWPC of the unpermuted hetnet so all permutations share one scaler.
        scaler = mean_dwpcs[(metapath, 'dwpc', 0.5)]
        row_generator = hetmech.degree_group.generate_degree_group_stats(
            source_deg_to_ind, target_deg_to_ind, matrix, scale=True, scaler=scaler)
        degree_stats_df = (
            pandas.DataFrame(row_generator)
            .set_index(['source_degree', 'target_degree'])
        )
        path = permat.directory.joinpath('degree-grouped-path-counts', 'dwpc-0.5', f'{metapath}.tsv')
        path.parent.mkdir(parents=True, exist_ok=True)
        degree_stats_df.to_csv(path, sep='\t')
    permat.path_counts_cache = None

100%|██████████| 25/25 [10:04<00:00, 24.16s/it]


Create multi-permutation DGP summary metrics.

In [9]:
# For each metapath, add up the per-permutation degree-group tables written above,
# derive summary metrics from the totals, and save one combined table under the unpermuted hetmat.
for metapath in metapaths:
    degree_stats_df = None
    for permat in hetmat.permutations.values():
        path = permat.directory.joinpath('degree-grouped-path-counts', 'dwpc-0.5', f'{metapath}.tsv')
        df = (
            pandas.read_table(path)
            .set_index(['source_degree', 'target_degree'])
            # Each file contributes 1, so the summed column counts the permutations combined.
            .assign(n_perms=1)
        )
        if degree_stats_df is None:
            degree_stats_df = df
        else:
            # Element-wise sum aligned on the (source_degree, target_degree) index.
            degree_stats_df += df
    degree_stats_df = hetmech.degree_group.compute_summary_metrics(degree_stats_df)
    # The raw accumulators are dropped once the summary metrics have been computed.
    degree_stats_df.drop(columns=['sum', 'sum_of_squares'], inplace=True)
    path = hetmat.directory.joinpath('adjusted-path-counts', 'dwpc-0.5', 'degree-grouped-permutations', f'{metapath}.tsv')
    path.parent.mkdir(parents=True, exist_ok=True)
    degree_stats_df.to_csv(path, sep='\t')

In [119]:
def dwpc_to_degrees(graph, metapath, matrix):
    """
    Yield one record per nonzero entry of `matrix`, annotated with node degrees.

    Parameters
    ----------
    graph : graph object providing ``metagraph.get_metapath`` (a HetMat in this notebook).
    metapath : metapath, or anything ``graph.metagraph.get_metapath`` accepts.
    matrix : DWPC matrix for `metapath`, either a scipy.sparse matrix or a dense array.

    Yields
    ------
    dict with keys ``source_id``, ``target_id``, ``source_degree``,
    ``target_degree``, ``dwpc``, ``metapath``, ``source_metanode`` and
    ``target_metanode``. The ``dwpc`` value is ``arcsinh(value / matrix.mean())``.
    Nothing is yielded when the matrix has no nonzero entries.
    """
    metapath = graph.metagraph.get_metapath(metapath)
    # Degrees are taken along the first metaedge (source side) and the last metaedge (target side).
    row_names, _, source_adj_mat = hetmech.degree_weight.metaedge_to_adjacency_matrix(
        graph, metapath[0], dense_threshold=0.7)
    _, col_names, target_adj_mat = hetmech.degree_weight.metaedge_to_adjacency_matrix(
        graph, metapath[-1], dense_threshold=0.7)
    source_degrees = source_adj_mat.sum(axis=1).flat
    target_degrees = target_adj_mat.sum(axis=0).flat

    # Locate nonzero entries on the matrix that was passed in, not on a notebook global.
    row_inds, col_inds = matrix.nonzero()
    if len(row_inds) == 0:
        # All-zero matrix: nothing to report, and the mean would be zero (division by zero).
        return

    scale = matrix.mean()
    if scipy.sparse.issparse(matrix):
        # Densify so individual entries can be looked up cheaply below.
        matrix = matrix.toarray()
    matrix = numpy.arcsinh(numpy.asarray(matrix) / scale)

    for row_ind, col_ind in zip(row_inds, col_inds):
        yield {
            'source_id': row_names[row_ind],
            'target_id': col_names[col_ind],
            'source_degree': source_degrees[row_ind],
            'target_degree': target_degrees[col_ind],
            'dwpc': matrix[row_ind, col_ind],
            'metapath': metapath,
            'source_metanode': metapath.source(),
            'target_metanode': metapath.target(),
        }

In [120]:
# Build one long table with a row per nonzero (source, target, metapath) DWPC,
# joined to the permutation-derived statistics of the matching degree group.
dwpc_dfs = []
for metapath in metapaths:
    # Use the loop variable: `mp` is not defined at notebook level.
    stats_path = hetmat.directory.joinpath(
        'adjusted-path-counts', 'dwpc-0.5', 'degree-grouped-permutations', f'{metapath}.tsv')
    degree_stats_df = pandas.read_table(stats_path)

    row, col, dwpc_matrix = hetmat.read_path_counts(metapath, 'dwpc', 0.5)
    dwpc_row_generator = dwpc_to_degrees(hetmat, metapath, dwpc_matrix)
    dwpc_df = pandas.DataFrame(dwpc_row_generator)
    df = (
        dwpc_df
        .merge(degree_stats_df, on=['source_degree', 'target_degree'])
        .drop(columns=['source_degree', 'target_degree'])
        .rename(columns={'mean': 'p-dwpc', 'sd': 'sd-dwpc'})
    )
    dwpc_dfs.append(df)
# Concatenate once at the end instead of growing the frame inside the loop.
full_dwpc_df = pandas.concat(dwpc_dfs, ignore_index=True)
# r-dwpc: observed DWPC minus the permutation mean for the same degree group.
full_dwpc_df['r-dwpc'] = full_dwpc_df['dwpc'] - full_dwpc_df['p-dwpc']
# z-dwpc: r-dwpc in units of the permutation standard deviation.
# Where sd-dwpc is 0 this division produces inf (or NaN when r-dwpc is also 0).
full_dwpc_df['z-dwpc'] = full_dwpc_df['r-dwpc'] / full_dwpc_df['sd-dwpc']

In [121]:
# Preview the first two rows of the combined table.
full_dwpc_df.head(2)

Unnamed: 0,dwpc,metapath,source_id,source_metanode,target_id,target_metanode,n,nnz,n_perms,p-dwpc,sd-dwpc,r-dwpc,z-dwpc
0,5.049958,(Disease - resembles - Disease),DOID:0050156,Disease,DOID:3083,Disease,3025,7,25,0.011686,0.242685,5.038272,20.760547
1,5.049958,(Disease - resembles - Disease),DOID:3083,Disease,DOID:0050156,Disease,3025,7,25,0.011686,0.242685,5.038272,20.760547


In [122]:
# All rows whose source node is DOID:2377, highest z-dwpc first.
# Rows with sd-dwpc == 0 have z-dwpc == inf and therefore sort to the top.
full_dwpc_df[full_dwpc_df['source_id'] == 'DOID:2377'].sort_values(by='z-dwpc', ascending=False)

Unnamed: 0,dwpc,metapath,source_id,source_metanode,target_id,target_metanode,n,nnz,n_perms,p-dwpc,sd-dwpc,r-dwpc,z-dwpc
1332195,2.865957,"(Disease - palliates - Compound, Compound - ca...",DOID:2377,Disease,C0221100,Side Effect,25,0,25,0.000000,0.000000,2.865957,inf
1332192,2.233810,"(Disease - palliates - Compound, Compound - ca...",DOID:2377,Disease,C0028077,Side Effect,25,0,25,0.000000,0.000000,2.233810,inf
1339992,7.568769,"(Disease - palliates - Compound, Compound - do...",DOID:2377,Disease,6382,Gene,25,0,25,0.000000,0.000000,7.568769,inf
1337884,6.285036,"(Disease - palliates - Compound, Compound - up...",DOID:2377,Disease,6721,Gene,25,0,25,0.000000,0.000000,6.285036,inf
1337883,6.285036,"(Disease - palliates - Compound, Compound - up...",DOID:2377,Disease,3949,Gene,25,0,25,0.000000,0.000000,6.285036,inf
1332201,3.910672,"(Disease - palliates - Compound, Compound - ca...",DOID:2377,Disease,C1328337,Side Effect,25,0,25,0.000000,0.000000,3.910672,inf
1332200,2.998104,"(Disease - palliates - Compound, Compound - ca...",DOID:2377,Disease,C0919715,Side Effect,25,0,25,0.000000,0.000000,2.998104,inf
1332199,2.998104,"(Disease - palliates - Compound, Compound - ca...",DOID:2377,Disease,C0856054,Side Effect,25,0,25,0.000000,0.000000,2.998104,inf
1332198,2.330735,"(Disease - palliates - Compound, Compound - ca...",DOID:2377,Disease,C0522153,Side Effect,25,0,25,0.000000,0.000000,2.330735,inf
1332197,3.929708,"(Disease - palliates - Compound, Compound - ca...",DOID:2377,Disease,C0278106,Side Effect,25,0,25,0.000000,0.000000,3.929708,inf
