# Compute DWPCs for all metapaths

In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads imported modules
# before each cell execution so that edits to them take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import pathlib
import random
import zipfile

import numpy
import pandas
import scipy.sparse
import tqdm

import hetmech.degree_weight
from hetmech.hetmat import HetMat
import hetmech.matrix
import hetmech.degree_group

In [3]:
# Open the on-disk HetMat for Hetionet v1.0 (the path is relative to the notebook's working directory).
hetmat = HetMat('../data/hetionet-v1.0.hetmat/')
# Make the recursive implementation the module-wide default DWWC method for everything below.
hetmech.degree_weight.default_dwwc_method = hetmech.degree_weight.dwwc_recursive

Extract all metapaths starting at Disease up to length 2. Their path count (PC) and DWPC matrices are computed on the actual (unpermuted) hetnet and saved to disk.

In [4]:
# Collect the metapaths extracted for the 'Disease' metanode with max_length=2.
metapaths = hetmat.metagraph.extract_metapaths('Disease', max_length=2)
# Optional: uncomment the next two lines to work on a seeded random sample of
# two metapaths, which is handy for quick test runs of the cells below.
# random.seed(1)
# metapaths = random.sample(metapaths, k=2)
# Display how many metapaths will be processed.
len(metapaths)

88

In [4]:
def compute_save_dwpc(graph, metapath, damping=0.5, dense_threshold=1, dtype='float64', approx_ok=True):
    """
    Return the DWPC result for `metapath`, reusing a matrix saved on disk when possible.

    A saved matrix is looked up for `metapath` first and then for its inverse,
    trying the '.sparse.npz' and '.npy' extensions in that order. When no
    readable file exists, the matrix is computed with
    `hetmech.degree_weight.dwpc` and written with `hetmech.hetmat.save_matrix`.

    Parameters
    ----------
    graph : object
        Provides `get_path_counts_path` and `read_path_counts`
        (the notebook passes a HetMat).
    metapath : object
        Metapath to evaluate; must expose an `inverse` attribute.
    damping : float
        Damping value used for the file lookup and forwarded to `dwpc`.
    dense_threshold : float
        Forwarded to `hetmech.degree_weight.dwpc`.
    dtype : str
        Forwarded to `hetmech.degree_weight.dwpc`. A matrix read from disk is
        returned as stored, without conversion.
    approx_ok : bool
        Unused; kept so existing callers remain valid.

    Returns
    -------
    tuple
        `(row, col, dwpc_matrix)` for `metapath`.
    """
    # Look at the requested orientation first so no transpose is needed when it exists.
    for inverse in (False, True):
        mp = metapath.inverse if inverse else metapath
        # Build the path from `graph` (not a notebook global) and for the
        # metapath that is actually going to be read.
        base_path = graph.get_path_counts_path(mp, 'dwpc', damping, None)
        for ext in ('.sparse.npz', '.npy'):
            # Derive each candidate from the unmodified base path; re-assigning
            # the base would pile extensions on top of one another.
            candidate = pathlib.Path(str(base_path) + ext)
            if not candidate.exists():
                continue
            try:
                row, col, dwpc_matrix = graph.read_path_counts(mp, 'dwpc', damping)
            except zipfile.BadZipFile:
                # Unreadable (e.g. partially written) archive: try the next candidate.
                continue
            if inverse:
                # Assumes read_path_counts returns the result oriented for the metapath it
                # was asked for (TODO confirm); flip it back to the requested orientation.
                row, col, dwpc_matrix = col, row, dwpc_matrix.T
            return row, col, dwpc_matrix
    # Nothing usable on disk: compute the matrix and store it for later runs.
    row, col, dwpc_matrix = hetmech.degree_weight.dwpc(
        graph, metapath, damping=damping, dense_threshold=dense_threshold, dtype=dtype)
    path = graph.get_path_counts_path(metapath, 'dwpc', damping, None)
    hetmech.hetmat.save_matrix(dwpc_matrix, path)
    return row, col, dwpc_matrix

### Compute path counts

In [5]:
%%time
# Attach a PathCountPriorityCache (allocate_GB=16) to the HetMat for this loop.
hetmat.path_counts_cache = hetmech.hetmat.PathCountPriorityCache(hetmat, allocate_GB=16)
for metapath in tqdm.tqdm(metapaths):
    # With damping=0 and an unsigned integer dtype this produces the path-count
    # matrices named in the section heading; they are stored under the 'dwpc' metric.
    row, col, dwpc_matrix = compute_save_dwpc(hetmat, metapath, damping=0, dtype='uint64')
    # The matrix itself is not used in this cell; drop the reference right away.
    del dwpc_matrix

100%|██████████| 88/88 [00:00<00:00, 120.05it/s]

CPU times: user 692 ms, sys: 39.4 ms, total: 731 ms
Wall time: 736 ms





### Compute DWPCs

In [6]:
%%time
hetmat.path_counts_cache = hetmech.hetmat.PathCountPriorityCache(hetmat, allocate_GB=16)
mean_dwpcs = dict()
for metapath in tqdm.tqdm(metapaths):
    # Compute degree-weighted path counts
    row, col, dwpc_matrix = compute_save_dwpc(hetmat, metapath, damping=0.5)
    mean_dwpcs[(metapath, 'dwpc', 0.5)] = dwpc_matrix.mean()
    del dwpc_matrix

100%|██████████| 88/88 [00:01<00:00, 80.80it/s]

CPU times: user 1.04 s, sys: 47.2 ms, total: 1.09 s
Wall time: 1.09 s





In [7]:
# Detach the path-count cache from the unpermuted HetMat before the permutations are processed.
hetmat.path_counts_cache = None

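Use the mean DWPC of each metapath on the unpermuted hetnet (computed above) as the scaling factor. Then compute the degree-grouped permutation (DGP) statistics for every metapath on every permutation.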

In [8]:
for name, permat in tqdm.tqdm(hetmat.permutations.items()):
    # Each permuted hetnet gets its own cache while its metapaths are processed.
    permat.path_counts_cache = hetmech.hetmat.PathCountPriorityCache(permat, allocate_GB=16)
    for metapath in metapaths:
        # DWPC matrix of this metapath on the permuted hetnet (row/column labels are not needed).
        _, _, matrix = hetmech.degree_weight.dwpc(
            permat, metapath, damping=0.5, dense_threshold=0.7,
        )
        source_deg_to_ind, target_deg_to_ind = hetmech.degree_group.metapath_to_degree_dicts(
            permat, metapath,
        )
        # Scale with the mean DWPC measured for this metapath on the unpermuted hetnet.
        scaler = mean_dwpcs[metapath, 'dwpc', 0.5]
        row_generator = hetmech.degree_group.generate_degree_group_stats(
            source_deg_to_ind, target_deg_to_ind, matrix, scale=True, scaler=scaler,
        )
        degree_stats_df = pandas.DataFrame(row_generator)
        degree_stats_df = degree_stats_df.set_index(['source_degree', 'target_degree'])
        # One TSV per metapath inside the permutation's own directory.
        path = permat.directory.joinpath('degree-grouped-path-counts', 'dwpc-0.5', f'{metapath}.tsv')
        path.parent.mkdir(parents=True, exist_ok=True)
        degree_stats_df.to_csv(path, sep='\t')
    # Detach the cache before moving on to the next permutation.
    permat.path_counts_cache = None

100%|██████████| 25/25 [10:04<00:00, 24.16s/it]


Create multi-permutation DGP summary metrics.

In [9]:
for metapath in metapaths:
    # Running element-wise sum of the per-permutation statistics for this metapath.
    degree_stats_df = None
    for permat in hetmat.permutations.values():
        path = permat.directory.joinpath('degree-grouped-path-counts', 'dwpc-0.5', f'{metapath}.tsv')
        df = pandas.read_table(path)
        df = df.set_index(['source_degree', 'target_degree'])
        # Each file contributes 1, so the summed column counts the permutations per degree pair.
        df = df.assign(n_perms=1)
        if degree_stats_df is not None:
            degree_stats_df += df
        else:
            degree_stats_df = df
    # Derive the summary metrics, then discard the raw accumulators before writing.
    degree_stats_df = (
        hetmech.degree_group.compute_summary_metrics(degree_stats_df)
        .drop(columns=['sum', 'sum_of_squares'])
    )
    path = hetmat.directory.joinpath(
        'adjusted-path-counts', 'dwpc-0.5', 'degree-grouped-permutations', f'{metapath}.tsv',
    )
    path.parent.mkdir(parents=True, exist_ok=True)
    degree_stats_df.to_csv(path, sep='\t')

In [63]:
%%time

full_dwpc_df = None
for metapath in metapaths:
    stats_path = hetmat.directory.joinpath('adjusted-path-counts', 'dwpc-0.5', 'degree-grouped-permutations', 
                                           f'{metapath}.tsv')
    degree_stats_df = pandas.read_table(stats_path)

    dwpc_row_generator = hetmech.degree_group.dwpc_to_degrees(hetmat, metapath)
    dwpc_df = pandas.DataFrame(dwpc_row_generator)
    df = (dwpc_df
     .merge(degree_stats_df, on=['source_degree', 'target_degree'])
     .drop(columns=['source_degree', 'target_degree'])
     .rename(columns={'mean': 'p-dwpc', 'sd': 'sd-dwpc'}))
    if full_dwpc_df is None:
        full_dwpc_df = df
    else:
        full_dwpc_df = full_dwpc_df.append(df, ignore_index=True)
full_dwpc_df['r-dwpc'] = full_dwpc_df['dwpc'] - full_dwpc_df['p-dwpc']
full_dwpc_df['z-dwpc'] = full_dwpc_df['r-dwpc'] / full_dwpc_df['sd-dwpc']

CPU times: user 1min 38s, sys: 9.15 s, total: 1min 47s
Wall time: 1min 48s


In [64]:
# Preview the first two rows of the combined table.
full_dwpc_df.head(2)

Unnamed: 0,dwpc,metapath,source_id,source_metanode,source_name,target_id,target_metanode,target_name,n,nnz,n_perms,p-dwpc,sd-dwpc,r-dwpc,z-dwpc
0,8.475021,DpC,DOID:0050156,Disease,idiopathic pulmonary fibrosis,DB00860,Compound,Prednisolone,34000,80,25,0.019941,0.410621,8.45508,20.590957
1,8.475021,DpC,DOID:0050742,Disease,nicotine dependence,DB00184,Compound,Nicotine,34000,80,25,0.019941,0.410621,8.45508,20.590957


In [72]:
# Ten highest positive z-dwpc rows whose source is DOID:2377.
(
    full_dwpc_df
    .loc[(full_dwpc_df['source_id'] == 'DOID:2377') & (full_dwpc_df['z-dwpc'] > 0)]
    .sort_values(by='z-dwpc', ascending=False)
    .head(10)
)

Unnamed: 0,dwpc,metapath,source_id,source_metanode,source_name,target_id,target_metanode,target_name,n,nnz,n_perms,p-dwpc,sd-dwpc,r-dwpc,z-dwpc
1011205,2.196126,DdGpPW,DOID:2377,Disease,multiple sclerosis,WP366_r84196,Pathway,TGF-beta Signaling Pathway,25,0,25,0.0,0.0,2.196126,inf
1096965,6.221435,DdGiG,DOID:2377,Disease,multiple sclerosis,23432,Gene,GPR161,62075,35,25,0.003571,0.151115,6.217864,41.146588
1096964,6.221435,DdGiG,DOID:2377,Disease,multiple sclerosis,9481,Gene,SLC25A27,62075,35,25,0.003571,0.151115,6.217864,41.146588
989824,8.520533,DdGcG,DOID:2377,Disease,multiple sclerosis,57047,Gene,PLSCR2,49575,70,25,0.009349,0.249816,8.511183,34.06985
8731,5.049958,DrD,DOID:2377,Disease,multiple sclerosis,DOID:332,Disease,amyotrophic lateral sclerosis,2200,2,25,0.004591,0.152227,5.045367,33.143634
989823,7.827385,DdGcG,DOID:2377,Disease,multiple sclerosis,50859,Gene,SPOCK3,49575,70,25,0.009349,0.249816,7.818036,31.295216
989825,7.201004,DdGcG,DOID:2377,Disease,multiple sclerosis,643836,Gene,ZFP62,49575,70,25,0.009349,0.249816,7.191655,28.787843
1096959,7.080257,DdGiG,DOID:2377,Disease,multiple sclerosis,9182,Gene,RASSF9,40075,72,25,0.010627,0.252248,7.06963,28.026466
1096961,6.854265,DdGiG,DOID:2377,Disease,multiple sclerosis,135250,Gene,RAET1E,40075,72,25,0.010627,0.252248,6.843638,27.130554
1096639,7.098441,DdGiG,DOID:2377,Disease,multiple sclerosis,81573,Gene,ANKRD13C,31575,85,25,0.015778,0.305905,7.082663,23.153134


In [69]:
# Every metapath row linking multiple sclerosis to HLA-DRB1, highest z-dwpc first.
full_dwpc_df.query("source_name == 'multiple sclerosis' and target_name == 'HLA-DRB1'").sort_values(by='z-dwpc', ascending=False)

Unnamed: 0,dwpc,metapath,source_id,source_metanode,source_name,target_id,target_metanode,target_name,n,nnz,n_perms,p-dwpc,sd-dwpc,r-dwpc,z-dwpc
5124318,4.264673,DaGiG,DOID:2377,Disease,multiple sclerosis,3123,Gene,HLA-DRB1,12375,1948,25,0.398492,0.99758,3.866181,3.875559
31388,5.146717,DaG,DOID:2377,Disease,multiple sclerosis,3123,Gene,HLA-DRB1,200,46,25,1.183745,2.171341,3.962972,1.825127
3680504,1.436285,DlAdG,DOID:2377,Disease,multiple sclerosis,3123,Gene,HLA-DRB1,141000,62797,25,0.779325,0.914459,0.65696,0.718414
2739061,1.33102,DlAeG,DOID:2377,Disease,multiple sclerosis,3123,Gene,HLA-DRB1,23900,23842,25,1.376573,0.421152,-0.045553,-0.108162


In [71]:
# Every metapath row linking multiple sclerosis to Fingolimod, highest z-dwpc first.
full_dwpc_df.query("source_name == 'multiple sclerosis' and target_name == 'Fingolimod'").sort_values(by='z-dwpc', ascending=False)

Unnamed: 0,dwpc,metapath,source_id,source_metanode,source_name,target_id,target_metanode,target_name,n,nnz,n_perms,p-dwpc,sd-dwpc,r-dwpc,z-dwpc
9408,6.859458,DtC,DOID:2377,Disease,multiple sclerosis,DB08868,Compound,Fingolimod,32250,478,25,0.101669,0.828902,6.757789,8.152703
4644999,4.653594,DaGbC,DOID:2377,Disease,multiple sclerosis,DB08868,Compound,Fingolimod,1225,506,25,0.727528,1.043149,3.926066,3.763668
