# Degree-Grouped Permutations
In permuted networks, node pairs that have the same out- and in-degrees along the metapath (the source node's out-degree and the target node's in-degree) are indistinguishable. This means that when computing permuted DWPC values, we need not average only over the five permutations of a single node pair (say, one compound-disease pair); we can instead average over all compound-disease pairs that have the same out- and in-degrees along the metapath.

In [1]:
import numpy as np
from scipy import sparse
import pandas as pd
import itertools
import hetio.readwrite

In [2]:
import sys
sys.path.insert(0, '../')
from hetmech.degree_weight import dwpc
from hetio.matrix import metaedge_to_adjacency_matrix

In [3]:
# Pinned snapshot of the hetionet repository that every network is read from.
repo_url = 'https://github.com/dhimmel/hetionet'
commit = '6d26d15e9055b33b4fd97a180fa288e4f2060b96'

# The unpermuted network comes first, followed by the five permuted networks.
names = ['hetionet-v1.0', *(f'hetionet-v1.0-perm-{i + 1}' for i in range(5))]

# Repository-relative path of each network file, in the same order as `names`.
paths = ['hetnet/json/hetionet-v1.0.json.bz2']
paths.extend(f'hetnet/permuted/json/{name}.json.bz2' for name in names[1:])

In [4]:
# Build one long-format table of DWPC values per network: one row per
# (source node, target node, metapath) with the source out-degree and target
# in-degree attached. Rows from the unpermuted network end up in `dwpc_df`,
# rows from the permuted networks are pooled in `permuted`.
metapaths = None
# Per-network frames are collected in lists and concatenated once after the
# loop: DataFrame.append was removed in pandas 2.0 and copied the whole
# accumulated frame on every call.
permuted_frames = []
dwpc_frames = []

for name, path in zip(names, paths):
    url = f'{repo_url}/raw/{commit}/{path}'
    graph = hetio.readwrite.read_graph(url)
    print(f'Loaded {name}')
    if metapaths is None:
        # Extract the metapaths once, from the first network loaded, and reuse
        # the same objects for every later network.
        metaedge = graph.metagraph.metapath_from_abbrev('CtD')
        metapaths = graph.metagraph.extract_metapaths(
            source=metaedge.source(), target=metaedge.target(), max_length=1)
    for metapath in metapaths:
        c, d, dwpc_matrix, t = dwpc(graph, metapath)
        print(f'metapath: {metapath}, time: {t}')

        # Row sums of the first metaedge's adjacency matrix and column sums of
        # the last one's. Flattened to 1-D so a sparse/np.matrix sum also works.
        out_degrees = np.asarray(
            np.sum(metaedge_to_adjacency_matrix(graph, metapath[0])[2], axis=1)).ravel()
        in_degrees = np.asarray(
            np.sum(metaedge_to_adjacency_matrix(graph, metapath[-1])[2], axis=0)).ravel()
        n_source, n_target = len(out_degrees), len(in_degrees)

        # Every (row, col) index pair in row-major order, i.e. the same order
        # itertools.product(range(n_source), range(n_target)) would give.
        rows = np.repeat(np.arange(n_source), n_target)
        cols = np.tile(np.arange(n_target), n_source)

        # Look up all DWPC values with one vectorized index instead of a
        # Python-level call per row.
        if sparse.issparse(dwpc_matrix):
            dense_dwpc = dwpc_matrix.toarray()
        else:
            dense_dwpc = np.asarray(dwpc_matrix)

        degree_index_df = pd.DataFrame({
            'row': rows,
            'col': cols,
            'source_degree': out_degrees[rows],
            'target_degree': in_degrees[cols],
            'dwpc': dense_dwpc[rows, cols],
        })
        # A list (not a bare object) so pandas stores one metapath per row.
        degree_index_df['metapath'] = [metapath] * len(degree_index_df)
        # 'row' and 'col' are kept on purpose: later cells group and merge on
        # them. (The previous drop(columns=...) call discarded its result and
        # therefore never removed them.)
        if 'perm' in name:
            permuted_frames.append(degree_index_df)
        else:
            dwpc_frames.append(degree_index_df)

permuted = pd.concat(permuted_frames) if permuted_frames else pd.DataFrame()
dwpc_df = pd.concat(dwpc_frames) if dwpc_frames else pd.DataFrame()

Loaded hetionet-v1.0
metapath: CpD, time: 0.09279719501500949
metapath: CtD, time: 0.0936714849958662
Loaded hetionet-v1.0-perm-1
metapath: CpD, time: 0.09495542000513524
metapath: CtD, time: 0.09458129398990422
Loaded hetionet-v1.0-perm-2
metapath: CpD, time: 0.09932387797744013
metapath: CtD, time: 0.10067291100858711
Loaded hetionet-v1.0-perm-3
metapath: CpD, time: 0.10086868598591536
metapath: CtD, time: 0.10146672301925719
Loaded hetionet-v1.0-perm-4
metapath: CpD, time: 0.10212302999570966
metapath: CtD, time: 0.10411549799027853
Loaded hetionet-v1.0-perm-5
metapath: CpD, time: 0.1031207269988954
metapath: CtD, time: 0.10360637202393264


In [5]:
# Preview the first two rows of the pooled permuted-network table.
permuted.head(2)

Unnamed: 0,row,col,source_degree,target_degree,dwpc,metapath
0,0,0,0,1,0.0,(Compound - palliates - Disease)
1,0,1,0,10,0.0,(Compound - palliates - Disease)


## A single metapath:

In [6]:
# Restrict the permuted rows to the second extracted metapath.
selected_metapath = metapaths[1]
permutation_df = permuted.loc[permuted['metapath'] == selected_metapath]

In [7]:
# Restrict the unpermuted rows to the second extracted metapath.
is_selected_metapath = dwpc_df['metapath'] == metapaths[1]
full_dwpc_df = dwpc_df.loc[is_selected_metapath]

In [8]:
# Pool the permuted DWPC values of every node pair that shares the same
# (source_degree, target_degree) combination into one list per combination.
dwpc_by_degree = permutation_df.groupby(['source_degree', 'target_degree'])['dwpc']
degree_grouped_permutations = dwpc_by_degree.apply(list).to_frame().reset_index()

In [9]:
# Preview the first two degree combinations and their pooled DWPC lists.
degree_grouped_permutations.head(2)

Unnamed: 0,source_degree,target_degree,dwpc
0,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,0,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [10]:
# Attach the pooled permuted values to each unpermuted node pair via its
# degree combination. Both frames have a 'dwpc' column, so pandas suffixes
# them: 'dwpc_x' is the unpermuted value, 'dwpc_y' the list of permuted values.
degree_keys = ['source_degree', 'target_degree']
full_dwpc_df = pd.merge(full_dwpc_df, degree_grouped_permutations, on=degree_keys)

In [11]:
# Preview the first two rows of the merged table.
full_dwpc_df.head(2)

Unnamed: 0,row,col,source_degree,target_degree,dwpc_x,metapath,dwpc_y
0,0,0,2,0,0.0,(Compound - treats - Disease),"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,0,1,2,0,0.0,(Compound - treats - Disease),"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [12]:
# Keep only node pairs whose source degree and target degree are both nonzero.
# The explicit .copy() makes the result an independent frame, so the column
# assignments in later cells do not trigger pandas' SettingWithCopyWarning.
nonzero_degree = (full_dwpc_df['source_degree'] != 0) & (full_dwpc_df['target_degree'] != 0)
full_dwpc_df = full_dwpc_df[nonzero_degree].copy()

In [13]:
# P-DWPC: mean of the pooled permuted DWPC values for each node pair.
full_dwpc_df['p-dwpc'] = full_dwpc_df['dwpc_y'].map(np.mean)

In [14]:
# Replace the merge suffixes with descriptive column names.
merged_column_names = {'dwpc_x': 'dwpc', 'dwpc_y': 'permutations'}
full_dwpc_df = full_dwpc_df.rename(columns=merged_column_names)
# R-DWPC: unpermuted DWPC minus the permuted mean.
full_dwpc_df['r-dwpc'] = full_dwpc_df['dwpc'].sub(full_dwpc_df['p-dwpc'])

In [15]:
# SD-DWPC: standard deviation (np.std default, i.e. ddof=0) of the pooled
# permuted DWPC values for each node pair.
full_dwpc_df['sd-dwpc'] = full_dwpc_df['permutations'].map(np.std)

In [16]:
# Z-DWPC: R-DWPC scaled by the permuted standard deviation. A 0 / 0 division
# yields NaN, which is replaced by 0; a nonzero / 0 division stays infinite.
z_scores = full_dwpc_df['r-dwpc'].div(full_dwpc_df['sd-dwpc'])
full_dwpc_df['z-dwpc'] = z_scores.fillna(0)

In [17]:
# Preview the first two rows with the derived p-, r-, sd- and z-DWPC columns.
full_dwpc_df.head(2)

Unnamed: 0,row,col,source_degree,target_degree,dwpc,metapath,permutations,p-dwpc,r-dwpc,sd-dwpc,z-dwpc
4260,0,2,2,4,0.0,(Compound - treats - Disease),"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.003984,-0.003984,0.037317,-0.106752
4261,0,12,2,4,0.0,(Compound - treats - Disease),"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.003984,-0.003984,0.037317,-0.106752


In [18]:
# Histogram of pooled-list length divided by 5 (the number of permuted
# networks) for each node pair.
pooled_size_per_permutation = full_dwpc_df['permutations'].map(len) / 5
pooled_size_per_permutation.hist(bins=50)

<matplotlib.axes._subplots.AxesSubplot at 0x7f7d51959a90>

In [19]:
# Drop permuted rows where either the source degree or the target degree is 0.
source_nonzero = permutation_df['source_degree'] != 0
target_nonzero = permutation_df['target_degree'] != 0
permutation_df = permutation_df[source_nonzero & target_nonzero]

In [20]:
# Per node pair (row, col): the list of its own DWPC values across the
# permuted networks, without pooling over degree.
dwpc_by_pair = permutation_df.groupby(['row', 'col'])['dwpc']
compare_df = dwpc_by_pair.apply(list).reset_index()

In [21]:
# Mean over each pair's own permuted values only.
compare_df['original p-dwpc'] = compare_df['dwpc'].map(np.mean)
# Join with the degree-grouped results. The clashing 'dwpc' columns get the
# '_x' (per-pair permuted list) and '_y' (unpermuted DWPC) suffixes.
compare_df = pd.merge(compare_df, full_dwpc_df, on=['row', 'col'])
compare_df = compare_df.rename(columns={'dwpc_x': 'original perm', 'dwpc_y': 'dwpc'})

In [22]:
# Preview the first two rows of the per-pair vs. degree-grouped comparison.
compare_df.head(2)

Unnamed: 0,row,col,original perm,original p-dwpc,source_degree,target_degree,dwpc,metapath,permutations,p-dwpc,r-dwpc,sd-dwpc,z-dwpc
0,0,2,"[0.0, 0.0, 0.0, 0.0, 0.0]",0.0,2,4,0.0,(Compound - treats - Disease),"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.003984,-0.003984,0.037317,-0.106752
1,0,3,"[0.0, 0.0, 0.0, 0.0, 0.0]",0.0,2,1,0.0,(Compound - treats - Disease),"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.001565,-0.001565,0.033229,-0.047098


In [23]:
# Number of rows in the comparison table.
len(compare_df)

29799

In [24]:
# Those for which DGP makes P-DWPC nonzero
compare_df[(compare_df['original p-dwpc'] == 0) & (compare_df['p-dwpc'] != 0)].shape

(26553, 13)

In [25]:
# Those which stay zero
compare_df[(compare_df['original p-dwpc'] == 0) & (compare_df['p-dwpc'] == 0)].shape

(177, 13)

In [26]:
# Relative change of a count going from 9411 to 23 (about -99.76%).
# NOTE(review): both counts are hard-coded and do not match the shapes shown
# by the two preceding cells (26553 and 177) -- confirm where they come from,
# or derive them from compare_df so the figure cannot go stale.
(23 - 9411) / 9411

-0.9975560514291786

In [27]:
# Rows whose Z-DWPC is +inf: nonzero R-DWPC divided by a zero SD-DWPC.
z_is_infinite = compare_df['z-dwpc'].eq(np.inf)
compare_df.loc[z_is_infinite]

Unnamed: 0,row,col,original perm,original p-dwpc,source_degree,target_degree,dwpc,metapath,permutations,p-dwpc,r-dwpc,sd-dwpc,z-dwpc
2736,148,78,"[0.0, 0.0, 0.0, 0.0, 0.0]",0.0,5,51,0.062622,(Compound - treats - Disease),"[0.0, 0.0, 0.0, 0.0, 0.0]",0.0,0.062622,0.0,inf
2741,148,85,"[0.0, 0.0, 0.0, 0.0, 0.0]",0.0,5,2,0.316228,(Compound - treats - Disease),"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.316228,0.0,inf
6905,320,94,"[0.0, 0.0, 0.0, 0.0, 0.0]",0.0,14,3,0.154303,(Compound - treats - Disease),"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.154303,0.0,inf
6906,320,95,"[0.0, 0.0, 0.0, 0.0, 0.0]",0.0,14,7,0.101015,(Compound - treats - Disease),"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.101015,0.0,inf
8349,384,66,"[0.0, 0.0, 0.0, 0.0, 0.0]",0.0,8,5,0.158114,(Compound - treats - Disease),"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.158114,0.0,inf
10543,484,128,"[0.0, 0.0, 0.0, 0.0, 0.0]",0.0,12,13,0.080064,(Compound - treats - Disease),"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.080064,0.0,inf
16213,719,81,"[0.0, 0.0, 0.0, 0.0, 0.0]",0.0,13,37,0.045596,(Compound - treats - Disease),"[0.0, 0.0, 0.0, 0.0, 0.0]",0.0,0.045596,0.0,inf
18051,812,66,"[0.0, 0.0, 0.0, 0.0, 0.0]",0.0,9,5,0.149071,(Compound - treats - Disease),"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.149071,0.0,inf
18055,812,73,"[0.0, 0.0, 0.0, 0.0, 0.0]",0.0,9,2,0.235702,(Compound - treats - Disease),"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.235702,0.0,inf
18166,813,128,"[0.0, 0.0, 0.0, 0.0, 0.0]",0.0,10,13,0.087706,(Compound - treats - Disease),"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.087706,0.0,inf


# Conclusion
So, using the degree-grouped permutations, we reduce the number of P-DWPC zeros by over 99.7%. This means that only a very few node pairs still have an infinite Z-DWPC. To be clear, when DWPC = 0 and P-DWPC = 0, Z-DWPC is NaN (0 / 0), which I think we can safely call zero (as I have done here). The difficult case is when DWPC != 0 but P-DWPC = 0, which makes Z-DWPC infinite. This is the group that has been most reduced by using degree-grouped permutations.