In [3]:
import hetmech.degree_weight
import hetmech.degree_group
from hetmech.hetmat import HetMat
import numpy 
import pandas
import scipy.sparse
import itertools
from hetmech.matrix import metaedge_to_adjacency_matrix

In [5]:
hetmat = HetMat('../data/hetionet-v1.0.hetmat/')

In [6]:
metapath = 'CbGaD'

In [39]:
def dwpc_to_degrees(graph, metapath, damping=0.5):
    metapath = graph.metagraph.get_metapath(metapath)
    _, _, source_adj_mat = metaedge_to_adjacency_matrix(graph, metapath[0], dense_threshold=0.7)
    _, _, target_adj_mat = metaedge_to_adjacency_matrix(graph, metapath[-1], dense_threshold=0.7)
    source_degrees = source_adj_mat.sum(axis=1).flat
    target_degrees = target_adj_mat.sum(axis=0).flat
    del source_adj_mat, target_adj_mat

    source_path = graph.get_nodes_path(metapath.source(), file_format='tsv')
    source_node_df = pandas.read_table(source_path)
    source_node_names = list(source_node_df['name'])

    target_path = graph.get_nodes_path(metapath.target(), file_format='tsv')
    target_node_df = pandas.read_table(target_path)
    target_node_names = list(target_node_df['name'])

    row_names, col_names, dwpc_matrix = graph.read_path_counts(metapath, 'dwpc', damping)
    dwpc_matrix = numpy.arcsinh(dwpc_matrix / dwpc_matrix.mean())
    if scipy.sparse.issparse(dwpc_matrix):
        dwpc_matrix = dwpc_matrix.toarray()

    _, _, path_count = graph.read_path_counts(metapath, 'dwpc', 0.0)
    if scipy.sparse.issparse(path_count):
        path_count = path_count.toarray()

    row_inds, col_inds = range(len(row_names)), range(len(col_names))
    for row in itertools.product(row_inds, col_inds):
        row_ind, col_ind = row
        row = {
            'source_id': row_names[row_ind],
            'source_name': source_node_names[row_ind],
            'target_name': target_node_names[col_ind],
            'target_id': col_names[col_ind],
            'source_degree': source_degrees[row_ind],
            'target_degree': target_degrees[col_ind],
            'dwpc': dwpc_matrix[row_ind, col_ind],
            'path-count': path_count[row_ind, col_ind],
        }
        yield row
        continue

In [40]:
%%time
pandas.DataFrame(dwpc_to_degrees(hetmat, metapath))

CPU times: user 634 ms, sys: 35.5 ms, total: 669 ms
Wall time: 669 ms


Unnamed: 0,dwpc,path-count,source_degree,source_id,source_name,target_degree,target_id,target_name
0,0.000000,0,2,DB00014,Goserelin,18,DOID:0050156,idiopathic pulmonary fibrosis
1,0.000000,0,2,DB00014,Goserelin,12,DOID:0050425,restless legs syndrome
2,0.000000,0,2,DB00014,Goserelin,49,DOID:0050741,alcohol dependence
3,0.000000,0,2,DB00014,Goserelin,19,DOID:0050742,nicotine dependence
4,0.000000,0,2,DB00014,Goserelin,9,DOID:0060073,lymphatic system cancer
5,0.000000,0,2,DB00014,Goserelin,16,DOID:0060119,pharynx cancer
6,0.000000,0,2,DB00014,Goserelin,4,DOID:10021,duodenum cancer
7,0.000000,0,2,DB00014,Goserelin,0,DOID:10153,ileum cancer
8,0.000000,0,2,DB00014,Goserelin,23,DOID:1024,leprosy
9,0.000000,0,2,DB00014,Goserelin,535,DOID:10283,prostate cancer


In [36]:
def dwpc_to_degrees(graph, metapath, damping=0.5):
    metapath = graph.metagraph.get_metapath(metapath)
    _, _, source_adj_mat = metaedge_to_adjacency_matrix(graph, metapath[0], dense_threshold=0.7)
    _, _, target_adj_mat = metaedge_to_adjacency_matrix(graph, metapath[-1], dense_threshold=0.7)
    source_degrees = source_adj_mat.sum(axis=1).flat
    target_degrees = target_adj_mat.sum(axis=0).flat
    del source_adj_mat, target_adj_mat

    source_path = graph.get_nodes_path(metapath.source(), file_format='tsv')
    source_node_df = pandas.read_table(source_path)
    source_node_names = list(source_node_df['name'])

    target_path = graph.get_nodes_path(metapath.target(), file_format='tsv')
    target_node_df = pandas.read_table(target_path)
    target_node_names = list(target_node_df['name'])

    row_names, col_names, dwpc_matrix = graph.read_path_counts(metapath, 'dwpc', damping)
    dwpc_matrix = numpy.arcsinh(dwpc_matrix / dwpc_matrix.mean())
    dwpc_matrix = scipy.sparse.coo_matrix(dwpc_matrix)

    _, _, path_count = graph.read_path_counts(metapath, 'dwpc', 0.0)
    path_count = scipy.sparse.coo_matrix(dwpc_matrix)

    indices = range(dwpc_matrix.nnz)
    for ind in indices:
        row_ind = dwpc_matrix.row[ind]
        col_ind = dwpc_matrix.col[ind]
        row = {
            'source_id': row_names[row_ind],
            'source_name': source_node_names[row_ind],
            'target_name': target_node_names[col_ind],
            'target_id': col_names[col_ind],
            'source_degree': source_degrees[row_ind],
            'target_degree': target_degrees[col_ind],
            'dwpc': dwpc_matrix.data[ind],
            'path-count': path_count.data[ind],
        }
        yield row
        continue

In [38]:
%%time
pandas.DataFrame(dwpc_to_degrees(hetmat, metapath))

CPU times: user 156 ms, sys: 4.22 ms, total: 160 ms
Wall time: 159 ms


Unnamed: 0,dwpc,path-count,source_degree,source_id,source_name,target_degree,target_id,target_name
0,4.967856,4.967856,2,DB00014,Goserelin,200,DOID:11612,polycystic ovary syndrome
1,5.248894,5.248894,2,DB00014,Goserelin,38,DOID:3953,adrenal gland cancer
2,0.695913,0.695913,5,DB00035,Desmopressin,535,DOID:10283,prostate cancer
3,0.888112,0.888112,5,DB00035,Desmopressin,298,DOID:10534,stomach cancer
4,2.032033,2.032033,5,DB00035,Desmopressin,196,DOID:10652,Alzheimer's disease
5,4.198928,4.198928,5,DB00035,Desmopressin,496,DOID:10763,hypertension
6,1.240646,1.240646,5,DB00035,Desmopressin,121,DOID:11054,urinary bladder cancer
7,1.590008,1.590008,5,DB00035,Desmopressin,55,DOID:11934,head and neck cancer
8,3.457301,3.457301,5,DB00035,Desmopressin,253,DOID:12849,autistic disorder
9,0.948889,0.948889,5,DB00035,Desmopressin,252,DOID:1324,lung cancer
