# Generate Project Rephetio DWPCs using hetmech's matrix implementation

In [1]:
import collections
import itertools
import pathlib

import hetio.readwrite
import numpy
import pandas
import requests
import scipy.sparse
import tqdm

import hetmech.degree_weight

## Load Hetionet v1.0

In [2]:
%%time
# Pin the download to one specific commit of the hetionet repository so the
# input graph does not change between runs.
commit = '59c448fd912555f84b9822b4f49b431b696aea15'
url = f'https://github.com/dhimmel/hetionet/raw/{commit}/hetnet/json/hetionet-v1.0.json.bz2'
# read_graph is given the URL directly; the cell timing below shows this step
# takes on the order of a minute and a half.
graph = hetio.readwrite.read_graph(url)
# The metagraph is used in a later cell to turn metapath abbreviations into
# metapath objects.
metagraph = graph.metagraph

CPU times: user 1min 26s, sys: 2.3 s, total: 1min 28s
Wall time: 1min 30s


## Load Project Rephetio metapaths

In [3]:
# Pin the metapath list to one specific commit of the dhimmel/learn repository.
commit = 'ef5f7a6b76b6a01499d65b95e3d7ca93ac5aba57'
url = f'https://github.com/dhimmel/learn/raw/{commit}/all-features/data/metapaths.json'
# requests has no default timeout, so set one to avoid hanging the notebook,
# and fail loudly on an HTTP error instead of trying to JSON-decode an error page.
response = requests.get(url, timeout=60)
response.raise_for_status()
metapath_info = response.json()
len(metapath_info)

1206

## Categorize metapaths

In [4]:
# Annotate every metapath record in place with its parsed metapath object and
# the category hetmech assigns to it.
for info in metapath_info:
    info['metapath'] = metapath = metagraph.metapath_from_abbrev(info['abbreviation'])
    info['category'] = hetmech.degree_weight.categorize(metapath)

# Tabular view of the annotated records; preview the first two rows.
metapath_df = pandas.DataFrame(metapath_info)
metapath_df.head(2)

Unnamed: 0,abbreviation,category,dwpc_query,edge_abbreviations,edges,join_complexities,length,metapath,midpoint_index,optimal_join_index,standard_edge_abbreviations,standard_edges
0,CbGaD,no_repeats,MATCH path = (n0:Compound)-[:BINDS_CbG]-(n1)-[...,"[CbG, GaD]","[Compound - binds - Gene, Gene - associates - ...","[1.706732543946901, 1.9982341667905623, 0.6525...",2,"(Compound - binds - Gene, Gene - associates - ...",1,2,"[CbG, DaG]","[Compound - binds - Gene, Disease - associates..."
1,CbGdD,no_repeats,MATCH path = (n0:Compound)-[:BINDS_CbG]-(n1)-[...,"[CbG, GdD]","[Compound - binds - Gene, Gene - downregulates...","[1.487695881502091, 1.8000147471813792, 0.4335...",2,"(Compound - binds - Gene, Gene - downregulates...",1,2,"[CbG, DdG]","[Compound - binds - Gene, Disease - downregula..."


In [5]:
# Number of metapath records; the categorization loop annotates the existing
# records in place rather than adding or removing any.
len(metapath_info)

1206

In [6]:
# Number of metapaths in each category, most frequent first.
metapath_df['category'].value_counts()

short_repeat    599
BABA            278
BAAB            144
disjoint        131
other            32
no_repeats       18
long_repeat       4
Name: category, dtype: int64

In [7]:
# Same breakdown as a share of all metapaths, rendered as one-decimal percentages.
category_fractions = metapath_df['category'].value_counts(normalize=True)
category_fractions.map('{:.1%}'.format)

short_repeat    49.7%
BABA            23.1%
BAAB            11.9%
disjoint        10.9%
other            2.7%
no_repeats       1.5%
long_repeat      0.3%
Name: category, dtype: object

## Compute DWPCs

In [8]:
# Categories skipped by this run; their records get no matrix or runtime.
exclude_categories = {'long_repeat', 'other'}
for info in tqdm.tqdm_notebook(metapath_info):
    if info['category'] not in exclude_categories:
        metapath = info['metapath']
        # `compounds` and `diseases` (the row/column labels) are intentionally
        # left in the notebook namespace: the pair table built later reads them.
        compounds, diseases, dwpc_matrix, seconds = hetmech.degree_weight.dwpc(
            graph, metapath, damping=0.4, dense_threshold=1)
        # Store the result and its runtime on the metapath record itself.
        info.update(dwpc_matrix=dwpc_matrix, dwpc_hetmech_runtime=seconds)




In [9]:
# Build the runtime table from just the fields that are written out, rather
# than framing every record field (matrices, metapath objects) and discarding
# them. Records skipped during DWPC computation have no runtime and get NaN.
runtime_columns = ['abbreviation', 'category', 'length', 'dwpc_hetmech_runtime']
runtime_df = pandas.DataFrame(
    [{key: info[key] for key in runtime_columns if key in info} for info in metapath_info],
    columns=runtime_columns,
)
path = pathlib.Path('data/rephetio-DWPCs-hetmech-runtime.tsv')
# Create the output directory if needed so a fresh checkout can run this cell.
path.parent.mkdir(parents=True, exist_ok=True)
runtime_df.to_csv(path, sep='\t', float_format='%.5g', index=False)
runtime_df.head(2)

Unnamed: 0,abbreviation,category,length,dwpc_hetmech_runtime
0,CbGaD,no_repeats,2,0.771515
1,CbGdD,no_repeats,2,0.729444


### Save DWPC matrices as a table with one row per Compound–Disease pair

In [10]:
# One row per (compound, disease) pair, compounds varying slowest. This matches
# the row-major (order='C') flattening of a compounds x diseases matrix below.
# `compounds` and `diseases` are the labels left behind by the DWPC loop.
pairs = numpy.array(list(itertools.product(compounds, diseases)))
columns = collections.OrderedDict([
    ('compound', pairs[:, 0]),
    ('disease', pairs[:, 1]),
])
expected_shape = (len(compounds), len(diseases))

for info in metapath_info:
    # Metapaths in excluded categories never had a matrix computed.
    if 'dwpc_matrix' not in info:
        continue
    matrix = info['dwpc_matrix']
    if scipy.sparse.issparse(matrix):
        matrix = matrix.toarray()
    assert isinstance(matrix, numpy.ndarray)
    # Guard against a matrix whose layout does not line up with `pairs`;
    # a transposed matrix would otherwise be silently misaligned.
    assert matrix.shape == expected_shape, info['abbreviation']
    # Keep the column as a 1-D numpy array: converting to a Python list would
    # allocate one float object per cell for every metapath.
    columns[info['abbreviation']] = numpy.asarray(matrix).ravel(order='C')

dwpc_df = pandas.DataFrame.from_dict(columns)
dwpc_df.head(2)

Unnamed: 0,compound,disease,CbGaD,CbGdD,CbGuD,CdGaD,CdGdD,CdGuD,CpDrD,CrCpD,...,CuGuDlAlD,CuGuDpCpD,CuGuDpCtD,CuGuDpSpD,CuGuDrDrD,CuGuDtCpD,CuGuDtCtD,CuGuDuGaD,CuGuDuGdD,CuGuDuGuD
0,DB00014,DOID:0050156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.002885,0.0,0.000216,0.000662,0.001791
1,DB00014,DOID:0050425,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
path = pathlib.Path('data/rephetio-DWPCs-hetmech.tsv.xz')
# Create the output directory if needed so a fresh checkout can run this cell.
path.parent.mkdir(parents=True, exist_ok=True)
# Tab-separated, xz-compressed, floats written with 5 significant digits.
dwpc_df.to_csv(path, sep='\t', float_format='%.5g', index=False, compression='xz')