In [1]:
import json, pandas as pd, copy, functools, itertools, collections, operator, time, urllib.request
import concurrent.futures
import numpy as np
from scipy import sparse
import hetio.readwrite
from hetmech.degree_weight import *

## Load the graph

In [2]:
%%time

url = 'https://github.com/dhimmel/hetionet/raw/{}/hetnet/json/hetionet-v1.0.json.bz2'.format(
'59c448fd912555f84b9822b4f49b431b696aea15')

graph = hetio.readwrite.read_graph(url)
metagraph = graph.metagraph

CPU times: user 1min 19s, sys: 1.54 s, total: 1min 21s
Wall time: 1min 22s


## Load metapaths

In [3]:
metapaths_url = 'https://raw.githubusercontent.com/dhimmel/learn/{}/all-features/data/metapaths.json'.format(
    'ef5f7a6b76b6a01499d65b95e3d7ca93ac5aba57')

with urllib.request.urlopen(metapaths_url) as data_file:
    metapaths = json.loads(data_file.read().decode())

metapaths.sort(key=lambda x: x['join_complexities'][0])

len(metapaths)

1206

# Extract the actual metapaths from the list of metapath dictionaries

In [4]:
abbrevs = [metapath['abbreviation'] for metapath in metapaths]

## Categorize the metapaths

In [5]:
types = []
for metapath in abbrevs:
    m_path = metagraph.metapath_from_abbrev(metapath)
    cat = categorize(m_path)
    types.append([metapath, cat])
types = pd.DataFrame(types, columns=("Metapath", "Category"))

In [6]:
types.head()

Unnamed: 0,Metapath,Category
0,CpDpCpD,BABA
1,CpDpCtD,BABA
2,CpDtCpD,BABA
3,CtDpCpD,BABA
4,CiPCiCpD,short_repeat


In [7]:
types['Category'].value_counts()

short_repeat    599
BABA            278
BAAB            144
disjoint        131
other            32
no_repeats       18
long_repeat       4
Name: Category, dtype: int64

## ALL but other and Long_repeat

In [9]:
metapath_strings = types[types['Category'] != 'other']
metapath_strings = metapath_strings[metapath_strings['Category'] != 'long_repeat']
metapath_strings = list(metapath_strings.Metapath)

metapaths = [metagraph.metapath_from_abbrev(s) for s in metapath_strings]

In [10]:
%%time
dwpc_arrays = {}
metapath_times = []
n=0
for i, metapath in enumerate(metapaths):
    compounds, diseases, mat, timed = dwpc(graph, metapath, damping=0.4, sparse_threshold=1)
    st = metapath_strings[i]
    dwpc_arrays[st] = mat
    metapath_times.append([st, timed])

CPU times: user 46min 2s, sys: 23.7 s, total: 46min 26s
Wall time: 46min 26s


#### Save DWPC times as a .tsv file.

In [11]:
times_df = pd.DataFrame(metapath_times, columns=('Metapath', 'Time'))

times_df.to_csv(path_or_buf='data/dwpc_times.tsv', sep='\t', float_format='%.6g', index=False)

#### Save DWPC matrices as outputs in Compound-Disease pairs

In [12]:
compounds, diseases, mat = metaedge_to_adjacency_matrix(graph, 'CpD')

In [13]:
comp_disease_pairs = numpy.array([[comp, disease] for comp in compounds for disease in diseases])

In [14]:
mat_dict = {'compound': comp_disease_pairs[:,0], 'disease': comp_disease_pairs[:,1]}

In [15]:
mat_dict = dict()
for metaedge, matrix in dwpc_arrays.items():
    if type(matrix) == numpy.matrix:
        matrix = matrix.A
    elif sparse.issparse(matrix):
        matrix = matrix.toarray()
    assert type(matrix) == numpy.ndarray
    matrix = matrix.flatten(order='C').tolist()
    mat_dict[metaedge] = matrix
mat_dict['disease'] = comp_disease_pairs[:,1]
mat_dict['compound'] = comp_disease_pairs[:,0]

In [16]:
dwpc_matrices = pd.DataFrame.from_dict(mat_dict)

In [17]:
colnames = list(reversed(list(dwpc_matrices)))
colnames = [colnames.pop(1)] + colnames

In [18]:
dwpc_matrices = dwpc_matrices.ix[:, colnames]
dwpc_matrices.head()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  if __name__ == '__main__':


Unnamed: 0,compound,disease,CuGuDuGuD,CuGuDuGdD,CuGuDuGaD,CuGuDtCtD,CuGuDtCpD,CuGuDrDrD,CuGuDrD,CuGuDpSpD,...,CbG<rGcGuD,CbG<rGcGdD,CbG<rGcGaD,CbG<rGbCtD,CbG<rGbCpD,CbG<rGaDrD,CbG<rGaD,CbG<rG<rGuD,CbG<rG<rGdD,CbG<rG<rGaD
0,DB00014,DOID:0050156,0.001791,0.000662,0.000216,0.0,0.002885,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,DB00014,DOID:0050425,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,DB00014,DOID:0050741,0.00116,0.000211,0.0,0.0,0.0,0.0,0.0,0.000413,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,DB00014,DOID:0050742,0.001002,0.000816,0.0,0.0,0.00805,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,DB00014,DOID:0060073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
dwpc_matrices.shape

(212624, 1172)

In [19]:
%%time
dwpc_matrices.to_csv(path_or_buf='data/dwpc_data.tsv', sep='\t', float_format='%.6g', index=False)

CPU times: user 3min 47s, sys: 2.52 s, total: 3min 49s
Wall time: 3min 50s
