## Perform all necessary imports

In [6]:
from hetmech.matrix import *
from hetmech.degree_weight import *

In [7]:
import json
import urllib.request
import numpy as np
import hetio.readwrite
import hetio.hetnet
import pandas as pd
import time

## Load the graph

In [8]:
%%time
# Hetionet v1.0 JSON export. The URL embeds a 40-character commit hash,
# presumably to pin the exact version of the file that is downloaded.
url = 'https://github.com/dhimmel/hetionet/raw/76550e6c93fbe92124edc71725e8c7dd4ca8b1f5/hetnet/json/hetionet-v1.0.json.bz2'
# Reading the graph took roughly one minute in the recorded run of this cell.
graph = hetio.readwrite.read_graph(url)
# Keep a direct handle on the metagraph; later cells use it to resolve
# metapath abbreviations via `metagraph.metapath_from_abbrev`.
metagraph = graph.metagraph

CPU times: user 1min 7s, sys: 1.32 s, total: 1min 9s
Wall time: 1min 12s


## Load metapaths

In [9]:
# Download the list of metapath dictionaries (JSON) from the `learn` repository.
metapaths_url = 'https://github.com/dhimmel/learn/raw/25093893ed53730ab5cfdac49561c4b6bd3376c5/all-features/data/metapaths.json'
with urllib.request.urlopen(metapaths_url) as data_file:
    metapaths_json = data_file.read().decode()

# Order the entries by the first element of their 'join_complexities' list.
# `sorted` is stable, so ties keep the order they had in the downloaded file.
metapaths = sorted(
    json.loads(metapaths_json),
    key=lambda entry: entry['join_complexities'][0],
)

len(metapaths)

1206

In [10]:
# Collect the abbreviation string of every metapath dictionary, keeping the
# order of `metapaths`.
abbrevs = []
for entry in metapaths:
    abbrevs.append(entry['abbreviation'])

## Get a list of the metapaths that are incompatible with DWPC

In [11]:
# A metapath is recorded as incompatible when resolving its abbreviation,
# calling `get_segments`, or unpacking the result raises ValueError.
incompatible_metapaths = []

for metapath in abbrevs:
    try:
        # The two-name unpacking is kept on purpose: a failed unpack would also
        # raise ValueError, so dropping it could change the classification.
        # NOTE(review): confirm what `get_segments` returns.
        segments, duplicates = get_segments(metagraph, metagraph.metapath_from_abbrev(metapath))
    except ValueError:
        incompatible_metapaths.append(metapath)
'{} / {} were incompatible'.format(len(incompatible_metapaths), len(abbrevs))

'454 / 1206 were incompatible'

In [12]:
# Keep the metapaths that were not flagged as incompatible, preserving the
# order of `abbrevs`. Membership is tested against a set so the filter stays
# linear instead of rescanning the incompatible list for every abbreviation.
incompatible_lookup = set(incompatible_metapaths)
compatible_metapaths = [abbrev for abbrev in abbrevs if abbrev not in incompatible_lookup]

# Display: first compatible metapath, number compatible, number incompatible.
compatible_metapaths[0], len(compatible_metapaths), len(abbrevs) - len(compatible_metapaths)

('CiPCiCpD', 752, 454)

## Ensure that all DWPC matrices are indexed identically
This is necessary so that each DWPC matrix can be flattened into a vector by position alone, instead of looking up every value by its row and column labels.

In [8]:
# Compute one DWPC matrix and remember its row and column labels as the
# reference against which every other metapath's labels are compared.
reference_metapath = metagraph.metapath_from_abbrev('CiPCiCpD')
rows, columns, mat = dwpc(graph, reference_metapath, sparse_threshold=0.25)
exp_rows, exp_cols = rows, columns

In [25]:
%%time

# Print the abbreviation of every metapath whose DWPC row or column labels
# differ from the reference labels; no output means all of them match.
for abbrev in compatible_metapaths:
    candidate = metagraph.metapath_from_abbrev(abbrev)
    rows, columns, mat = dwpc(graph, candidate, sparse_threshold=0.25)
    rows_differ = rows != exp_rows
    if rows_differ or columns != exp_cols:
        print(abbrev)

CPU times: user 41min 42s, sys: 1min 12s, total: 42min 54s
Wall time: 42min 54s


The cell above printed no metapaths, which shows that every DWPC matrix has the same row and column labels as the reference, so the row/column combinations are identical and in the same order for all of them. Each matrix can therefore be flattened into a vector whose entries line up with the combinations of compound and disease names.

## Run through all DWPC metapaths -> table of compound/disease vs metapath_dwpc

In [24]:
%%time

# Reference labels: `dwpc` returns (row labels, column labels, matrix), so
# compounds label the matrix rows and diseases label the columns.
compounds, diseases, mat = dwpc(graph, metagraph.metapath_from_abbrev(compatible_metapaths[0]))

# One table row per (compound, disease) pair, with the compound varying slowest.
# This is meant to match the order of `mat.flatten()` below
# (assumes row-major flattening, NumPy's default -- TODO confirm for the
# matrix type returned by `dwpc`).
pairs = [(compound, disease) for compound in compounds for disease in diseases]
df = pd.DataFrame(pairs, columns=('compounds', 'diseases'))

# This will take many hours, probably > 7.
# Metapaths are processed in reverse order; each one adds a column of
# flattened DWPC values named after its abbreviation.
for n, metapath in enumerate(reversed(compatible_metapaths)):
    start = time.time()
    # The labels returned here are deliberately not stored in `compounds` /
    # `diseases`, so the reference labels used to build the table stay intact.
    row_labels, col_labels, mat = dwpc(graph, metagraph.metapath_from_abbrev(metapath), sparse_threshold=0)
    df[metapath] = mat.flatten()
    tim = time.time() - start
    print("{:d} -- {:.2f} sec".format(n, tim))

0 -- 17.08 sec
1 -- 17.77 sec
2 -- 16.77 sec
3 -- 16.82 sec
4 -- 16.88 sec
5 -- 17.39 sec
6 -- 16.95 sec
7 -- 16.82 sec
8 -- 16.73 sec
9 -- 15.63 sec
10 -- 18.69 sec
11 -- 19.13 sec
12 -- 16.38 sec
13 -- 15.57 sec
14 -- 15.44 sec
15 -- 15.76 sec
16 -- 16.59 sec
17 -- 13.93 sec
18 -- 13.89 sec
19 -- 13.97 sec
20 -- 13.79 sec
21 -- 13.91 sec
22 -- 13.73 sec
23 -- 15.38 sec
24 -- 15.71 sec
25 -- 15.26 sec
26 -- 15.56 sec
27 -- 15.52 sec
28 -- 16.73 sec
29 -- 15.72 sec
30 -- 15.24 sec
31 -- 15.28 sec
32 -- 16.02 sec
33 -- 15.20 sec
34 -- 15.49 sec
35 -- 15.18 sec
36 -- 15.45 sec
37 -- 15.61 sec
38 -- 15.25 sec
39 -- 15.63 sec
40 -- 16.31 sec
41 -- 15.75 sec
42 -- 15.33 sec
43 -- 2.17 sec
44 -- 2.11 sec
45 -- 14.01 sec
46 -- 2.16 sec
47 -- 2.10 sec
48 -- 13.86 sec
49 -- 13.74 sec
50 -- 13.45 sec
51 -- 1.93 sec
52 -- 15.31 sec
53 -- 15.77 sec
54 -- 15.42 sec
55 -- 16.02 sec
56 -- 17.09 sec
57 -- 15.65 sec
58 -- 15.63 sec
59 -- 15.64 sec
60 -- 13.60 sec
61 -- 1.92 sec
62 -- 2.17 sec
63 -- 2.0

489 -- 123.43 sec
490 -- 122.60 sec
491 -- 12.48 sec
492 -- 122.76 sec
493 -- 125.14 sec
494 -- 123.06 sec
495 -- 124.32 sec
496 -- 1.13 sec
497 -- 1.14 sec
498 -- 1.01 sec
499 -- 1.03 sec
500 -- 127.26 sec
501 -- 0.96 sec
502 -- 125.47 sec
503 -- 0.94 sec
504 -- 127.00 sec
505 -- 125.37 sec
506 -- 126.08 sec
507 -- 0.51 sec
508 -- 0.40 sec
509 -- 0.85 sec
510 -- 0.85 sec
511 -- 0.84 sec
512 -- 0.84 sec
513 -- 13.29 sec
514 -- 13.04 sec
515 -- 0.28 sec
516 -- 0.78 sec
517 -- 0.79 sec
518 -- 0.78 sec
519 -- 0.79 sec
520 -- 0.78 sec
521 -- 0.78 sec
522 -- 0.78 sec
523 -- 0.78 sec
524 -- 12.57 sec
525 -- 12.49 sec
526 -- 0.23 sec
527 -- 12.97 sec
528 -- 0.85 sec
529 -- 0.18 sec
530 -- 0.80 sec
531 -- 0.79 sec
532 -- 13.55 sec
533 -- 13.00 sec
534 -- 12.68 sec
535 -- 126.15 sec
536 -- 127.12 sec
537 -- 13.02 sec
538 -- 12.79 sec
539 -- 0.18 sec
540 -- 0.82 sec
541 -- 0.83 sec
542 -- 0.83 sec
543 -- 0.83 sec
544 -- 0.78 sec
545 -- 0.78 sec
546 -- 0.78 sec
547 -- 0.78 sec
548 -- 0.78 sec
549

In [25]:
# Preview the first rows of the table; as the last expression of the cell,
# the result is rendered by the notebook.
df.head()

Unnamed: 0,compounds,diseases,CdGeAeGaD,CuGeAeGaD,CdGeAeGuD,CdGeAeGdD,CbGeAeGaD,CuGeAeGuD,CuGeAeGdD,CbGeAeGuD,...,CrCpD,CbGbCtD,CpDrD,CdGbCpD,CbGdCpD,CuGbCpD,CbGuCpD,CbGbCpD,CiPCiCtD,CiPCiCpD
0,DB00014,DOID:0050156,0.0,1.7e-05,0.0,0.0,7.5e-05,9.4e-05,0.000104,0.000421,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,DB00014,DOID:0050425,0.0,1.5e-05,0.0,0.0,8.6e-05,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,DB00014,DOID:0050741,0.0,2.1e-05,0.0,0.0,0.000131,9e-05,8.1e-05,0.000476,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,DB00014,DOID:0050742,0.0,1.2e-05,0.0,0.0,6.1e-05,0.000102,0.0001,0.000369,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,DB00014,DOID:0060073,0.0,1.4e-05,0.0,0.0,5.3e-05,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# Write the table as tab-separated values. The `with` block closes the file
# handle, so the data is flushed to disk when the cell finishes.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# configurable.
with open('/home/zietz/Documents/hetmech/data/dwpc_metapaths.tsv', 'w') as tsv_file:
    df.to_csv(tsv_file, sep='\t')