In [1]:
import json, pandas as pd, copy, functools, itertools, collections, operator, time, urllib.request
import concurrent.futures
import numpy as np
from scipy import sparse
import hetio.readwrite
from hetmech.degree_weight import *

## Load the graph

In [2]:
%%time
url = 'data/hetionet-v1.0.json'
graph = hetio.readwrite.read_graph(url)
metagraph = graph.metagraph

CPU times: user 58.7 s, sys: 1.25 s, total: 59.9 s
Wall time: 59.9 s


## Load metapaths

In [3]:
metapaths_url = 'https://raw.githubusercontent.com/dhimmel/learn/{}/all-features/data/metapaths.json'.format(
    'ef5f7a6b76b6a01499d65b95e3d7ca93ac5aba57')

with urllib.request.urlopen(metapaths_url) as data_file:
    metapaths = json.loads(data_file.read().decode())

metapaths.sort(key=lambda x: x['join_complexities'][0])

len(metapaths)

1206

# Extract the actual metapaths from the list of metapath dictionaries

In [4]:
abbrevs = [metapath['abbreviation'] for metapath in metapaths]

## Categorize the metapaths

In [5]:
types = []
for metapath in abbrevs:
    m_path = metagraph.metapath_from_abbrev(metapath)
    cat = categorize(m_path)
    types.append([metapath, cat])
types = pd.DataFrame(types, columns=("Metapath", "Category"))

In [6]:
types.head()

Unnamed: 0,Metapath,Category
0,CpDpCpD,BABA
1,CpDpCtD,BABA
2,CpDtCpD,BABA
3,CtDpCpD,BABA
4,CiPCiCpD,short_repeat


In [7]:
len(types[types.Category == 'BABA'])

278

In [8]:
frequency = []
for path_type in set(list(types.Category)):
    frequency.append([path_type, len(types[types.Category == path_type])])
frequency = pd.DataFrame(frequency, columns=('PathType', 'Number'))

In [9]:
frequency

Unnamed: 0,PathType,Number
0,other,32
1,long_repeat,4
2,disjoint,131
3,short_repeat,599
4,BAAB,144
5,no_repeats,18
6,BABA,278


## ALL but OTHER and Long_repeat

In [10]:
all_strings = types[(types['Category'] != 'other')]
all_strings = all_strings[all_strings['Category'] != 'long_repeat']
all_strings = list(all_strings.Metapath)

alls = [metagraph.metapath_from_abbrev(s) for s in all_strings]

In [11]:
%%time
all_arrays = {}
all_times = []
n=0
for i, m_path in enumerate(alls):
    row, col, mat, timed = dwpc(graph, m_path, damping=0.4, sparse_threshold=1)
    st = all_strings[i]
    print(f'{n}; metapath: {st}; time: {timed:.3}')
    all_arrays[st] = mat
    all_times.append([st, timed])
    n += 1

0; metapath: CpDpCpD; time: 0.228
1; metapath: CpDpCtD; time: 0.22
2; metapath: CpDtCpD; time: 0.218
3; metapath: CtDpCpD; time: 0.22
4; metapath: CiPCiCpD; time: 0.235
5; metapath: CpDtCtD; time: 0.215
6; metapath: CtDpCtD; time: 0.222
7; metapath: CtDtCpD; time: 0.214
8; metapath: CiPCiCtD; time: 0.242
9; metapath: CbGbCpD; time: 0.893
10; metapath: CtDtCtD; time: 0.221
11; metapath: CbGuCpD; time: 0.893
12; metapath: CuGbCpD; time: 0.891
13; metapath: CbGdCpD; time: 0.893
14; metapath: CdGbCpD; time: 1.07
15; metapath: CbGdDpCpD; time: 1.13
16; metapath: CpDdGbCpD; time: 0.975
17; metapath: CpDpCbGdD; time: 0.902
18; metapath: CbGuDpCpD; time: 0.903
19; metapath: CpDpCbGuD; time: 0.977
20; metapath: CpDuGbCpD; time: 0.896
21; metapath: CpDrD; time: 0.127
22; metapath: CbGbCtD; time: 0.883
23; metapath: CrCpD; time: 0.193
24; metapath: CbGdD; time: 0.783
25; metapath: CuGuCpD; time: 1.01
26; metapath: CbGuD; time: 0.754
27; metapath: CdGuCpD; time: 0.901
28; metapath: CuGdCpD; time: 

225; metapath: CbGbCdGdD; time: 4.7
226; metapath: CbGdCbGdD; time: 4.61
227; metapath: CdGbCbGdD; time: 4.64
228; metapath: CbGbCdGuD; time: 4.72
229; metapath: CbGdCbGuD; time: 4.64
230; metapath: CdGbCbGuD; time: 4.62
231; metapath: CbG<rGuCpD; time: 7.49
232; metapath: CbGr>GuCpD; time: 7.52
233; metapath: CuG<rGbCpD; time: 7.55
234; metapath: CuGr>GbCpD; time: 7.52
235; metapath: CbGdDrD; time: 0.782
236; metapath: CbGcGdCtD; time: 7.16
237; metapath: CdGcGbCtD; time: 7.3
238; metapath: CuGuCpDrD; time: 0.954
239; metapath: CbGuDrD; time: 0.825
240; metapath: CdGaDtCtD; time: 1.01
241; metapath: CtDaGdCtD; time: 1.11
242; metapath: CtDtCdGaD; time: 0.972
243; metapath: CrCbGdD; time: 0.864
244; metapath: CrCuGuCpD; time: 1.01
245; metapath: CuGuCrCpD; time: 1.02
246; metapath: CrCbGuD; time: 0.833
247; metapath: CbG<rGdCpD; time: 7.52
248; metapath: CbGr>GdCpD; time: 7.55
249; metapath: CdG<rGbCpD; time: 7.71
250; metapath: CdGr>GbCpD; time: 7.63
251; metapath: CbGiGuCpD; time: 7.

447; metapath: CdG<rGuD; time: 4.78
448; metapath: CdGr>GuD; time: 4.73
449; metapath: CuGiGuD; time: 4.82
450; metapath: CbGiGaD; time: 4.9
451; metapath: CpDdG<rGdD; time: 7.42
452; metapath: CpDdGr>GdD; time: 7.42
453; metapath: CdGaDrD; time: 0.831
454; metapath: CpDdG<rGuD; time: 7.47
455; metapath: CpDdGr>GuD; time: 7.44
456; metapath: CpDuG<rGdD; time: 7.45
457; metapath: CpDuGr>GdD; time: 7.48
458; metapath: CpDuG<rGuD; time: 7.45
459; metapath: CpDuGr>GuD; time: 7.45
460; metapath: CuG<rGuCtD; time: 7.58
461; metapath: CuGr>GuCtD; time: 7.54
462; metapath: CpDaGdDrD; time: 0.839
463; metapath: CpDdGaDrD; time: 0.835
464; metapath: CpDrDaGdD; time: 0.833
465; metapath: CpDrDdGaD; time: 0.833
466; metapath: CpDaGuDrD; time: 0.843
467; metapath: CpDrDaGuD; time: 0.834
468; metapath: CpDrDuGaD; time: 0.836
469; metapath: CpDuGaDrD; time: 0.866
470; metapath: CdGdCdGdD; time: 4.8
471; metapath: CrCdGaD; time: 0.893
472; metapath: CdGdCdGuD; time: 4.78
473; metapath: CrCpDaGdD; time

667; metapath: CrCuGcGdD; time: 4.5
668; metapath: CrCuGcGuD; time: 4.51
669; metapath: CrCbGcGaD; time: 4.48
670; metapath: CrCtDaGaD; time: 0.921
671; metapath: CbGcGiGdD; time: 11.2
672; metapath: CbGiGcGdD; time: 11.2
673; metapath: CdGcGdDrD; time: 4.44
674; metapath: CbGcGiGuD; time: 11.2
675; metapath: CbGiGcGuD; time: 11.2
676; metapath: CtDaGiGdD; time: 7.59
677; metapath: CtDdGiGaD; time: 7.56
678; metapath: CdGcGuDrD; time: 4.43
679; metapath: CtDaGiGuD; time: 7.58
680; metapath: CtDuGiGaD; time: 7.55
681; metapath: CrCdGcGdD; time: 4.49
682; metapath: CuGaDaGdD; time: 4.04
683; metapath: CuGaDdGaD; time: 4.17
684; metapath: CuGdDaGaD; time: 4.18
685; metapath: CrCdGcGuD; time: 4.51
686; metapath: CuGaDaGuD; time: 4.14
687; metapath: CuGaDuGaD; time: 4.16
688; metapath: CuGuDaGaD; time: 4.09
689; metapath: CbGaDaGaD; time: 4.25
690; metapath: CuGcGcGaD; time: 10.9
691; metapath: CrCtDrDrD; time: 0.312
692; metapath: CrCrCtDrD; time: 0.389
693; metapath: CbG<rGdDrD; time: 4.8

885; metapath: CdGpMFpGdD; time: 5.21
886; metapath: CdGpMFpGuD; time: 5.24
887; metapath: CdG<rG<rGdD; time: 12.1
888; metapath: CdG<rGr>GdD; time: 12.2
889; metapath: CdGr>G<rGdD; time: 12.4
890; metapath: CdGr>Gr>GdD; time: 11.9
891; metapath: CuG<rGiGdD; time: 12.2
892; metapath: CuGiG<rGdD; time: 11.8
893; metapath: CuGiGr>GdD; time: 11.8
894; metapath: CuGr>GiGdD; time: 11.9
895; metapath: CdG<rG<rGuD; time: 12.1
896; metapath: CdG<rGr>GuD; time: 12.2
897; metapath: CdGr>G<rGuD; time: 12.5
898; metapath: CdGr>Gr>GuD; time: 11.8
899; metapath: CuG<rGiGuD; time: 12.1
900; metapath: CuGiG<rGuD; time: 11.8
901; metapath: CuGiGr>GuD; time: 11.8
902; metapath: CuGr>GiGuD; time: 11.9
903; metapath: CbG<rGiGaD; time: 11.8
904; metapath: CbGiG<rGaD; time: 11.8
905; metapath: CbGiGr>GaD; time: 11.8
906; metapath: CbGr>GiGaD; time: 11.7
907; metapath: CuGpCCpGdD; time: 5.03
908; metapath: CuGpPWpGdD; time: 5.02
909; metapath: CdG<rGaDrD; time: 4.88
910; metapath: CdGr>GaDrD; time: 4.83
911;

1101; metapath: CtDlAeGaD; time: 2.33
1102; metapath: CdGdAdGaD; time: 10.3
1103; metapath: CcSEcCdGdD; time: 1.78
1104; metapath: CdGpBPpGaD; time: 9.49
1105; metapath: CcSEcCdGuD; time: 1.79
1106; metapath: CbGeAlDrD; time: 2.06
1107; metapath: CrCbGeAlD; time: 2.1
1108; metapath: CcSEcCuGaD; time: 1.81
1109; metapath: CuGcGeAlD; time: 5.7
1110; metapath: CbGeAuGdD; time: 12.6
1111; metapath: CbGuAeGdD; time: 15.3
1112; metapath: CbGeAuGuD; time: 12.6
1113; metapath: CbGuAeGuD; time: 15.1
1114; metapath: CbGdAeGdD; time: 15.0
1115; metapath: CbGeAdGdD; time: 12.4
1116; metapath: CbGdAeGuD; time: 15.1
1117; metapath: CbGeAdGuD; time: 12.4
1118; metapath: CcSEcCdGaD; time: 1.82
1119; metapath: CdGcGeAlD; time: 5.69
1120; metapath: CbG<rGeAlD; time: 6.05
1121; metapath: CbGr>GeAlD; time: 6.07
1122; metapath: CuGeAlDrD; time: 2.08
1123; metapath: CrCuGeAlD; time: 2.13
1124; metapath: CbGiGeAlD; time: 6.14
1125; metapath: CdGeAlDrD; time: 2.06
1126; metapath: CrCdGeAlD; time: 2.12
1127; m

In [12]:
np.save('data/all_times', all_times)
np.save('data/all_arrays', all_arrays)

#### Save DWPC times as a .tsv file.

In [13]:
times = np.load('data/all_times.npy')

times_df = pd.DataFrame(times, columns=('Metapath', 'Time'))

times_df.to_csv(path_or_buf='data/all_times.tsv', sep='\t', float_format='%.6g', index=False)

#### Save DWPC matrices as outputs in Compound-Disease pairs

In [14]:
arrs = np.load('data/all_arrays.npy')
arrs = arrs.tolist()

In [16]:
row, col, mat = metaedge_to_adjacency_matrix(graph, 'CpD')

In [17]:
comp_disease_pairs = numpy.array([[comp, disease] for comp in row for disease in col])

In [18]:
mat_dict = {'compound': comp_disease_pairs[:,0], 'disease': comp_disease_pairs[:,1]}

In [19]:
mat_dict = dict()
for meta, mat in arrs.items():
    if type(mat) == numpy.matrix:
        mat = mat.A
    elif sparse.issparse(mat):
        mat = mat.toarray()
    assert type(mat) == numpy.ndarray
    mat = mat.flatten(order='C').tolist()
    mat_dict[meta] = mat
mat_dict['disease'] = comp_disease_pairs[:,1]
mat_dict['compound'] = comp_disease_pairs[:,0]

In [20]:
dwpc_matrices = pd.DataFrame.from_dict(mat_dict)

In [21]:
colnames = list(reversed(list(dwpc_matrices)))
colnames = [colnames.pop(1)] + colnames

In [22]:
dwpc_matrices = dwpc_matrices.ix[:, colnames]
dwpc_matrices.head()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  if __name__ == '__main__':


Unnamed: 0,compound,disease,CuGuDuGuD,CuGuDuGdD,CuGuDuGaD,CuGuDtCtD,CuGuDtCpD,CuGuDrDrD,CuGuDrD,CuGuDpSpD,...,CbG<rGcGuD,CbG<rGcGdD,CbG<rGcGaD,CbG<rGbCtD,CbG<rGbCpD,CbG<rGaDrD,CbG<rGaD,CbG<rG<rGuD,CbG<rG<rGdD,CbG<rG<rGaD
0,DB00014,DOID:0050156,0.001791,0.000662,0.000216,0.0,0.002885,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,DB00014,DOID:0050425,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,DB00014,DOID:0050741,0.00116,0.000211,0.0,0.0,0.0,0.0,0.0,0.000413,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,DB00014,DOID:0050742,0.001002,0.000816,0.0,0.0,0.00805,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,DB00014,DOID:0060073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
%%time
dwpc_matrices.to_csv(path_or_buf='data/dwpc_data.tsv', sep='\t', float_format='%.6g', index=False)

CPU times: user 3min 58s, sys: 2.77 s, total: 4min 1s
Wall time: 4min 2s
