This notebook outlines a short workflow for using DeepWalk (a random walk-based representation learning algorithm) for generating node similarities based on a given network from NDEx, then using the CLiXO algorithm to generate an ontology. 

In [1]:
import itertools as itt
import json
import sys
import time

import deepwalk.graph
import ndex
import numpy as np
import pandas as pd

from ddot import Ontology
from gensim.models import Word2Vec
from IPython.core.display import HTML
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm_notebook as tqdm

In [2]:
print(time.asctime())

Thu Nov 15 16:40:55 2018


In [3]:
print(sys.version)

3.7.0 (default, Jul 23 2018, 20:22:55) 
[Clang 9.1.0 (clang-902.0.39.2)]


## Get Network from NDEx

In [4]:
ndex_client = ndex.client.Ndex()
ndex_client

<ndex.client.Ndex at 0x118fd6a58>

This example uses PCNet. See: http://www.ndexbio.org/#/network/f93f402c-86d4-11e7-a10d-0ac135e8bacf

In [5]:
network_id = 'f93f402c-86d4-11e7-a10d-0ac135e8bacf'

In [6]:
HTML(ndex_client.get_network_summary(network_id)['description'])

In [7]:
%%time
res = ndex_client.get_network_as_cx_stream(network_id)
res_json = res.json()

CPU times: user 4.45 s, sys: 1.68 s, total: 6.13 s
Wall time: 47.6 s


In [8]:
with open(f'{network_id}.cx.json', 'w') as file:
    json.dump(res_json, file, indent=2)

## Representation Learning of Verticies

### Convert to DeepWalk Graph

In [9]:
G = deepwalk.graph.Graph()
node_data = {}
node_labels = {}
nodes_added = set()


for entry in tqdm(res_json, desc='entry', leave=True):
    for aspect, data in tqdm(entry.items(), desc='aspect', leave=False):       
        if aspect == 'nodes':
            for node in tqdm(data, desc='nodes', leave=False):
                node_data[str(node['@id'])] = node
                node_labels[str(node['@id'])] = node['n']

        if aspect == 'edges':
            for edge in tqdm(data, desc='edges', leave=False):
                nodes_added.add(edge['s'])
                nodes_added.add(edge['t'])
                G[str(edge['s'])].append(str(edge['t']))

for node in G:
    G[node] = list(sorted(set(G[node])))

HBox(children=(IntProgress(value=0, description='entry', max=10, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='aspect', max=1, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='aspect', max=1, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='aspect', max=1, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='aspect', max=1, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='aspect', max=1, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='aspect', max=1, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='aspect', max=1, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='aspect', max=1, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='edges', max=2724724, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='aspect', max=1, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='nodes', max=19781, style=ProgressStyle(description_width='ini…

HBox(children=(IntProgress(value=0, description='aspect', max=1, style=ProgressStyle(description_width='initia…




In [10]:
targets = set(itt.chain.from_iterable(G.values()))

print(f'G has {len(G)} source nodes, '
      f'{len(targets)} target nodes, '
      f'{len(set(G) | targets)} total nodes, and '
      f'{sum(len(v) for v in G.values())} edges')

G has 19080 source nodes, 19116 target nodes, 19781 total nodes, and 2724724 edges


In [11]:
len(G.nodes())

19781

In [12]:
node_data['0']

{'@id': 0, 'n': 'UBE2Q1', 'r': 'hgnc.symbol:UBE2Q1'}

### Run DeepWalk

Here are the constants used:

In [13]:
NUM_PATHS = 20
PATH_LENGTH = 80
SIZE = 128
WINDOW = 4

A "corpus" of random walks are generated. This terminology is used because they are ultimately fed into the word2vec algorithm, which was originally used for generating embedding vectors for words in a text mining scenario.

In [14]:
%%time

walks = deepwalk.graph.build_deepwalk_corpus(
    G, 
    num_paths=NUM_PATHS, 
    path_length=PATH_LENGTH,
)

CPU times: user 12.1 s, sys: 257 ms, total: 12.4 s
Wall time: 13 s


Check that all of the nodes appear in the walks.

In [15]:
assert not (set(node_labels) - set(itt.chain.from_iterable(walks)))

### Use walks with Word2Vec

After they've been generated, they're available directly as a numpy matrix with `word2vec.wv.vectors`.

In [16]:
%%time

word2vec = Word2Vec(
    sentences=[list(map(str, walk)) for walk in walks],
    size=SIZE, 
    window=WINDOW, 
    min_count=0, 
    sg=1,
    workers=4,
)

CPU times: user 1min 34s, sys: 835 ms, total: 1min 35s
Wall time: 27.2 s


In [17]:
word2vec.wv.save_word2vec_format(f'{network_id}.word2vec.txt')

### Generate Node Similarity Matrix

Use the cosine similarity between vectors in this space. To get distance, do 1 - cosine similarity.

In [24]:
%%time

sim = 1 - cosine_similarity(word2vec.wv.vectors)

CPU times: user 4.54 s, sys: 2.05 s, total: 6.59 s
Wall time: 5.84 s


In [25]:
labels = [node_labels[n] for n in word2vec.wv.index2word]
sim_df = pd.DataFrame(sim, index=labels, columns=labels)

In [29]:
sim_df.head()

Unnamed: 0,SERPINH1,LDHB,CFL1,LDHC,LDHA,GNGT1,SELP,PRPS1,NFIX,PTH1R,...,SEPP1,PAQR9,SGSM1,OR7A17,MAGEL2,SLC25A51,UBE2Q2,MCCD1,TTC27,MRPL41
SERPINH1,-1.192093e-07,0.3370851,0.364377,0.2504417,0.3458785,0.846963,0.590955,0.399605,0.580317,0.683794,...,0.738503,0.764865,0.664739,0.850983,0.61341,0.67029,0.590677,0.760782,0.676372,0.630641
LDHB,0.3370851,-1.192093e-07,0.267796,0.3300763,0.2105537,0.812056,0.800143,0.338855,0.613683,0.757142,...,0.786891,0.760254,0.626906,0.875428,0.60549,0.594739,0.568401,0.68874,0.60241,0.572164
CFL1,0.3643774,0.2677963,0.0,0.1550904,0.259937,0.797273,0.779284,0.336873,0.667381,0.772707,...,0.7672,0.750455,0.59366,0.869306,0.58092,0.625776,0.566126,0.74969,0.60795,0.60051
LDHC,0.2504417,0.3300763,0.15509,-2.384186e-07,0.3165914,0.775677,0.682521,0.423188,0.717771,0.716506,...,0.767041,0.755215,0.586659,0.784148,0.621067,0.605261,0.599655,0.649744,0.626946,0.59415
LDHA,0.3458785,0.2105537,0.259937,0.3165914,3.576279e-07,0.759368,0.779663,0.327824,0.585723,0.687397,...,0.764088,0.752084,0.654709,0.856935,0.601715,0.594422,0.616446,0.6873,0.598617,0.579241


## Run CLiXO

First, slice the similarity matrix by the target genes.

In [None]:
target_genes = [
    'SERPINH1', 
    'LDHB', 
    'LDHB',
]

In [32]:
# Only keep rows and columns for target genes
truncated_sim_df = sim_df.loc[target_genes,target_genes]

Unnamed: 0,SERPINH1,LDHB,LDHB.1
SERPINH1,-1.192093e-07,0.3370851,0.3370851
LDHB,0.3370851,-1.192093e-07,-1.192093e-07
LDHB,0.3370851,-1.192093e-07,-1.192093e-07


In [28]:
ont2 = Ontology.run_clixo(
    df=truncated_sim_df, 
    df_output_path=f'{network_id}_df_temp.txt', 
    clixo_output_path=f'{network_id}_clixo_temp.txt', 
    output_log_path=f'{network_id}_log.txt',
    alpha=0.0, 
    beta=1.0, 
    square=True, 
    square_names=labels,
)

KeyboardInterrupt: 