In [1]:
import sys
import time

import deepwalk.graph
import ndex
import numpy as np
import pandas as od

from ddot import Ontology
from gensim.models import Word2Vec
from IPython.core.display import HTML
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm_notebook as tqdm

In [2]:
print(time.asctime())

Wed Oct 31 12:43:56 2018


In [3]:
print(sys.version)

3.7.0 (default, Jul 23 2018, 20:22:55) 
[Clang 9.1.0 (clang-902.0.39.2)]


## Get Network from NDEx

In [4]:
ndex_client = ndex.client.Ndex()
ndex_client

<ndex.client.Ndex at 0x11d6533c8>

This example uses PCNet. See: http://www.ndexbio.org/#/network/f93f402c-86d4-11e7-a10d-0ac135e8bacf

In [5]:
network_id = 'f93f402c-86d4-11e7-a10d-0ac135e8bacf'

In [6]:
HTML(ndex_client.get_network_summary(network_id)['description'])

In [7]:
%%time

res = ndex_client.get_network_as_cx_stream(network_id)
res_json = res.json()

CPU times: user 4.55 s, sys: 1.5 s, total: 6.05 s
Wall time: 28.5 s


## Convert to DeepWalk Graph

In [8]:
G = deepwalk.graph.Graph()
node_data = {}
node_labels = {}

for entry in tqdm(res_json, desc='entry', leave=True):
    for aspect, data in tqdm(entry.items(), desc='aspect', leave=False):       
        if aspect == 'nodes':
            for node in tqdm(data, desc='nodes', leave=False):
                node_data[node['@id']] = node
                node_labels[node['@id']] = node['n']

        if aspect == 'edges':
            for edge in tqdm(data, desc='edges', leave=False):
                G[edge['s']].append(edge['t'])
                
for node in G:
    G[node] = list(sorted(set(G[node])))

HBox(children=(IntProgress(value=0, description='entry', max=10, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='aspect', max=1, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='aspect', max=1, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='aspect', max=1, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='aspect', max=1, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='aspect', max=1, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='aspect', max=1, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='aspect', max=1, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='aspect', max=1, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='edges', max=2724724, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='aspect', max=1, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='nodes', max=19781, style=ProgressStyle(description_width='ini…

HBox(children=(IntProgress(value=0, description='aspect', max=1, style=ProgressStyle(description_width='initia…




In [9]:
for node in list(node_labels):
    if node not in graph:
        del node_labels[node]

NameError: name 'graph' is not defined

In [None]:
assert all(node in node_labels for node in G)
assert all(node in G for node in node_labels)

In [None]:
labels = [node_labels[i] for i in range(len(node_labels))]
labels[0]

## Run DeepWalk

Here are the constants used:

In [None]:
NUM_PATHS = 10
PATH_LENGTH = 80
SIZE = 128
WINDOW = 4

### Generate Walks

In [None]:
%%time

walks = deepwalk.graph.build_deepwalk_corpus(
    G, 
    num_paths=NUM_PATHS, 
    path_length=PATH_LENGTH,
)

### Use walks with Word2Vec

After they've been generated, they're available directly as a numpy matrix with `word2vec.wv.vectors`.

In [None]:
%%time

word2vec = Word2Vec(
    sentences=[list(map(str, walk)) for walk in walks],
    size=SIZE, 
    window=WINDOW, 
    min_count=0, 
    sg=1,
    workers=4,
)

In [None]:
word2vec.wv.save_word2vec_format(f'{network_id}.word2vec.txt')

## Generate Node Similarity Matrix

Use the cosine similarity between vectors in this space. To get distance, do 1 - cosine similarity.

In [None]:
len(word2vec.wv.vectors)

In [None]:
len(labels)

In [None]:
%%time

sim = cosine_similarity(word2vec.wv.vectors)

In [None]:
sim_df = pd.DataFrame(sim, index=labels, columns=labels)

## Run CLiXO

In [None]:
ont2 = Ontology.run_clixo(
    sim_df, 
    f'{network_id}_df_temp.txt', 
    f'{network_id}_clixo_temp.txt', 
    alpha=0.0, 
    beta=1.0, 
    square=True, 
    square_names=labels,
)