## Pathway Similarity Maps
This notebook implements a protocol for creating networks that are "similarity maps" of the networks in a set or the subsystem networks in a hierarchical model. 

When a set is the input to the protocol, the networks are assumed to be interaction networks in which node names are gene symbols and/or where nodes have an attribute that is a list of gene symbols.

When a hierarchical model is the input, each node must have an attribute that is a list of gene symbols

In each case, the input is processed to create a list of gene lists. A similarity score is calculated for each pair of gene lists, creating a similarity matrix. A network is created from the matrix, filtering to include only edges above a specified threshold score.

The network is then given a layout and a graphic style and saved to the user's account in NDEx.

## Setup: imports, NDEx user credentials, utilities.

In [220]:
from math import*
import numpy as np
import pandas as pd
import ndex2
import getpass
import requests
import io
import igraph
import networkx as nx
import math

from IPython.display import display, HTML

In [201]:
# Watch out for typos and be sure to hit enter in the field to set the value!!!
NDEXUSER = getpass.getpass()

········


In [202]:
# Watch out for typos and be sure to hit enter in the field to set the value!!!
NDEXPASSWORD = getpass.getpass()

········


In [203]:
SERVER="http://www.ndexbio.org"
#This is used to create clickable links to view the networks in NDEx
NETWORK_BASE_URL = 'http://ndexbio.org/viewer/networks/'
print(NDEXUSER)
print(NDEXPASSWORD)
ndx=ndex2.Ndex2(SERVER, NDEXUSER, NDEXPASSWORD)
ndx

dexterpratt
cytoscaperules


<ndex2.client.Ndex2 at 0x1fba241f880>

In [207]:
def describe_network_set(set):
    display(HTML("<h2>" + set.get("name") + "</h2>"))
    display(HTML(set.get("description")))
    for network_id in set.get("networks"):
        summary = ndx.get_network_summary(network_id)
        display(HTML("<h4>" + summary.get("name") + "</h4>"))
        if summary.get("description"):
            display(HTML(summary.get("description")))


## Part 1: Similarity Networks from Network Sets

### Select a network set to process

In [206]:
#example_set_id = "d0096935-4b05-11ec-b3be-0ac135e8bacf"
example_set_id = "1e6a6016-fc81-11ea-99da-0ac135e8bacf" # The 51 Signor curated pathway networks

example_set = ndx.get_network_set(example_set_id)
describe_network_set(example_set)

{'name': 'Signaling, Disease and Cancer Pathways',
 'description': '<p>This set contains  human pathways manually-curated from the literature by the SIGNOR Team at the University of Rome</p><p> <img src="https://signor.uniroma2.it/img/signor_logo.png" alt="Figure 1" width="200"/></p>',
 'ownerId': '0db1f2dc-103f-11e8-b939-0ac135e8bacf',
 'networks': ['02a8eede-97f1-11eb-9e72-0ac135e8bacf',
  '035163e3-97f1-11eb-9e72-0ac135e8bacf',
  '03dcfa67-1e5f-11e8-b939-0ac135e8bacf',
  '0522eea1-97f1-11eb-9e72-0ac135e8bacf',
  '077e15fd-1e5f-11e8-b939-0ac135e8bacf',
  '0934c6b0-1e5f-11e8-b939-0ac135e8bacf',
  '0ae84313-1e5f-11e8-b939-0ac135e8bacf',
  '0b6cbc27-97f1-11eb-9e72-0ac135e8bacf',
  '0ced3ad6-1e5f-11e8-b939-0ac135e8bacf',
  '0f019be9-1e5f-11e8-b939-0ac135e8bacf',
  '10f360ba-97f1-11eb-9e72-0ac135e8bacf',
  '126da4b2-97f1-11eb-9e72-0ac135e8bacf',
  '12c33ec7-97f1-11eb-9e72-0ac135e8bacf',
  '1af025eb-afe8-11e9-8bb4-0ac135e8bacf',
  '1af7be3b-1e5f-11e8-b939-0ac135e8bacf',
  '1b3873d6-8f9a-11

## Similarity Scores
- Jacquard
- Cosine

In [212]:
# Jacquard similarity for two gene lists:
# The ratio of size of the intersection to the size of the union
def jaccard_similarity(x,y):
 
 intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
 union_cardinality = len(set.union(*[set(x), set(y)]))
 return intersection_cardinality/float(union_cardinality)

# Cosine similarity for two gene lists:
# loop through each list, find distinct genes and map them to a
# unique number starting at zero
def make_all_genes(gene_lists):
    all_genes = {}
    i = 0
    for gene_list in gene_lists:
        for gene in gene_list:
            if gene not in all_genes:
                all_genes[gene] = i
                i += 1
    return all_genes

def cosine_similarity(list_a, list_b, all_genes):
    a = list2vec(list_a, all_genes)
    # print(a)
    b = list2vec(list_b, all_genes)
    # print(b)
    # use numpy's dot product to calculate the cosine similarity
    return np.dot(a, b) / np.sqrt(np.dot(a, a) * np.dot(b, b))
    
def list2vec(list, all_genes):
    # create a vector of zeros the length of all genes
    vec = np.zeros(len(all_genes))
    # for each gene in the list, get the index and set that element of the vector to 1
    for gene in list:
        index = all_genes[gene] # fragile, need to test to be sure all_genes contains gene
        vec[index] = 1
    return vec

def make_jaccard_similarity_matrix(gene_lists):
    size = len(gene_lists)
    matrix = np.zeros((size,size))
    for i in range(size):
        # print(i)
        for j in range(size):
            sim = jaccard_similarity(gene_lists[i], gene_lists[j])
            matrix[i,j] = sim
    return matrix
            
def make_cosine_similarity_matrix(gene_lists):
    size = len(gene_lists)
    matrix = np.zeros((size,size))
    all_genes = make_all_genes(gene_lists)
    #for gene_list in gene_lists:
    #    vectors.append(list2vec(gene_list, all_genes))
    for i in range(size):
        for j in range(size):
            sim = cosine_similarity(gene_lists[i], gene_lists[j], all_genes)
            matrix[i,j] = sim
    return matrix
    

In [214]:
# Similarity scoring examples
a = ['TP53', 'EGFR', 'XRCC1', "FOXO3", "TLE1"]
b = ['TP53', 'EGFR', 'XRCC1', "AKT1", "GSK3B", 'E2F1']
l = [a,b]
mj = make_jaccard_similarity_matrix(l)
print('jaccard')
print(mj)
print("")
print('cosine')
mc = make_cosine_similarity_matrix(l)
print(mc)

jaccard
[[1.    0.375]
 [0.375 1.   ]]

cosine
[[1.         0.54772256]
 [0.54772256 1.        ]]


## Process a Network Set
- get the set from NDEx
- for each network, extract the gene list
- make a jaccard similarity matrix from the gene list
- make a cosine similarity matrix from the gene list
- choose a score and similarity threshold and score to filter edges for the output network
- make the output network from the similarity matrices, set "cosine" and "jaccard" attributes on edges
- set the name of each node to the name of the corresponding network
- set the "members" attribute of each node to the gene list
- set the style of the network to be the similarity network style
- perform a network layout
- set the "network" attribute of each node to the URL to display the corresponding network in a new tab
- set the name of the network to be the name of the set
- set the derived from attribute to be the UUID of the set
- also make a link in the description point to the set
- save to NDEx

In [217]:
def get_nodes_as_df(network_id, server="www.ndexbio.org"):
    url = "".join(["https://", server, "/v3/networks/", network_id, "/export?type=node"])
    r = requests.get(url)
    df = pd.read_csv(io.StringIO(r.text),sep='\t')
    #print(df)
    return df

def get_node_name_list(network_id, server="www.ndexbio.org"):
    # TODO deal with private networks
    df = get_nodes_as_df(network_id, server=server)
    return list(df["name"])

# TODO
# filter the gene list to include only known human gene names
def remove_non_genes(list_of_strings):
    return None
 
# TODO
# also find genes in specified attributes such as "genes" or "members"
def get_gene_list(network_id):
    names = get_node_name_list(network_id)
    return names
    
def get_gene_lists_from_network_ids(network_ids):
    gene_lists = []
    for network_id in network_ids:
        gene_lists.append(get_gene_list(network_id))
    return gene_lists
        
def get_network_summaries_from_network_ids(network_ids):
    network_summaries = []
    for network_id in network_ids:
        network_summaries.append(ndx.get_network_summary(network_id))
    return network_summaries

# In addition to commputing the similarity matrix, this function also returns
# lists of the network ids, network summaries, and gene lists, each in the same order
def make_similarity_matrix_from_set(network_set_id, similarity_metric="jaccard"):
    network_set = ndx.get_network_set(network_set_id)
    network_ids = network_set.get("networks")  # this list establishes the order of the network ids from here on
    gene_lists = get_gene_lists_from_network_ids(network_ids)
    network_summaries = get_network_summaries_from_network_ids(network_ids)
    if similarity_metric == "jaccard":
        return make_jaccard_similarity_matrix(gene_lists), network_ids, network_summaries, gene_lists
    if simlarity_metric == "cosine":
        return make_cosine_similarity_matrix(gene_lists), network_ids, network_summaries, gene_lists
    print("unknown similarity score" + similarity_metric)
    return None, network_ids, network_summaries, gene_lists

def make_networkx_similarity_network(similarity_matrix, threshold, network_ids, network_summaries, gene_lists):
    sim_network = nx.from_numpy_matrix(similarity_matrix > threshold)
    sim_network.remove_edges_from(nx.selfloop_edges(sim_network))
    # add the weights. from_numpy_matrix 
    for (i, j) in sim_network.edges():
        sim_network[i][j]["weight"] = similarity_matrix[i][j]
    attributes = {}
    for index in range(len(network_ids)):
        network_id = network_ids[index]
        network_summary = network_summaries[index]
        name = network_summary.get("name")
        description = network_summary.get("description").replace(',', " ").replace(';',' ')
        members = gene_lists[index]
        n_members = len(members)
        attributes[index]={"label":name,
                           "description":description, 
                           "members":members, 
                           "n_members": len(members),
                           "log2members": log(len(members),2)
                          }
    nx.set_node_attributes(sim_network, attributes)
    return sim_network

### Create the similarity network

In [221]:
jaccard, network_ids, network_summaries, gene_lists = make_similarity_matrix_from_set(example_set_id)
#print(jaccard)
#print(network_ids)
#print(network_summaries)
sim_network = make_networkx_similarity_network(jaccard, 0.2, network_ids, network_summaries, gene_lists)

### Apply a style and a layout

In [223]:
ncx=ndex2.create_nice_cx_from_networkx(sim_network)
# name the network based on the set name
ncx.set_name(example_set.get("name") + " similarity map")

TEMPLATE_UUID = 'f9bdb308-4be7-11ec-b3be-0ac135e8bacf'

# Copy the style from a template network in NDEx and apply it to our NiceCX network
ncx.apply_template(SERVER, TEMPLATE_UUID)

# Perform a layout on the networkx network and apply it to the NiceCX network
#pos = nx.drawing.layout.spiral_layout(sim_network, scale=700)
pos = nx.drawing.layout.kamada_kawai_layout(sim_network, scale=1400)

new_cart_layout = []
for node_id, coordinates in pos.items():
    new_cart_layout.append({
        'node': node_id,
        'x': coordinates[0],
        'y': -coordinates[1]   # See note below!!!
    })
ncx.set_opaque_aspect(ndex2.constants.CARTESIAN_LAYOUT_ASPECT, new_cart_layout)



### View the network

In [224]:
from cyjupyter import Cytoscape
cx_network = ncx.to_cx()
Cytoscape(data=cx_network, format='cx')

Generating CX


Cytoscape(data=[{'numberVerification': [{'longNumber': 281474976710655}]}, {'metaData': [{'name': 'cyVisualPro…

### Upload the network to NDEx 
and show the URL of the network's page as well as its UUID

In [225]:
network_url = ncx.upload_to(SERVER, NDEXUSER, NDEXPASSWORD)
network_uuid = network_url.split("/")[-1]

print ('=' *15)
print("Network's URL (click to view!): " + NETWORK_BASE_URL + network_uuid)
print("Network's UUID: " + network_uuid)

Generating CX
Network's URL (click to view!): http://ndexbio.org/viewer/networks/9ab14365-508a-11ec-b3be-0ac135e8bacf
Network's UUID: 9ab14365-508a-11ec-b3be-0ac135e8bacf
