Create a similarity map from a set of network or the communities in a hierarchical model

In [56]:
from math import*
import numpy as np
import pandas as pd
import ndex2
import getpass
import requests
import io
import igraph
import networkx as nx

from IPython.display import display, HTML

In [2]:
# Watch out for typos and be sure to hit enter in the field to set the value!!!
NDEXUSER = getpass.getpass()

········


In [3]:
# Watch out for typos and be sure to hit enter in the field to set the value!!!
NDEXPASSWORD = getpass.getpass()

········


In [4]:
print(NDEXUSER)
print(NDEXPASSWORD)
ndx=ndex2.Ndex2("http://www.ndexbio.org", NDEXUSER, NDEXPASSWORD)
ndx

dexterpratt
cytoscaperules


<ndex2.client.Ndex2 at 0x1fb9fc926a0>

In [45]:
example_set_id = "d0096935-4b05-11ec-b3be-0ac135e8bacf"
def describe_network_set(set_id):
    set = ndx.get_network_set(set_id)
    display(HTML("<h2>" + set.get("name") + "</h2>"))
    display(HTML(set.get("description")))
    for network_id in set.get("networks"):
        summary = ndx.get_network_summary(network_id)
        display(HTML("<h4>" + summary.get("name") + "</h4>"))
        if summary.get("description"):
            display(HTML(summary.get("description")))
    return set

#example_set = describe_network_set(example_set_id)
example_set = ndx.get_network_set(example_set_id)
example_set

{'name': 'similarity_example_set',
 'description': 'NCI-PID networks for testing similarity network generation',
 'ownerId': 'e3b72ac7-0a33-11e5-ac0f-000c29cb28fb',
 'networks': ['48ad4218-78c3-11e8-a4bf-0ac135e8bacf',
  '4a34f38b-78c3-11e8-a4bf-0ac135e8bacf',
  '4beeb17e-78c3-11e8-a4bf-0ac135e8bacf',
  '4d8ea5e1-78c3-11e8-a4bf-0ac135e8bacf'],
 'showcased': False,
 'properties': {'reference': ''},
 'externalId': 'd0096935-4b05-11ec-b3be-0ac135e8bacf',
 'isDeleted': False,
 'modificationTime': 1637524887884,
 'creationTime': 1637524887884}

## Similarity Metrics
- Jacquard
- Cosine

In [46]:
# Jacquard similarity for two sets:

# The ratio of size of the intersection to the size of the union
 
def jaccard_similarity(x,y):
 
 intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
 union_cardinality = len(set.union(*[set(x), set(y)]))
 return intersection_cardinality/float(union_cardinality)

# Cosine similarity for two sets:
# loop through each list, find distinct genes and map them to a
# unique number starting at zero
def make_all_genes(gene_lists):
    all_genes = {}
    i = 0
    for gene_list in gene_lists:
        for gene in gene_list:
            if gene not in all_genes:
                all_genes[gene] = i
                i += 1
    return all_genes

def cosine_similarity(list_a, list_b, all_genes):
    a = list2vec(list_a, all_genes)
    print(a)
    b = list2vec(list_b, all_genes)
    print(b)
    # use numpy's dot product to calculate the cosine similarity
    return np.dot(a, b) / np.sqrt(np.dot(a, a) * np.dot(b, b))
    
def list2vec(list, all_genes):
    # create a vector of zeros the length of all genes
    vec = np.zeros(len(all_genes))
    # for each gene in the list, get the index and set that element of the vector to 1
    for gene in list:
        index = all_genes[gene] # fragile, need to test to be sure all_genes contains gene
        vec[index] = 1
    return vec

def make_jaccard_similarity_matrix(gene_lists):
    size = len(gene_lists)
    matrix = np.zeros((size,size))
    for i in range(size):
        for j in range(size):
            sim = jaccard_similarity(gene_lists[i], gene_lists[j])
            matrix[i,j] = sim
    return matrix
            
def make_cosine_similarity_matrix(gene_lists):
    size = len(gene_lists)
    matrix = np.zeros((size,size))
    all_genes = make_all_genes(gene_lists)
    #for gene_list in gene_lists:
    #    vectors.append(list2vec(gene_list, all_genes))
    for i in range(size):
        for j in range(size):
            sim = cosine_similarity(gene_lists[i], gene_lists[j], all_genes)
            matrix[i,j] = sim
    return matrix
    

In [47]:
a = ['TP53', 'EGFR', 'XRCC1', "FOXO3", "TLE1"]
b = ['TP53', 'EGFR', 'XRCC1', "AKT1", "GSK3B", 'E2F1']
l = [a,b]
m = make_jaccard_similarity_matrix(l)
m

array([[1.   , 0.375],
       [0.375, 1.   ]])

In [48]:
a = ['TP53', 'EGFR', 'XRCC1', "FOXO3", "TLE1"]
b = ['TP53', 'EGFR', 'XRCC1', "AKT1", "GSK3B", 'E2F1']
l = [a,b]
m = make_cosine_similarity_matrix(l)
m

[1. 1. 1. 1. 1. 0. 0. 0.]
[1. 1. 1. 1. 1. 0. 0. 0.]
[1. 1. 1. 1. 1. 0. 0. 0.]
[1. 1. 1. 0. 0. 1. 1. 1.]
[1. 1. 1. 0. 0. 1. 1. 1.]
[1. 1. 1. 1. 1. 0. 0. 0.]
[1. 1. 1. 0. 0. 1. 1. 1.]
[1. 1. 1. 0. 0. 1. 1. 1.]


array([[1.        , 0.54772256],
       [0.54772256, 1.        ]])

## Process a Network Set
- get the set from NDEx
- for each network, extract the gene list
- make a jaccard similarity matrix from the gene list
- make a cosine similarity matrix from the gene list
- choose a similarity threshold and type for adding edges to the output network
- make the output network from the similarity matrices, set "cosine" and "jaccard" attributes on edges
- layout? Only use threshold edges?
- set the name of each nodes to be the name of the corresponding network
- set the "members" attribute of each node to the gene list
- set the style of the network to be the similarity network style
 - hide edges below threshold
- set the "network" attribute of each node to the URL to display the corresponding network in a new tab
- set the name of the network to be the name of the set
- set the derived from attribute to be the UUID of the set
- also make a link in the description point to the set
- save to NDEx

In [49]:
def get_nodes_as_df(network_id, server="www.ndexbio.org"):
    url = "".join(["https://", server, "/v3/networks/", network_id, "/export?type=node"])
    r = requests.get(url)
    df = pd.read_csv(io.StringIO(r.text),sep='\t')
    #print(df)
    return df

def get_node_name_list(network_id, server="www.ndexbio.org"):
    # TODO deal with private networks
    df = get_nodes_as_df(network_id, server=server)
    return list(df["name"])

def remove_non_genes(list_of_strings):
    return None
    
def get_gene_list(network_id):
    names = get_node_name_list(network_id)
    return names
    
def get_gene_lists_from_set(set_id):
    set = ndx.get_network_set(set_id)
    gene_lists = []
    for network_id in set.get("networks"):
        gene_lists.append(get_gene_list(network_id))
    return gene_lists
        
def make_similarity_matrix_from_set(set_id, similarity_metric="jaccard"):
    gene_lists = get_gene_lists_from_set(set_id)
    if similarity_metric == "jaccard":
        return make_jaccard_similarity_matrix(gene_lists)
    if simlarity_metric == "cosine":
        return make_cosine_similarity_matrix(gene_lists)
    print("unknown similarity metric" + similarity_metric)
    return None

In [50]:
name_lists = []
for network_id in example_set.get("networks"):
    #print(network_id)
    name_lists.append(get_gene_list(network_id))
name_lists

[['1313',
  'ARF1',
  'ARFGAP1',
  'ASAP1',
  'CD4',
  'CLTA',
  'CLTB',
  'COPA',
  'EREG',
  'GBF1',
  'GGA3',
  'GIT1',
  'GOSR2',
  'HIV Nef',
  'KDELR1',
  'USO1',
  'YKT6',
  'ACTR2',
  'ACTR3',
  'ARHGAP10',
  'AP2A1',
  'AP2M1',
  'ARFGEF1',
  'ARFIP2',
  'CYTH1',
  'CYTH2',
  'PIP5K1A',
  'PLD2',
  'RAC1',
  'GDP',
  'GTP',
  'ATP',
  'phosphatidic acid',
  'DAG',
  'PAPOLA',
  '1-phosphatidyl-1D-myo-inositol 3,4,5-trisphosphate',
  'brefeldin A',
  'PI-4-5-P2',
  'PI-4-P',
  'choline',
  'Phosphatic acid',
  'Phosphatidylcholine'],
 ['ARF1',
  'ARF6',
  'GDP',
  'KALRN',
  'MAPK1',
  'MAPK3',
  'NME1',
  'PIP5K1A',
  'PLD1',
  'PLD2',
  'RAB11A',
  'RAB11FIP3',
  'RAC1',
  'RHOA',
  'TIAM1',
  'phosphatidic acid',
  'GM1',
  'GTP',
  'PLAUR',
  'choline',
  'Phosphatidylcholine'],
 ['ACAP1',
  'ACAP2',
  'ARF6',
  'GDP',
  'GULP1',
  'phosphate',
  'ADAP1',
  'KIF13B',
  'ADRB2',
  'ARRB1',
  'ARRB2',
  'CYTH2',
  'beta Arrestin1-2',
  'AGTR1',
  'CYTH3',
  'EGFR',
  'EGF',
 

In [51]:
jaccard = make_similarity_matrix_from_set(example_set_id)

In [52]:
name_lists = []
for network_id in example_set.get("networks"):
    #print(network_id)
    name_lists.append(get_node_name_list(network_id))
name_lists

[['1313',
  'ARF1',
  'ARFGAP1',
  'ASAP1',
  'CD4',
  'CLTA',
  'CLTB',
  'COPA',
  'EREG',
  'GBF1',
  'GGA3',
  'GIT1',
  'GOSR2',
  'HIV Nef',
  'KDELR1',
  'USO1',
  'YKT6',
  'ACTR2',
  'ACTR3',
  'ARHGAP10',
  'AP2A1',
  'AP2M1',
  'ARFGEF1',
  'ARFIP2',
  'CYTH1',
  'CYTH2',
  'PIP5K1A',
  'PLD2',
  'RAC1',
  'GDP',
  'GTP',
  'ATP',
  'phosphatidic acid',
  'DAG',
  'PAPOLA',
  '1-phosphatidyl-1D-myo-inositol 3,4,5-trisphosphate',
  'brefeldin A',
  'PI-4-5-P2',
  'PI-4-P',
  'choline',
  'Phosphatic acid',
  'Phosphatidylcholine'],
 ['ARF1',
  'ARF6',
  'GDP',
  'KALRN',
  'MAPK1',
  'MAPK3',
  'NME1',
  'PIP5K1A',
  'PLD1',
  'PLD2',
  'RAB11A',
  'RAB11FIP3',
  'RAC1',
  'RHOA',
  'TIAM1',
  'phosphatidic acid',
  'GM1',
  'GTP',
  'PLAUR',
  'choline',
  'Phosphatidylcholine'],
 ['ACAP1',
  'ACAP2',
  'ARF6',
  'GDP',
  'GULP1',
  'phosphate',
  'ADAP1',
  'KIF13B',
  'ADRB2',
  'ARRB1',
  'ARRB2',
  'CYTH2',
  'beta Arrestin1-2',
  'AGTR1',
  'CYTH3',
  'EGFR',
  'EGF',
 

In [53]:
jaccard

array([[1.        , 0.16666667, 0.07894737, 0.09090909],
       [0.16666667, 1.        , 0.07017544, 0.13636364],
       [0.07894737, 0.07017544, 1.        , 0.08045977],
       [0.09090909, 0.13636364, 0.08045977, 1.        ]])

In [55]:
# Create graph, A.astype(bool).tolist() or (A / A).tolist() can also be used.
g = igraph.Graph.Adjacency((jaccard > 0.08).tolist())

# Add edge weights and node labels.
g.es['weight'] = jaccard[jaccard.nonzero()]
#g.vs['label'] = node_names  # or a.index/a.columns
g.to_cx()

AttributeError: 'Graph' object has no attribute 'to_cx'

In [62]:
n = nx.from_numpy_matrix(jaccard > 0.08)
ncx=ndex2.create_nice_cx_from_networkx(n)
ncx

<ndex2.nice_cx_network.NiceCXNetwork at 0x1fb9fe604c0>

## Process a Hierarchy
- get the hierarchy
- for each community, get the member list
- make the matrices and the network as above
- set the name of each node to be the name of the community
- set the "members" attribute of each node to be the members of the community
- ...maybe just copy ALL attributes of the hierarchy nodes
- set the style of the network to be the similarity network style
- set the derived from attribute to the UUID of the hierarchy
- make a link in the description open the hierarchy in a new tab
- ***do we prune this to avoid re-creating the parent-child links? Or is a threshold sufficient?*
- save to NDEx

In [63]:
from cyjupyter import Cytoscape
nice_cx_network_viz = ncx.to_cx()
Cytoscape(data=nice_cx_network_viz, format='cx')

Generating CX


Cytoscape(data=[{'numberVerification': [{'longNumber': 281474976710655}]}, {'metaData': [{'name': 'nodes', 'el…