Create a similarity map from a set of network or the communities in a hierarchical model

In [4]:
from math import*
import numpy as np


## Similarity Metrics
Jacquard
Cosine

In [18]:
# Jacquard similarity for two sets:

# The ratio of size of the intersection to the size of the union
 
def jaccard_similarity(x,y):
 
 intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
 union_cardinality = len(set.union(*[set(x), set(y)]))
 return intersection_cardinality/float(union_cardinality)

# Cosine similarity for two sets:
# loop through each list, find distinct genes and map them to a
# unique number starting at zero
def make_all_genes(gene_lists):
    all_genes = {}
    i = 0
    for gene_list in gene_lists:
        for gene in gene_list:
            if gene not in all_genes:
                all_genes[gene] = i
                i += 1
    return all_genes

def cosine_similarity(list_a, list_b, all_genes):
    a = list2vec(list_a, all_genes)
    print(a)
    b = list2vec(list_b, all_genes)
    print(b)
    # use numpy's dot product to calculate the cosine similarity
    return np.dot(a, b) / np.sqrt(np.dot(a, a) * np.dot(b, b))
    
def list2vec(list, all_genes):
    # create a vector of zeros the length of all genes
    vec = np.zeros(len(all_genes))
    # for each gene in the list, get the index and set that element of the vector to 1
    for gene in list:
        index = all_genes[gene] # fragile, need to test to be sure all_genes contains gene
        vec[index] = 1
    return vec

def make_jaccard_similarity_matrix(gene_lists):
    size = len(gene_lists)
    matrix = np.zeros((size,size))
    for i in range(size):
        for j in range(size):
            sim = jaccard_similarity(gene_lists[i], gene_lists[j])
            matrix[i,j] = sim
    return matrix
            
def make_cosine_similarity_matrix(gene_lists):
    size = len(gene_lists)
    matrix = np.zeros((size,size))
    all_genes = make_all_genes(gene_lists)
    #for gene_list in gene_lists:
    #    vectors.append(list2vec(gene_list, all_genes))
    for i in range(size):
        for j in range(size):
            sim = cosine_similarity(gene_lists[i], gene_lists[j], all_genes)
            matrix[i,j] = sim
    return matrix
    

In [19]:
a = ['TP53', 'EGFR', 'XRCC1', "FOXO3", "TLE1"]
b = ['TP53', 'EGFR', 'XRCC1', "AKT1", "GSK3B", 'E2F1']
l = [a,b]
m = make_jaccard_similarity_matrix(l)
m

array([[1.   , 0.375],
       [0.375, 1.   ]])

In [20]:
a = ['TP53', 'EGFR', 'XRCC1', "FOXO3", "TLE1"]
b = ['TP53', 'EGFR', 'XRCC1', "AKT1", "GSK3B", 'E2F1']
l = [a,b]
m = make_cosine_similarity_matrix(l)
m

[1. 1. 1. 1. 1. 0. 0. 0.]
[1. 1. 1. 1. 1. 0. 0. 0.]
[1. 1. 1. 1. 1. 0. 0. 0.]
[1. 1. 1. 0. 0. 1. 1. 1.]
[1. 1. 1. 0. 0. 1. 1. 1.]
[1. 1. 1. 1. 1. 0. 0. 0.]
[1. 1. 1. 0. 0. 1. 1. 1.]
[1. 1. 1. 0. 0. 1. 1. 1.]


array([[1.        , 0.54772256],
       [0.54772256, 1.        ]])

## Process a Network Set
- get the set from NDEx
- for each network, extract the gene list
- make a jaccard similarity matrix from the gene list
- make a cosine similarity matrix from the gene list
- choose a similarity threshold and type for adding edges to the output network
- make the output network from the similarity matrices, set "cosine" and "jaccard" attributes on edges
- layout? Only use threshold edges?
- set the name of each nodes to be the name of the corresponding network
- set the "members" attribute of each node to the gene list
- set the style of the network to be the similarity network style
 - hide edges below threshold
- set the "network" attribute of each node to the URL to display the corresponding network in a new tab
- set the name of the network to be the name of the set
- set the derived from attribute to be the UUID of the set
- also make a link in the description point to the set
- save to NDEx

## Process a Hierarchy
- get the hierarchy
- for each community, get the member list
- make the matrices and the network as above
- set the name of each node to be the name of the community
- set the "members" attribute of each node to be the members of the community
- ...maybe just copy ALL attributes of the hierarchy nodes
- set the style of the network to be the similarity network style
- set the derived from attribute to the UUID of the hierarchy
- make a link in the description open the hierarchy in a new tab
- ***do we prune this to avoid re-creating the parent-child links? Or is a threshold sufficient?*
- save to NDEx