In [None]:
import graphAlgorithms as ga

Here we compare two networks in detail to investigate whether specific nodes (genes) or gene areas change between them, i.e. are affected by the condition.

# Load Networks



In [None]:
# Folder where the raw edge list files are stored. The default is a relative path that
# works when running from the installation folder; CHANGE it to the location of your networks if needed.

graph_location = "../networks/edgelists/"

# Location where output should be saved. Please set this location.
# It is prepended as-is to the output file names, so include a trailing "/" for a folder.
location = ""

Below is an example of how to load an edge list with column headings into a NetworkX Graph object.

There are multiple examples on how to load different formats in the import and export networks notebook

In [None]:
import glob
import pandas as pd
import networkx as nx
import numpy as np

In [None]:

labels = []
networks_graphs = []
print("load networks")

# Get all files located in the specified folder that end in .edgelist
# (CHANGE the ending if your files end differently).
# glob returns paths in arbitrary, file-system dependent order, so sort them:
# otherwise which two files are loaded, and which of them becomes
# labels[0] / labels[1], could differ between runs or machines.
edgelist_paths = sorted(glob.glob(graph_location + "*.edgelist"))

# this notebook compares two networks, so only the first two files are loaded
for path in edgelist_paths[:2]:
    # you can specify that only part of the file name should be used as network name for later identification
    name = path.split("/")[-1].replace(".rds.edgelist", "")

    # read the edgelist file as a dataframe
    fh = pd.read_csv(path, sep="\t")
    # convert it into a NetworkX graph G and specify the column names of the node pairs
    G = nx.from_pandas_edgelist(fh, "V1", "V2")

    # if you have an unweighted network assign all edges the same edge weight - here a value of 1 is assigned
    nx.set_edge_attributes(G, 1, "weight")

    # save the graph objects to a list (only suitable if small networks are processed)
    # this is the main object used for the examples below, which contains all networks
    networks_graphs.append(G)
    labels.append(name)

    print("loaded", name)

# the cells below index labels[0] and labels[1], so point out a wrong path early
if len(networks_graphs) < 2:
    print("WARNING: expected 2 networks but loaded", len(networks_graphs), "from", graph_location)

Convert the networks into the format used by the graphAlgorithms functions.

In [None]:
# convert the loaded NetworkX graphs, using the "weight" edge attribute assigned when loading
networks = ga.get_node_similarity.preprocess_graph(networks_graphs, attribute="weight")

get union of all nodes 

In [None]:
# Union of the nodes of all loaded networks, kept in first-seen order.
# dict.fromkeys removes duplicates with a constant-time lookup per node; scanning a
# growing list with `node not in nodes` is quadratic in the number of nodes.
nodes = list(dict.fromkeys(node for net in networks_graphs for node in net.nodes()))

Map node names to IDs (this is mainly used for the node & edge similarity functions).

In [None]:
# returns the converted networks together with `mapping`; `mapping.values()` is used below as the list of mapped node IDs
network_lists, mapping = ga.get_node_similarity.preprocess_node_list(networks)

save mapping for later

In [None]:
import pickle

# write the node mapping next to the other outputs so it can be reloaded later
node_mapping_path = location + "node_id_mapping_network_network.pckl"
with open(node_mapping_path, "wb") as mapping_file:
    pickle.dump(mapping, mapping_file, protocol=4)

OPTIONAL: create reversed mapping object

In [None]:
# reversed lookup: indexed below with a mapped ID to get the original node ID back
reverse_mapping = ga.distances.node_edge_similarities.reverse_node_edge_mapping(mapping)

# Nodes

We compare, based on different centrality measures, how the location of the nodes in the network changes and which nodes are the most similar or the most different ones.
Here we make use of the centrality ranks (as also estimated in the Network clustering pipeline).

In [None]:
# one centrality-based node ordering per network, in the same order as networks_graphs
sorted_nodes = [
    ga.distances.node_edge_similarities.sort_node_list(
        graph,
        mapping,
        degree_centrality=True,
        closeness_centrality=True,
        betweenness=True,
        k=None,
        as_str=False,
    )
    for graph in networks_graphs
]

In [None]:
# inspect which ranking keys are available (first element of the result for the first network)
sorted_nodes[0][0].keys()

convert dict output to a dataframe to be more human readable

In [None]:
# all mapped node IDs; this list defines the row order of the dataframes built below
mapping_ids = list(mapping.values())

In [None]:
import pandas as pd 

# start the result table with one row per mapped node ID
id_column = ["Mapping ID"]
df = pd.DataFrame(mapping_ids, columns=id_column)

In [None]:
# add the reversed mapping IDS (original node IDs - here they are Entrez IDs)
entrez = []
for g in mapping_ids:
    entrez.append(reverse_mapping[g])
df["Entrez IDs"] = entrez

In [None]:
# Add one rank column per network and centrality measure.
# The results are in the same order as the network labels, so the label can be
# used directly in the column heading.
for label, ranked in zip(labels, sorted_nodes):
    item = ranked[0]
    for key, ordering in item.items():
        # ignore "degree" key, since it has not been calculated. We are using degree centrality instead.
        if key == "degree":
            continue

        # position of every node ID in this ordering, built once so each ID is a
        # constant-time lookup instead of a scan over the whole ordering
        rank_of = {node_id: rank for rank, node_id in enumerate(ordering)}

        # IDs that do not occur in this ordering get NaN so the column always has
        # one value per row; previously a missing ID left the column too short
        # and the assignment failed with a length mismatch
        df[label + " Ranking " + key] = [rank_of.get(g, np.nan) for g in mapping_ids]


In [None]:
# show the table with the rank columns added above
df

We are interested in knowing which genes change the most between the networks with regard to their network position. Therefore we estimate, for each gene, the absolute difference between its median ranks in the two networks.
This can be done for any of the other parameters as well if it is needed for your analysis in the same way.

In [None]:
# Absolute difference between the median ranks in the two networks, one value per row of df.
# df holds one row per entry of mapping_ids in that order, so subtracting the two columns
# gives the same list as looking every ID up individually, without filtering the whole
# frame once per ID.
# NOTE(review): this relies on the mapped IDs being unique - confirm for your mapping.
first_median_col = labels[0] + " Ranking average_median"
second_median_col = labels[1] + " Ranking average_median"

change = (df[first_median_col] - df[second_median_col]).abs().to_list()

In [None]:
# collect IDs, rank difference and the two median ranks in one table
first_median_col = labels[0] + " Ranking average_median"
second_median_col = labels[1] + " Ranking average_median"

change_rows = list(
    zip(
        mapping_ids,
        entrez,
        change,
        df[first_median_col].to_list(),
        df[second_median_col].to_list(),
    )
)
df_change = pd.DataFrame(
    change_rows,
    columns=["Mapping ID", "Entrez IDs", "Absolute Ranking Difference", first_median_col, second_median_col],
)

Sort the dataframe

In [None]:
# largest rank difference first
df_change = df_change.sort_values("Absolute Ranking Difference", ascending=False)

The top 20 genes whose network position changes the most

In [None]:
# first 20 rows of the table sorted by descending absolute ranking difference
df_change.head(20)

The 20 most similar genes with regard to their network position

In [None]:
# last 20 rows of the table sorted by descending absolute ranking difference
df_change.tail(20)

These genes could now for example be functionally enriched or a GSEA could be performed on them (combined or individually).

TODO? Should enrichment be done externally or should we include e.g. the API call to the panther enrichment tool? or simply show the code here?

# Edges

Which edges are common to the 2 networks, and which edges are unique to one network? And for which edges does the network position (betweenness) change the most?

Estimate edge betweenness scores and assign them to the graph objects

In [None]:
print("sort edges after edge betweenness")
# edge betweenness scores per network, in the same order as networks_graphs
bet = []
for net in networks_graphs:
    edges_betweenness = nx.edge_betweenness_centrality(net)
    bet.append(edges_betweenness)
    # write the scores as a new "betweenness" edge attribute to the graph;
    # set_edge_attributes changes the graph in place and returns None,
    # so its result is not assigned to anything
    nx.set_edge_attributes(net, edges_betweenness, "betweenness")

convert the networks & map edges to IDs

In [None]:
# NOTE: from here on `networks`, `network_lists` and `mapping` are rebound to the edge-based
# versions; they held the node-based versions in the Nodes section above.
# convert the graphs again, this time using the "betweenness" edge attribute
networks = ga.get_edge_similarity.preprocess_graph(networks_graphs, attribute="betweenness")

print("map edges to id")

network_lists, mapping = ga.get_edge_similarity.preprocess_edge_list(networks)

# save the edge mapping for later
with open(location + "edge_id_mapping_network_network.pckl", "wb") as f:
    pickle.dump(mapping, f, protocol=4)

In [None]:
# rebinds `reverse_mapping`: it is now built from the edge mapping instead of the node mapping
reverse_mapping = ga.distances.node_edge_similarities.reverse_node_edge_mapping(mapping)

Get shared edges. Returns a dict, where key is mapped edge ID and value is list of network names this edge is present in.

In [None]:
# `shared` is indexed below by mapped edge ID and checked for the network labels it contains
shared = ga.distances.node_edge_similarities.compute_shared_layers(network_lists, labels, is_file=False, in_async=False)

convert to a dataframe to be more human readable

In [None]:
# mapped edge IDs and the original edges they stand for, read from the same dict so both lists line up
edge_mapped_IDs = list(reverse_mapping)
edges = [reverse_mapping[edge_id] for edge_id in edge_mapped_IDs]

edge_rows = list(zip(edges, edge_mapped_IDs))
df = pd.DataFrame(edge_rows, columns=["Edges", "Mapping ID"])
    

In [None]:
# one 0/1 column per network: 1 if the edge is listed for that network in `shared`, else 0
for label in labels:
    df["In " + label] = [int(label in shared[edge_id]) for edge_id in edge_mapped_IDs]
    

In [None]:
# show the edge table with the per-network presence columns
df

Select all edges that are in both

In [None]:
# rows whose edge is flagged as present in both networks
in_first = df["In " + labels[0]] == 1
in_second = df["In " + labels[1]] == 1
shared_df = df.loc[in_first & in_second]

In [None]:
# show the edges present in both networks
shared_df

For example, you can now functionally enrich the genes making up the consistent edges, or see if these edges belong to specific modules in the network.

Select unique edges

In [None]:
# rows whose edge is flagged as present in exactly one of the two networks
first_flag = df["In " + labels[0]]
second_flag = df["In " + labels[1]]

only_in_first = (first_flag == 1) & (second_flag == 0)
only_in_second = (first_flag == 0) & (second_flag == 1)
unique_df = df.loc[only_in_first | only_in_second]

In [None]:
# show the edges present in only one of the networks
unique_df

TODO: enrichment / mapping to modules?

# Node areas/ connectivity

which nodes are connected in a similar way and which node areas are different?

## Random walks

For each common node in the 2 networks random walks are performed and their similarity in visited nodes is compared. This allows to identify the most similar/ dissimilar node areas.

For each node 10 * its degree random walks of size 5 are performed. A smaller walk size "scans" a smaller area around the starting node.

In [None]:
# run the walks for every node in `nodes` (the union of nodes of all networks) on each graph
# NOTE(review): no random seed is set anywhere in this notebook, so the walks and everything
# derived from them may differ between runs - confirm how helper_walks draws its random numbers
performed_walks = ga.get_walk_distances.helper_walks(networks_graphs, nodes, labels, steps=5, number_of_walks=10, degree=True, probabilistic=False, weight ="weight")

Now we estimate, for each starting node, how often the surrounding nodes/edges have been visited w.r.t. all the visited nodes/edges. Depending on your network sizes and selected nodes this can be quite memory intensive.

In [None]:
# nodes_frc and edges_frc are passed on to helper_walk_sim below as nodes_ranked / edges_ranked
node_counts, edge_counts, nodes_frc, edges_frc = ga.get_walk_distances.helper_get_counts(labels, networks_graphs, performed_walks)

Now we want to estimate network similarities based on the visited nodes. For each network pair, the Kendall rank correlation (of the top 20 nodes) is calculated for the same starting node. The mean correlation value over all same-node pairs of a network pair is estimated, and the individual values are returned as well.

In [None]:
# return_all=True: the *_all results are returned as well; results_nodes_all is indexed below by a (label, label) pair
# NOTE(review): undirected=False is passed although the graphs were created with
# nx.from_pandas_edgelist without create_using - confirm this is the intended setting
results_edges, results_nodes, results_edges_p, results_nodes_p, results_edges_all, results_nodes_all, results_edges_p_all, results_nodes_p_all = ga.get_walk_distances.helper_walk_sim(networks_graphs, performed_walks, nodes, labels, top=20, undirected=False, return_all = True, nodes_ranked=nodes_frc, edges_ranked=edges_frc)

In [None]:
# pair every start node with the correlation value reported for the two compared networks
# NOTE(review): assumes the per-node values are returned in the same order as `nodes` - confirm
network_pair = (labels[0], labels[1])
node_correlations = list(zip(nodes, results_nodes_all[network_pair]))

# build the table and order it by correlation, highest first
df = pd.DataFrame(node_correlations, columns=["Entrez ID", "Correlation"]).sort_values(
    by="Correlation", ascending=False
)

In [None]:
# the 20 start nodes with the highest correlation (table is sorted in descending order)
df.head(20)

In [None]:
# the last 20 rows of the table sorted by descending correlation
df.tail(20)

TODO: Nodes in these areas can again be functionally enriched and/or their modules can be investigated.

TODO: Probably remove everything from here on.

# Community/ Module detection

We will detect modules in the graphs and map the previously identified similar/ dissimilar areas to them.
Here we will only use a simple community detection method. For more algorithms, evaluation and ensemble methods please refer to the Community notebook.

For the example the walktrap algorithm is used, which is based on non probabilistic random walks.

In [None]:
# detect communities with walktrap in every network and convert each result into another format
communities = [
    ga.communities.convert_communities(ga.communities.walktrap(graph, return_object=False))
    for graph in networks_graphs
]

How similar are the detected communities/ modules w.r.t their nodes?

To answer this we transform each community into a subgraph and compare their nodes (only communities whose subgraph has more than 1 edge are kept).
For similar modules the edges can be compared as well.

In [None]:
subgraphs = []
com_labels = []
# walk through the networks together with their label and detected communities
for label, graph, com in zip(labels, networks_graphs, communities):
    for k, members in com.items():
        community_graph = graph.subgraph(members)
        # keep only communities whose subgraph has more than one edge
        if community_graph.number_of_edges() > 1:
            subgraphs.append(community_graph)
            # name each community after its network and community key
            com_labels.append(label + "_" + str(k))

Following are the same functions as applied in the Nodes section in the Network clustering notebook. For explanations refer to this section.


In [None]:
# convert the community subgraphs, using the "weight" edge attribute
sub_networks = ga.get_node_similarity.preprocess_graph(subgraphs, attribute="weight")

# map the nodes of the subgraphs to IDs
sub_network_lists, sub_mapping = ga.get_node_similarity.preprocess_node_list(sub_networks)

# save the subgraph node mapping for later
with open(location + "node_id_mapping_subgraphs.pckl", "wb") as f:
    pickle.dump(sub_mapping, f, protocol=4)
    
# NOTE: this rebinds `sorted_nodes`, which held the per-network result in the Nodes section above
sorted_nodes, shared_nodes, binary, centrality_values = ga.get_node_similarity.sort_list_and_get_shared(sub_network_lists, sub_mapping, subgraphs, com_labels, degree_centrality=True, closeness_centrality=True, betweenness=True, degree=False, in_async=False)

In [None]:
# both results are plotted as heatmaps below, with com_labels as tick labels
# NOTE(review): this is accessed as ga.node_edge_similarities, while the other cells use
# ga.distances.node_edge_similarities - confirm that both paths exist in the package
jd, per = ga.node_edge_similarities.shared_elements_multiple(sub_network_lists, labels=com_labels, percentage=True, jaccard=True, jaccard_similarity=False, in_async=False, is_file=False)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

Do the communities in the different networks contain similar nodes?

In [None]:
fig, ax = plt.subplots(figsize=(10,8))

# heatmap of the `jd` result, one row/column per community
sns.heatmap(jd, annot=False, ax=ax, xticklabels=com_labels, yticklabels=com_labels)

# title and axis labels so the figure can be told apart from the percentage heatmap
# NOTE(review): wording taken from the jaccard=True argument - adjust if `jd` is defined differently
ax.set_title("Jaccard comparison of community node sets")
ax.set_xlabel("Community")
ax.set_ylabel("Community");

In [None]:
fig, ax = plt.subplots(figsize=(10,8))

# heatmap of the `per` result, one row/column per community
sns.heatmap(per, annot=False, ax=ax, xticklabels=com_labels, yticklabels=com_labels)

# title and axis labels so the figure can be told apart from the Jaccard heatmap
# NOTE(review): wording taken from the percentage=True argument - adjust if `per` is defined differently
ax.set_title("Percentage of shared nodes between communities")
ax.set_xlabel("Community")
ax.set_ylabel("Community");