In [None]:
import itertools

import networkx as nx
import pandas as pd

# Merging Multiple Networks
A more directed approach using a list of genes of interest could yield results that are easier to interpret.

# 1. Combining all networks and keeping edges between selected nodes

In [1]:
# Load every tab-separated edge table as its own graph; the third column is
# parsed as a float and stored in the "mr" edge attribute.
file_list = ["network1.tsv", "network2.tsv", "network3.tsv"]
Glist = [nx.read_edgelist(fl, delimiter="\t", data=[("mr", float,)]) for fl in file_list]

# Merge all graphs into one; nodes and edges shared by several networks appear once.
Gcomb = nx.compose_all(Glist)
print("Combined network nodes and edges before filtering:", len(Gcomb.nodes), len(Gcomb.edges))

# Keep only the genes listed in the "GeneIDs" column and the edges between them.
keep_set = set(pd.read_csv("genes_to_keep.txt")["GeneIDs"].tolist())
Gcomb = Gcomb.subgraph(keep_set)
# This report is produced after the subgraph step, so it is labelled accordingly.
print("Combined network nodes and edges after filtering:", len(Gcomb.nodes), len(Gcomb.edges))

# Write node pairs only (data=False leaves the "mr" attribute out of the file).
nx.write_edgelist(Gcomb, "Gcomb_selected_nodes.edgelist", comments='#', delimiter='\t', data=False, encoding='utf-8')

NameError: name 'nx' is not defined

# Merging Multiple Networks
In this case I have a couple of networks that are composed of a large number of relatively small connected components.\
The goal of the scripts below is to load these separate networks and combine them together into a single network.\
If we don't care about connected components and just want to combine non-redundant edges then we can use python.\
Otherwise, networkx has some useful functions to handle these coexpression networks.

# 2. Combining Filtered Connected Components - NetworkX implementation

In [None]:
# Load the edge tables. The code was written to iterate over any combination of files.
# If you have a third weights column, include the data.
file_list = ["network1.tsv", "network2.tsv", "network3.tsv"]
Glist = [nx.read_edgelist(fl, delimiter="\t", data=[("weight", float,)]) for fl in file_list]
print("Number of nodes in each network:", [len(G.nodes) for G in Glist])

# Generate a list of lists that contain all the connected components of each network
cclist = [[G.subgraph(c).copy() for c in nx.connected_components(G)] for G in Glist]
print("Number of connected components before filtering:", [len(cc) for cc in cclist])

# This step is optional - A list of gene IDs can be used to keep CCs that contain these
# In my case it was a simple text file with a GeneIDs column that I converted to a set
# With the list of connected components we can filter the ones that don't have the genes of interest
keep_set = set(pd.read_csv("keep_genes.txt")["GeneIDs"].tolist())
# A list of lists that contain the CCs if there's at least one overlap between both gene sets
# https://www.geeksforgeeks.org/python-check-two-lists-least-one-element-common/
cclist_filtered = [[cc for cc in cc_original if set(cc.nodes) & keep_set] for cc_original in cclist]
print("Number of connected components after filtering:", [len(cc) for cc in cclist_filtered])

# Still part of the optional step - Reconstruct the original networks with the filtered CCs.
# union_all raises on an empty list, so a network in which no CC contains a gene of
# interest is represented by an empty graph instead of crashing the cell.
Glist_filtered = [nx.union_all(ccs) if ccs else nx.Graph() for ccs in cclist_filtered]

# Compose (or combine) all the networks together, while internally avoiding duplicates.
# For an edge present in several networks, attributes from graphs later in the list
# overwrite those from earlier ones.
Gcomb = nx.compose_all(Glist_filtered)
print("Number of nodes in the combined network:", len(Gcomb.nodes))

# Write the combined network to a tab-separated edge list (node pairs only, no weights)
nx.write_edgelist(Gcomb, "combined.edgelist", comments='#', delimiter='\t', data=False, encoding='utf-8')

In [None]:
# Quick visual check of the combined network: unlabeled, small nodes with
# black outlines and blue edges.
nx.draw_networkx(
    Gcomb,
    with_labels=False,
    node_size=25,
    edgecolors='black',
    edge_color='b',
)

# 2. Combining Filtered Connected Components - Pandas implementation

In [None]:
# Read each edge table (no header row in the files) and give the three columns
# the names n1, n2 and mr. One loop replaces the per-file copy-paste.
edge_columns = ["n1", "n2", "mr"]
frames = []
for path in ["network1.tsv", "network2.tsv", "network3.tsv"]:
    frame = pd.read_csv(path, sep="\t", header=None)
    frame.columns = edge_columns
    frames.append(frame)
# Keep the individual tables available under their original names.
df1, df2, df3 = frames

# Add all the rows of all the tables together
comb = pd.concat(frames).reset_index(drop=True)
# Use frozenset to find duplicate edges regardless of order. frozensets are hashable
# https://stackoverflow.com/questions/51182228/python-delete-duplicates-in-a-dataframe-based-on-two-columns-combinations
# duplicated() marks every occurrence after the first, so the first row seen for an edge is kept.
comb = comb[~comb[['n1', 'n2']].apply(frozenset, axis=1).duplicated()].reset_index(drop=True)

# index=False: the row index is just a running counter; writing it would add an
# unnamed extra first column in front of the n1/n2/mr edge columns.
comb.to_csv("combined.tsv", sep="\t", index=False)

In [None]:
# To make a network out of the comb edges dataframe we need a list of strings,
# one per edge, with the two node names separated by a space
# (nx.parse_adjlist reads each string as a node followed by its neighbours).
# The columns are addressed by label: positional lookups such as row[0] on a
# label-indexed row are deprecated in recent pandas versions.
comb_ls = [n1 + " " + n2 for n1, n2 in zip(comb["n1"], comb["n2"])]
G = nx.parse_adjlist(comb_ls)

# 3. Other not-so-helpful NetworkX code (that works)

In [None]:
# Simple approach when a list of genes of interest is available: from every
# network take only the edges that touch at least one of those genes, rebuild a
# graph from these edges, and then merge the per-network graphs with compose_all.

file_list = [
    "kremling_all_FPKM_10_cytoscape.tsv",
    "doll_FPKM_10_cytoscape.tsv",
    "282F_FPKM_10_cytoscape.tsv",
    "stelpflug_FPKM_10_cytoscape.tsv",
]

gene_table = pd.read_csv("filter_genes_sm.txt")
keep_set = set(gene_table["GeneIDs"].tolist())

# Load all networks first, then reduce each one to the edges incident to the kept genes.
Glist = [
    nx.read_edgelist(path, delimiter="\t", data=[("mr", float)])
    for path in file_list
]
Glist = [nx.Graph(graph.edges(keep_set)) for graph in Glist]
print("Number of nodes in each network:", [graph.number_of_nodes() for graph in Glist])

Gcomb = nx.compose_all(Glist)
print("Number of nodes in the combined network:", Gcomb.number_of_nodes())

# Save the merged network as a tab-separated list of node pairs.
nx.write_edgelist(
    Gcomb,
    "test2.edgelist",
    comments='#',
    delimiter='\t',
    data=False,
    encoding='utf-8',
)

In [None]:
# If we have multiple networks with many connected components (cc) that we want to combine,
# one way is to find all cc pairs (one cc from each of two different networks) that share
# at least MIN_SHARED_NODES nodes and keep those. If a cc from one network doesn't overlap
# enough with any cc from another network, it is ignored. This is repeated for all network
# pairs and the pair-merged networks are then merged into a single large network.
#
# Requires `cclist_filtered` (one list of cc graphs per network) from the
# connected-components cell, and `itertools` from the import cell.

MIN_SHARED_NODES = 2  # minimum number of nodes two ccs must share to be merged

Glist = []
# Loop over all pairs of *different* networks. Pairing a network with itself
# (combinations_with_replacement) would match every cc against itself, so every
# cc would be kept and the overlap filter would have no effect.
for ccs_a, ccs_b in itertools.combinations(cclist_filtered, 2):
    Gtmplist = [
        nx.compose_all([cca, ccb])
        for cca in ccs_a
        for ccb in ccs_b
        # Count shared nodes directly instead of copying and pruning a graph.
        if sum(1 for node in cca if node in ccb) >= MIN_SHARED_NODES
    ]
    # compose_all raises on an empty list, so skip network pairs without overlapping ccs.
    if Gtmplist:
        Glist.append(nx.compose_all(Gtmplist))

# With no overlapping pair at all (or fewer than two networks) write an empty network.
Gmerged = nx.compose_all(Glist) if Glist else nx.Graph()
nx.write_edgelist(Gmerged, "test3.edgelist", comments='#', delimiter='\t', data=False, encoding='utf-8')