In [1]:
import itertools as it
import pandas as pd
import csv
import math
import numpy as np

In [2]:
# Function to find the shortest path between two nodes of a graph 
def BFS_SP(graph, start, goal):
    """Find a shortest path from ``start`` to ``goal`` with breadth-first search.

    Parameters
    ----------
    graph : dict
        Adjacency mapping ``node -> iterable of neighbour nodes``. A node
        without an entry (an unknown ``start``, or a node that only ever
        appears as somebody's neighbour) is treated as having no outgoing
        edges instead of raising ``KeyError``.
    start, goal : hashable
        End points of the search.

    Returns
    -------
    list or None
        The nodes from ``start`` to ``goal`` inclusive. ``None`` when
        ``start == goal`` (prints "Same Node") or when ``goal`` cannot be
        reached (prints "No path.").
    """
    # Function-scope import so the cell works without touching the import cell.
    from collections import deque

    # A node is trivially "connected" to itself; nothing to search.
    if start == goal:
        print("Same Node")
        return

    # Mark nodes as visited when they are *enqueued*, not when they are
    # dequeued: every node then enters the queue at most once, which keeps
    # the queue linear in the number of nodes on dense graphs.
    visited = {start}
    queue = deque([[start]])  # each entry is the full path to its last node

    while queue:
        path = queue.popleft()  # O(1), unlike list.pop(0)

        for neighbour in graph.get(path[-1], ()):
            if neighbour in visited:
                continue

            new_path = path + [neighbour]

            # BFS reaches nodes in order of distance, so the first time the
            # goal shows up the path to it is a shortest one.
            if neighbour == goal:
                print(f"Shortest path for ({start}) --> ({goal}) = ", [*new_path])
                return new_path

            visited.add(neighbour)
            queue.append(new_path)

    # The whole reachable component was explored without meeting the goal.
    print("No path.")
    return

In [3]:
# Load the edge table and build a directed adjacency dictionary:
# every node ID maps to the list of target IDs it points to.
edges_table = pd.read_csv("edges_table.csv", encoding="latin-1")

# Column-wise copies of the table as plain Python lists (parallel by row).
sources = edges_table["source_id"].tolist()
targets = edges_table["target_id"].tolist()
sources_label = edges_table["source"].tolist()
targets_label = edges_table["target"].tolist()
color_col = edges_table["color_col"].tolist()
weight = edges_table["weight"].tolist()
thickness = edges_table["thickness"].tolist()

# Every distinct ID that occurs on either end of an edge.
all_species = list(set(sources + targets))

# Start every node with an empty neighbour list so that nodes which are only
# ever targets still have an entry, then add one neighbour per edge row.
graph_dict = {species: [] for species in all_species}
for edge_source, edge_target in zip(sources, targets):
    graph_dict[edge_source].append(edge_target)

In [4]:
# Preview the loaded edge table (the notebook truncates the display to the
# first and last rows).
edges_table

Unnamed: 0,source,source_id,target,target_id,weight,color_col,thickness
0,apoptosis::go:GO:0006915,GO:0006915,ROS::chebi:CHEBI:26523,CHEBI:26523,1,0.906144,86
1,inflammation::go:GO:0006954,GO:0006954,LPS::uniprot:O14896,uniprot:O14896,1,0.854245,82
2,insulin secretion::go:GO:0030073,GO:0030073,glucose::chebi:CHEBI:17234,CHEBI:17234,1,0.839019,53
3,FBS::uniprot:Q9HAH7,uniprot:Q9HAH7,heat::pfam:PF02985,pfam:PF02985,1,0.075858,38
4,cell death::go:GO:0008219,GO:0008219,ROS::chebi:CHEBI:26523,CHEBI:26523,1,0.904096,39
...,...,...,...,...,...,...,...
96714,E3,uaz:UAZCE94.p@Y311,Eq::uaz:UAZ4571,uaz:UAZ4571,1,0.924142,1
96715,E3,uaz:UAZCE94.p@Y311,Fpr2::uniprot:P25090,uniprot:P25090,1,0.924142,1
96716,E3,uaz:UAZCE94.p@Y311,Gls1::uniprot:O94925,uniprot:O94925,1,0.924142,1
96717,<U+0394>::uaz:UAZCE94.m,uaz:UAZCE94.m,SMTs::uaz:UAZ534D5473,uaz:UAZ534D5473,1,0.500000,1


In [5]:
# Short form of every node identifier, parallel to `all_species`:
# identifiers longer than five characters are reduced to their last two
# colon-separated fields, shorter ones are kept unchanged.
all_species_id = [
    species.split(":")[-2] + ":" + species.split(":")[-1]
    if len(species) > 5
    else species
    for species in all_species
]

In [6]:
# Read the query node IDs (one per line) and enumerate every unordered pair
# of them: nCr with n = number of queries and r = 2.
# The text file must contain IDs only, so IDs have to be looked up in the
# table whenever the full labels are needed.

with open("query_list.txt", 'r', encoding="UTF-8") as query_list:
    # Skip blank lines so that an empty line never becomes an empty-string query.
    queries_id = [line.strip() for line in query_list if line.strip()]

# Materialise the pairs. A bare `it.combinations(...)` is a one-shot iterator:
# once a later cell has looped over it, re-running that cell would silently
# see zero pairs until this cell is executed again.
q_combinations = list(it.combinations(queries_id, 2))

# Count the pairs directly. Compared with n! / (2 * (n - 2)!) this yields an
# exact integer (not a float such as 45.0) and also works when fewer than two
# queries are supplied, where math.factorial would be given a negative number.
output_num = len(q_combinations)

In [7]:
# For every query ID, collect the full identifier(s) in `all_species` whose
# short form (same position in `all_species_id`) equals the query.
queries = [
    full_identifier
    for query in queries_id
    for full_identifier, species_id in zip(all_species, all_species_id)
    if query == species_id
]

In [8]:
# Announce how many (source, target) pairs are about to be searched.
print(f"Printed {output_num} Source, Target pairs.")

# Run one BFS per query pair; BFS_SP prints its own result line and returns
# the path (or None), which is stored under a "source -> target" key.
stored_paths = {
    f"{source} -> {target}": BFS_SP(graph_dict, source, target)
    for source, target in q_combinations
}

Printed 45.0 Source, Target pairs.
No path.
No path.
No path.
No path.
No path.
No path.
No path.
No path.
No path.
No path.
Shortest path for (uniprot:Q9BXA5) --> (CHEBI:145365) =  ['uniprot:Q9BXA5', 'CHEBI:26806', 'pubchem:25145403', 'mesh:D015427', 'mesh:D009362', 'mesh:D005910', 'CHEBI:145365']
Shortest path for (uniprot:Q9BXA5) --> (pubchem:53481406) =  ['uniprot:Q9BXA5', 'CHEBI:26806', 'uniprot:P08865', 'uniprot:P28347', 'GO:0009058', 'pubchem:53481406']
No path.
No path.
No path.
Shortest path for (uniprot:Q9BXA5) --> (mesh:D001168) =  ['uniprot:Q9BXA5', 'CHEBI:26806', 'uniprot:P08865', 'uniprot:P28347', 'mesh:D001168']
Shortest path for (uniprot:Q9BXA5) --> (CHEBI:5181) =  ['uniprot:Q9BXA5', 'CHEBI:26806', 'pubchem:25145403', 'mesh:D015427', 'uniprot:P31944', 'CHEBI:5181']
Shortest path for (mesh:D055577) --> (CHEBI:145365) =  ['mesh:D055577', 'uniprot:Q13510', 'uniprot:P35520', 'uniprot:O94856', 'uniprot:P01375', 'mesh:D005910', 'CHEBI:145365']
Shortest path for (mesh:D055577)

--------------

In [11]:
# Break every stored path into its consecutive (node, next_node) hops.
# Pairs for which no path was stored (value None) stay None.
expanded_paths = {
    key: None if node_path is None else list(zip(node_path, node_path[1:]))
    for key, node_path in stored_paths.items()
}

In [12]:
# Collect, column by column, the edge-table rows that make up the stored
# shortest paths. Query pairs without a path are listed in `pathless_pairs`.
subsetted_edges = {
    "source" : [],
    "target" : [],
    "source_label" : [],
    "target_label" : [],
    "color_col" : [],
    "weight" : [],
    "thickness" : []
}

pathless_pairs = []

# Index the edge rows by their (source, target) pair once. Looking a hop up
# in this dictionary replaces a scan over the complete edge list for every
# single hop of every path.
rows_by_edge = {}
for row, edge in enumerate(zip(sources, targets)):
    rows_by_edge.setdefault(edge, []).append(row)

# Output column name -> parallel list holding that column of the edge table.
edge_columns = (
    ("source", sources),
    ("target", targets),
    ("source_label", sources_label),
    ("target_label", targets_label),
    ("color_col", color_col),
    ("weight", weight),
    ("thickness", thickness),
)

for key, hops in expanded_paths.items():
    if hops is None:
        pathless_pairs.append(key)
        continue

    for hop in hops:
        # An edge may occur on several rows; keep all of them, in table order.
        for row in rows_by_edge.get(tuple(hop), ()):
            for column, values in edge_columns:
                subsetted_edges[column].append(values[row])

In [13]:
# Turn the collected columns into a frame and keep only the first row for
# every unordered {source, target} pair.
query_edges = pd.DataFrame(subsetted_edges)

# Sorting the two end points within each row makes (a, b) and (b, a) compare
# equal, so `duplicated()` flags repeats regardless of edge direction.
edge_end_points = query_edges[['source', 'target']]
no_dupes = pd.DataFrame(np.sort(edge_end_points, axis=1))
first_occurrence = ~no_dupes.duplicated()

query_edges = query_edges[first_occurrence]
query_edges

Unnamed: 0,source,target,source_label,target_label,color_col,weight,thickness
0,uniprot:Q9BXA5,CHEBI:26806,GPR91::uniprot:Q9BXA5,succinate::chebi:CHEBI:26806,0.924142,1,2
1,CHEBI:26806,pubchem:25145403,succinate::chebi:CHEBI:26806,ND::pubchem:25145403,0.075858,1,1
2,pubchem:25145403,mesh:D015427,ND::pubchem:25145403,I/R::mesh:D015427,0.075858,1,1
3,mesh:D015427,mesh:D009362,ischemia-reperfusion injury::mesh:D015427,metastasis::mesh:D009362,0.924142,1,1
4,mesh:D009362,mesh:D005910,metastasis::mesh:D009362,glioma::mesh:D005910,0.924142,1,1
5,mesh:D005910,CHEBI:145365,glioma::mesh:D005910,Crenolanib::chebi:CHEBI:145365,0.5,1,2
7,CHEBI:26806,uniprot:P08865,succinate::chebi:CHEBI:26806,SA::uniprot:P08865,0.075858,1,1
8,uniprot:P08865,uniprot:P28347,SA::uniprot:P08865,Aa::uniprot:P28347,0.075858,1,1
9,uniprot:P28347,GO:0009058,AA::uniprot:P28347,synthesis::go:GO:0009058,0.924142,1,1
10,GO:0009058,pubchem:53481406,synthesis::go:GO:0009058,GalCer::pubchem:53481406,0.924142,1,1


In [15]:
# Lookup table from a node's short ID to its common name (label).

all_nodes_table = pd.read_csv("nodes_table_all_labelled.csv", encoding="latin-1")
node_names = all_nodes_table["Id"]
node_labels = all_nodes_table["Label"]

labels_dict = {}
for raw_id, label in zip(node_names, node_labels):
    # IDs of five characters or fewer are used as-is.
    if len(raw_id) <= 5:
        labels_dict[raw_id] = label
        continue

    # Longer IDs are keyed by their last two colon-separated fields.
    id_fields = raw_id.split(":")
    labels_dict[id_fields[-2] + ":" + id_fields[-1]] = label

In [16]:
# Every node ID that occurs on either end of a selected edge.
im_combined_nodes = subsetted_edges["source"] + subsetted_edges["target"]

# De-duplicate while keeping first-seen order. Going through `set` would give
# an order that changes between kernel sessions (string hashing is randomised
# per process), so the node table would come out in a different row order on
# every run.
query_nodes_im = {"Id" : list(dict.fromkeys(im_combined_nodes))}
query_nodes = pd.DataFrame.from_dict(query_nodes_im)

# Labels in exactly the same order as the "Id" column. A node ID without an
# entry in `labels_dict` raises KeyError.
query_nodes_labels = {
    "Label" : [labels_dict[node_im_id] for node_im_id in query_nodes_im["Id"]]
}

In [17]:
# Put the label column next to the ID column; both frames carry the same
# default row index, so the rows line up one-to-one.
query_nodes_labels_df = pd.DataFrame(query_nodes_labels)
labelled_query_nodes = pd.concat(
    (query_nodes, query_nodes_labels_df),
    axis="columns",
)

In [20]:
# Write the node and edge tables of the query sub-network to CSV files,
# without the row index column.
for table, csv_name in (
    (labelled_query_nodes, "query_nodes.csv"),
    (query_edges, "query_edges.csv"),
):
    table.to_csv(csv_name, index=False)