In [1]:
import pandas as pd
import numpy as np
import json
import networkx as nx
from tqdm import tqdm
from compound import Compound
from reaction import Reaction
from graph import Graph
from data import Data

# ---------------------------------------------------------------------------
# Load the input tables.
# The first three tables use their first CSV column as the index; the
# cofactor table is read with the default integer index.
# ---------------------------------------------------------------------------
cpds = pd.read_csv('../GNN_toxic/data/raw/compounds_final.csv', index_col=0) # containing toxicity
rxns = pd.read_csv('data/reactions_final.csv', index_col=0)
pairs = pd.read_csv('data/pairs_final.csv', index_col=0)
cofactors = pd.read_csv('data/original/cofactors_KEGG.csv')

# Build the cofactor lookup once, outside the loop. Testing membership against
# `cofactors['Entry'].values` inside the loop re-extracts the column and scans
# it linearly for every compound row; a set gives the same True/False answer
# with a constant-time lookup.
cofactor_entries = set(cofactors['Entry'])

# create class instances
data = Data()
graph = Graph(pairs=pairs)

# Create a Compound object for each row in the DataFrame and add it to the data
for index, row in cpds.iterrows():
    entry = row['Entry']
    name = row['Names']
    formula = row['Formula']
    mw = row['mol_weight']
    smiles = row['SMILES']
    is_cofactor = entry in cofactor_entries
    is_toxic = row['toxic']

    compound = Compound(entry, name, formula, mw, smiles, is_cofactor, is_toxic)
    data.add_element('compound', compound)

# Create a Reaction object for each row in the DataFrame and add it to the data
for index, row in rxns.iterrows():
    entry = row['Entry']
    name = row['Names']
    compounds = row['Compound']
    enzyme = row['EC Number']

    reaction = Reaction(entry, name, compounds, enzyme)
    data.add_element('reaction', reaction)

# CREATE GRAPH
graph.create_graph(data=data, pairs=pairs)

# Edge annotation passes over the finished graph.
# NOTE(review): presumably these attach the molecular-weight and SMILES
# similarity attributes used by the path search -- confirm in graph.py.
graph.calculate_edge_mol_weight(data)
graph.calculate_smiles_similarity(data)

# Graph size noted by the original author for this dataset:
# nodes: 8591 
# edges: 30026


100%|██████████| 8591/8591 [00:10<00:00, 837.96it/s] 
100%|██████████| 30026/30026 [00:00<00:00, 379792.86it/s]
100%|██████████| 30026/30026 [00:51<00:00, 580.31it/s]


In [2]:
# Target compound of the pathway search, given by its KEGG compound ID.
butanol_KEGG = "C06142"

# Candidate source compounds to start the search from.
# NOTE(review): these look like the KEGG IDs for acetyl-CoA,
# phosphoenolpyruvate and pyruvate -- confirm against the KEGG database.
src_list = [
    "C00024",
    "C00074",
    "C00022",
]

In [3]:
# Search for routes from the first source compound to the butanol target,
# using the 'mol_weight' edge attribute as the path weight. The call returns
# five values; only `path`, `smiles_sim` and `idx` are inspected in the
# cells of this notebook that are visible here.
path, smiles_sim, idx, com_changes, idxcom = graph.constrained_shortest_path(
    src_list[0],
    butanol_KEGG,
    weight='mol_weight',
)

# Print every candidate route on its own line.
for route in path:
    print(route)

['C00024', 'C00136', 'C01412', 'C06142']
['C00024', 'C00332', 'C00136', 'C01412', 'C06142']
['C00024', 'C00083', 'C00332', 'C00136', 'C01412', 'C06142']
['C00024', 'C05231', 'C00877', 'C00136', 'C01412', 'C06142']
['C00024', 'C00894', 'C05668', 'C05989', 'C00083', 'C00332', 'C00136', 'C01412', 'C06142']
['C00024', 'C11062', 'C00877', 'C00136', 'C01412', 'C06142']
['C00024', 'C11062', 'C02331', 'C00877', 'C00136', 'C01412', 'C06142']
['C00024', 'C00332', 'C01144', 'C00877', 'C00136', 'C01412', 'C06142']
['C00024', 'C00332', 'C03561', 'C00877', 'C00136', 'C01412', 'C06142']
['C00024', 'C00332', 'C01144', 'C03561', 'C00877', 'C00136', 'C01412', 'C06142']


In [5]:
# Display the `idx` value unpacked from graph.constrained_shortest_path(...).
# NOTE(review): its meaning is not shown in this notebook -- presumably an
# index into `path`; confirm in graph.py.
idx

0

In [12]:
# Print each route's similarity score divided by the number of compounds in
# that route, so routes of different lengths can be compared.
#
# The loop variables are deliberately NOT named `idx`: that name holds a value
# returned by graph.constrained_shortest_path(...), and reusing it here would
# silently overwrite it for every cell executed afterwards.
for route_no, route in enumerate(path):
    print(smiles_sim[route_no] / len(route))

0.4309227663454025
0.3458773862240911
0.2905928327789166
0.29412455580030566
0.1950035088004592
0.29248849223572015
0.2519242059572934
0.2521211355892645
0.2521211355892645
0.22060599364060646


In [6]:
# Display the raw similarity scores unpacked from
# graph.constrained_shortest_path(...), before any per-route normalisation.
# NOTE(review): appears to hold one score per entry of `path` -- confirm.
smiles_sim

[1.72369106538161,
 1.7293869311204555,
 1.7435569966734998,
 1.7647473348018339,
 1.7550315792041327,
 1.754930953414321,
 1.763469441701054,
 1.7648479491248517,
 1.7648479491248517,
 1.7648479491248517]