### Main notebook

In [1]:
import pandas as pd
import numpy as np
import json
import networkx as nx
from tqdm import tqdm

# Add the parent directory to sys.path so the project modules below can be imported
import sys
sys.path.append('../')
from compound import Compound
from reaction import Reaction
from graph import Graph
from data import Data

# suppress RDKit warnings
import rdkit
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

# Load the input tables from CSV. The first three files use their first column
# as the DataFrame index; the cofactor list is read with the default index.
cpds, rxns, pairs = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../data/compounds_final.csv',  # compounds (original note: containing toxicity)
        '../data/reactions_final.csv',  # reactions
        '../data/pairs_final_RPAIRS_pred.csv',  # compound pairs
    )
)
cofactors = pd.read_csv('../data/original/cofactors_KEGG.csv')

# Instantiate the project containers: a Data object and a Graph initialised
# with the pairs table.
data, graph = Data(), Graph(pairs=pairs)

# Create a Compound object for each row in the DataFrame and add it to the data.
# The cofactor IDs are collected into a set once, so every membership test is a
# constant-time lookup instead of a scan over the whole cofactors column for
# each compound row.
cofactor_entries = set(cofactors['Entry'])

for index, row in cpds.iterrows():
    entry = row['Entry']
    name = row['Names']
    formula = row['Formula']
    mw = row['mol_weight']
    smiles = row['SMILES']
    # True when this compound's entry ID appears in the cofactors table.
    is_cofactor = entry in cofactor_entries
    # NOTE: a `row['toxic']` lookup used to sit here but was disabled; Compound
    # is currently built without a toxicity flag.
    is_polymer = row['polymer']

    compound = Compound(entry, name, formula, mw, smiles, is_cofactor, is_polymer)
    data.add_element('compound', compound)

# Register one Reaction in `data` per row of the reactions table. The fields
# are passed positionally in the order: entry ID, names, compounds, EC number.
for row_label, rxn_row in rxns.iterrows():
    data.add_element(
        'reaction',
        Reaction(
            rxn_row['Entry'],
            rxn_row['Names'],
            rxn_row['Compound'],
            rxn_row['EC Number'],
        ),
    )

In [2]:
# CREATE GRAPH
# Build the graph from the loaded `data` object and the `pairs` table.
graph.create_graph(data=data, pairs=pairs)

# Post-process the graph using the compound data. Judging by the method names
# these attach molecular-weight and SMILES-similarity information to the
# graph -- TODO confirm against graph.py.
graph.calculate_edge_mol_weight(data)
graph.calculate_smiles_similarity(data)

# Graph size recorded from the run shown below (matches the progress-bar totals):
# nodes: 7997 
# edges: 11783


100%|██████████| 7997/7997 [00:09<00:00, 836.20it/s]
100%|██████████| 11783/11783 [00:00<00:00, 400209.61it/s]
100%|██████████| 11783/11783 [00:25<00:00, 461.62it/s]


In [3]:
######### VALIDATION SET FROM nicepath ###########
test_cases = pd.read_csv('../data/original/test_cases.csv')

# Each 'Pathway ' cell is a comma-separated list of compound IDs (the column
# name, as used when reading the CSV, ends with a space). Split every pathway
# once and derive the source (first ID), the target (last ID) and the full
# list of steps from that single result.
pathway_steps = test_cases['Pathway '].apply(lambda pathway: pathway.split(','))
test_cases['source'] = pathway_steps.apply(lambda steps: steps[0])
test_cases['target'] = pathway_steps.apply(lambda steps: steps[-1])
test_cases['paths_list'] = pathway_steps

# To debug a single pathway, slice one row here before validating, e.g.
#   test_cases = test_cases.iloc[11:12]
# (the previously commented-out `iloc[11:1]` would select no rows at all).

# Run the validation with 'mol_weight' as the second argument; see
# Graph.validate for how that argument is used.
paths = graph.validate(test_cases, 'mol_weight')

  0%|          | 0/50 [00:00<?, ?it/s]

Searching for pathway between C00082 and C01533


  2%|▏         | 1/50 [00:00<00:25,  1.93it/s]

Searching for pathway between C00223 and C00761


  4%|▍         | 2/50 [00:00<00:21,  2.28it/s]

Searching for pathway between C00811 and C02887


  6%|▌         | 3/50 [00:01<00:20,  2.33it/s]

Searching for pathway between C00079 and C00903


  8%|▊         | 4/50 [00:01<00:20,  2.24it/s]

Searching for pathway between C00223 and C17750


 10%|█         | 5/50 [00:02<00:20,  2.17it/s]

Searching for pathway between C06561 and C01460


 12%|█▏        | 6/50 [00:02<00:16,  2.64it/s]

Searching for pathway between C05903 and C12636
Searching for pathway between C01477 and C04900


 16%|█▌        | 8/50 [00:02<00:12,  3.49it/s]

Searching for pathway between C05903 and C11620


 18%|█▊        | 9/50 [00:03<00:11,  3.52it/s]

Searching for pathway between C05905 and C16299


 20%|██        | 10/50 [00:03<00:12,  3.09it/s]

Searching for pathway between C05904 and C12647


 22%|██▏       | 11/50 [00:04<00:14,  2.70it/s]

Searching for pathway between C05908 and C16303


 24%|██▍       | 12/50 [00:04<00:15,  2.48it/s]

Searching for pathway between C00078 and C07576
***** No path found between C00078 and C07576 *****
Searching for pathway between C06160 and C20299
***** Node not found for C06160 or C20299 *****
Searching for pathway between C05191 and C06165


 30%|███       | 15/50 [00:04<00:08,  4.14it/s]

Searching for pathway between C00079 and C10860


 34%|███▍      | 17/50 [00:05<00:09,  3.55it/s]

Searching for pathway between C00047 and C00408
***** Path with length 2 ['C00047', 'C00408'] *****
Searching for pathway between C07481 and C00385


 36%|███▌      | 18/50 [00:05<00:07,  4.04it/s]

Searching for pathway between C00082 and C08557


 40%|████      | 20/50 [00:06<00:07,  4.17it/s]

Searching for pathway between C08539 and C08542
Searching for pathway between C00441 and C20921


 42%|████▏     | 21/50 [00:06<00:08,  3.32it/s]

Searching for pathway between C01121 and C00413
Searching for pathway between C02627 and C01822


 46%|████▌     | 23/50 [00:07<00:07,  3.57it/s]

Searching for pathway between C00043 and C01759


 48%|████▊     | 24/50 [00:08<00:10,  2.49it/s]

Searching for pathway between C00043 and C21263
***** Node not found for C00043 or C21263 *****
Searching for pathway between C12468 and C05080


 52%|█████▏    | 26/50 [00:08<00:08,  2.85it/s]

Searching for pathway between C08588 and C08589
Searching for pathway between C20953 and C20942
***** Node not found for C20953 or C20942 *****
Searching for pathway between C00002 and C01610


 58%|█████▊    | 29/50 [00:09<00:06,  3.36it/s]

Searching for pathway between C00047 and C05554


 60%|██████    | 30/50 [00:09<00:06,  3.20it/s]

Searching for pathway between C00750 and C00106


 62%|██████▏   | 31/50 [00:10<00:06,  2.96it/s]

Searching for pathway between C00418 and C03461


 64%|██████▍   | 32/50 [00:10<00:06,  2.86it/s]

Searching for pathway between C00341 and C00400


 66%|██████▌   | 33/50 [00:11<00:06,  2.44it/s]

Searching for pathway between C00341 and C20944


 68%|██████▊   | 34/50 [00:11<00:06,  2.52it/s]

Searching for pathway between C11899 and C07394
Searching for pathway between C05413 and C15881


 72%|███████▏  | 36/50 [00:11<00:04,  2.98it/s]

Searching for pathway between C05413 and C15900


 74%|███████▍  | 37/50 [00:12<00:04,  2.99it/s]

Searching for pathway between C01789 and C15791


 76%|███████▌  | 38/50 [00:12<00:03,  3.17it/s]

Searching for pathway between C01126 and C16507


 78%|███████▊  | 39/50 [00:13<00:04,  2.73it/s]

Searching for pathway between C00082 and C05582


 80%|████████  | 40/50 [00:13<00:03,  3.03it/s]

Searching for pathway between C00180 and C00091


 82%|████████▏ | 41/50 [00:14<00:04,  2.23it/s]

Searching for pathway between C09822 and C06715


 84%|████████▍ | 42/50 [00:14<00:03,  2.27it/s]

Searching for pathway between C06789 and C00024


 86%|████████▌ | 43/50 [00:15<00:03,  2.01it/s]

Searching for pathway between C01455 and C00512


 88%|████████▊ | 44/50 [00:15<00:02,  2.10it/s]

Searching for pathway between C06756 and C06730


 90%|█████████ | 45/50 [00:15<00:02,  2.46it/s]

Searching for pathway between C16391 and C16400


 92%|█████████▏| 46/50 [00:15<00:01,  2.86it/s]

Searching for pathway between C07083 and C00186


 94%|█████████▍| 47/50 [00:16<00:01,  2.58it/s]

Searching for pathway between C11249 and C02232


 96%|█████████▌| 48/50 [00:17<00:01,  1.99it/s]

Searching for pathway between C00829 and C00805


100%|██████████| 50/50 [00:17<00:00,  2.84it/s]

Searching for pathway between C07535 and C18283
Correct pathway predictions: 23
Correct pathway predictions (%): 46.0





### Study graph

### Try clustering graph nodes

### Try similarity based on SMILES

In [4]:
from rdkit import Chem
from rdkit import DataStructs


def fingerprint_similarity(entry_a, entry_b):
    """Return the RDKit fingerprint similarity of two compounds stored in `data`.

    Parameters
    ----------
    entry_a, entry_b : str
        Compound IDs that are looked up with ``data.get_compound_by_id``.

    Returns
    -------
    float
        ``DataStructs.FingerprintSimilarity`` of the two RDK fingerprints,
        computed with RDKit's default metric.

    Raises
    ------
    ValueError
        If RDKit cannot parse the SMILES string of either compound.
    """
    fingerprints = []
    for entry in (entry_a, entry_b):
        smiles = data.get_compound_by_id(entry).smiles
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals a parse failure by returning None; fail with a
        # clear message instead of an opaque error from the fingerprint call.
        if mol is None:
            raise ValueError('Could not parse SMILES for compound ' + str(entry) + ': ' + repr(smiles))
        fingerprints.append(Chem.RDKFingerprint(mol))
    return DataStructs.FingerprintSimilarity(fingerprints[0], fingerprints[1])


# Compare the reference compound C00223 with two other compounds.
for other_entry in ('C00323', 'C12096'):
    s = fingerprint_similarity('C00223', other_entry)
    print('Similarity: '+str(s))

Similarity: 0.989145183175034
Similarity: 0.5102379634753735
