### Main notebook

In [1]:
import pandas as pd
import numpy as np
import json
import networkx as nx
from tqdm import tqdm
from compound import Compound
from reaction import Reaction
from graph import Graph
from data import Data

# Load the input tables. The compounds table comes from the sibling GNN_toxic
# project because that copy carries the toxicity column.
cpds = pd.read_csv('../GNN_toxic/data/raw/compounds_final.csv', index_col=0)
rxns, pairs = (
    pd.read_csv(table_path, index_col=0)
    for table_path in ('data/reactions_final.csv', 'data/pairs_final.csv')
)
cofactors = pd.read_csv('data/original/cofactors_KEGG.csv')

# Project containers: `data` collects Compound/Reaction objects,
# `graph` is initialised from the pairs table.
data = Data()
graph = Graph(pairs=pairs)

# Create a Compound object for each row of `cpds` and register it in `data`.
#
# The cofactor IDs are collected into a set once, before the loop: testing
# `entry in cofactors['Entry'].values` per row rescans the whole column on
# every iteration. Missing IDs are dropped so a NaN entry can never match.
cofactor_entries = set(cofactors['Entry'].dropna())

for index, row in cpds.iterrows():
    entry = row['Entry']
    compound = Compound(
        entry,
        row['Names'],
        row['Formula'],
        row['mol_weight'],
        row['SMILES'],
        entry in cofactor_entries,  # is_cofactor flag
        row['toxic'],               # is_toxic flag
    )
    data.add_element('compound', compound)

# Register every row of `rxns` in `data` as a Reaction.
# Column order matches the positional arguments: entry, name, compounds, enzyme.
for _, rxn_row in rxns.iterrows():
    rxn_fields = [rxn_row[column] for column in ('Entry', 'Names', 'Compound', 'EC Number')]
    data.add_element('reaction', Reaction(*rxn_fields))

In [2]:
# Build the graph from the registered compounds/reactions (`data`) and the
# `pairs` table. Must run before the two annotation calls below.
graph.create_graph(data=data, pairs=pairs)

# Add molecular-weight and SMILES-similarity information to the graph
# (presumably stored as edge attributes — confirm in graph.py).
graph.calculate_edge_mol_weight(data)
graph.calculate_smiles_similarity(data)

# Graph size in the recorded run (matches the progress-bar totals):
# nodes: 8591
# edges: 30026


100%|██████████| 8591/8591 [00:11<00:00, 717.39it/s] 
100%|██████████| 30026/30026 [00:00<00:00, 358673.54it/s]
100%|██████████| 30026/30026 [00:46<00:00, 645.30it/s]


In [3]:
######### VALIDATION SET FROM nicepath ###########
# Each row of the 'Pathway ' column (note the trailing space in the header)
# is a comma-separated list of compound IDs. Split it once and derive the
# source (first ID), target (last ID) and the full list from that result,
# rather than re-splitting the string for every derived column.
test_cases = pd.read_csv('data/original/test_cases.csv')
pathway_steps = test_cases['Pathway '].str.split(',')
test_cases['source'] = pathway_steps.str[0]
test_cases['target'] = pathway_steps.str[-1]
test_cases['paths_list'] = pathway_steps

# Run the validation using 'mol_weight' as the weight; the result is kept in `paths`.
paths = graph.validate(test_cases, 'mol_weight')

  0%|          | 0/50 [00:00<?, ?it/s]

 34%|███▍      | 17/50 [00:21<00:40,  1.22s/it]

Path with length 2 ['C00047', 'C00408']


100%|██████████| 50/50 [01:19<00:00,  1.60s/it]

Correct pathway predictions: 19
Correct pathway predictions (%): 38.0





In [11]:
# Print the raw pathway strings of three selected validation cases.
for case_position in (20, 42, 47):
    print(test_cases.iloc[case_position]['Pathway '])

C00441,C20258,C03972,C05539,C20911,C20912,C20913,C20914,C20915,C20916,C20917,C20918,C20921
C06789,C06790,C06791,C06793,C06547,C06548,C00024
C11249,C00854,C00414,C01880,C06103,C06102,C06104,C14143,C14144,C14145,C02232


In [6]:
# Display the validation results returned by graph.validate
# (the recorded output shows a 'Pathway' and a 'Correct' column).
paths

Unnamed: 0,Pathway,Correct
0,"[C00082, C00811, C01197, C01494, C05619, C0048...",True
1,"[C00223, C12096, C00029, C00761]",False
2,"[C00811, C01197, C01494, C05619, C00482, C0117...",True
3,"[C00079, C00423, C00540, C00903]",True
4,"[C00223, C12096, C00029, C17750]",False
5,"[C06561, C00509, C16492, C01460]",True
6,"[C05903, C12249, C12634, C12635, C12636]",True
7,"[C01477, C01514, C03515, C12632, C04900]",True
8,"[C05903, C00389, C10107, C12633, C11620]",True
9,"[C05905, C08604, C12095, C12096, C16299]",False


In [4]:
# NOTE(review): this always raises ZeroDivisionError. Presumably a deliberate
# barrier so that "Run All" stops before the exploratory cells below — confirm,
# and consider removing it before sharing the notebook.
1/0

ZeroDivisionError: division by zero

In [None]:
# Query the graph for constrained shortest path(s) from C00082 to C01533 using
# the 'mol_weight' weight. `p` is iterated below as a collection of paths
# (lists of compound IDs); the meaning of `idx` and `idxcom` is not visible
# from this notebook — TODO document from graph.py.
p, idx, idxcom = graph.constrained_shortest_path('C00082', 'C01533', weight='mol_weight')


In [None]:
# Print each candidate path on its own line.
for candidate_path in p:
    print(candidate_path)

['C00082', 'C00355', 'C10447', 'C01197', 'C01494', 'C05619', 'C00482', 'C05610', 'C02325', 'C01533']
['C00082', 'C00822', 'C05604', 'C00355', 'C10447', 'C01197', 'C01494', 'C05619', 'C00482', 'C05610', 'C02325', 'C01533']
['C00082', 'C00811', 'C01197', 'C01494', 'C05619', 'C00482', 'C05610', 'C02325', 'C01533']
['C00082', 'C00355', 'C04045', 'C22038', 'C10447', 'C01197', 'C01494', 'C05619', 'C00482', 'C05610', 'C02325', 'C01533']
['C00082', 'C00822', 'C05604', 'C00355', 'C04045', 'C22038', 'C10447', 'C01197', 'C01494', 'C05619', 'C00482', 'C05610', 'C02325', 'C01533']
['C00082', 'C00355', 'C10447', 'C01197', 'C01494', 'C02666', 'C12204', 'C05610', 'C02325', 'C01533']


In [None]:
# Inspect the third value returned by constrained_shortest_path
# (5 in the recorded run); its meaning is not visible here — TODO document.
idxcom

5

In [None]:
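# Pathway from C00047 to C00408 through C04076 and C00450 — presumably the
# reference route for the validation case that was predicted as the 2-step
# path ['C00047', 'C00408'] (TODO confirm):
# C00047,C04076,C00450,C00408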

### Study graph

In [None]:
# Read the test_cases csv file and derive source / target / path list columns.
# The 'Pathway ' column (trailing space in the header) holds comma-separated
# compound IDs; it is split once and the result reused for all three columns.
test_cases = pd.read_csv('data/original/test_cases.csv')
pathway_steps = test_cases['Pathway '].str.split(',')
test_cases['source'] = pathway_steps.str[0]
test_cases['target'] = pathway_steps.str[-1]
test_cases['paths_list'] = pathway_steps

# Show the first test case as a sanity check.
test_cases.iloc[0]

Pathway       C00082,C00811,C01197,C01494,C05619,C00482,C056...
source                                                   C00082
target                                                   C01533
paths_list    [C00082, C00811, C01197, C01494, C05619, C0048...
Name: 0, dtype: object

In [None]:
# Keep only the validation cases whose predicted pathway was not correct.
incorrect_mask = ~paths['Correct']
paths[incorrect_mask]

Unnamed: 0,Pathway,Correct
1,"[C00223, C12096, C00029, C00761]",False
4,"[C00223, C12096, C00029, C17750]",False
9,"[C05905, C08604, C12095, C12096, C16299]",False
10,"[C05904, C12137, C12640, C12641, C12647]",False
11,"[C05908, C12138, C16301, C16303]",False
12,"[C00078, C00398, C21762, C21778, C07576]",False
13,"[C06160, C06161, C06517, C06520, C02105, C0210...",False
15,"[C00079, C01456, C02046, C10860]",False
16,"[C00047, C00408]",False
17,"[C07481, C07130, C00385]",False


### Try clustering graph nodes

### Try similarity based on SMILES

In [None]:
from rdkit import Chem
from rdkit import DataStructs


def fingerprint_similarity(entry_a, entry_b):
    """Return the RDKit fingerprint similarity between two compounds.

    Parameters
    ----------
    entry_a, entry_b : str
        Compound IDs looked up through ``data.get_compound_by_id``; the
        ``smiles`` attribute of each compound is parsed with RDKit.

    Returns
    -------
    float
        ``DataStructs.FingerprintSimilarity`` of the two RDKit fingerprints
        (called with its default metric).

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings. ``MolFromSmiles``
        returns ``None`` in that case, which would otherwise surface as an
        opaque error inside the fingerprinting call.
    """
    fingerprints = []
    for entry in (entry_a, entry_b):
        smiles = data.get_compound_by_id(entry).smiles
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f'Could not parse SMILES for compound {entry}: {smiles!r}')
        fingerprints.append(Chem.RDKFingerprint(mol))
    return DataStructs.FingerprintSimilarity(fingerprints[0], fingerprints[1])


# Compare C00223 against two other compounds.
for other_entry in ('C00323', 'C12096'):
    print('Similarity: ' + str(fingerprint_similarity('C00223', other_entry)))

Similarity: 0.989145183175034
Similarity: 0.5102379634753735
