### Main notebook

In [2]:
import pandas as pd
import numpy as np
import json
import networkx as nx
from tqdm import tqdm
from compound import Compound
from reaction import Reaction
from graph import Graph
from data import Data

# suppress rdkit warnings
import rdkit
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

# Read the input tables from CSV.
cpds = pd.read_csv('../GNN_toxic/data/raw/compounds_final.csv', index_col=0)  # includes the 'toxic' column
rxns = pd.read_csv('data/reactions_final.csv', index_col=0)
pairs = pd.read_csv('data/pairs_final.csv', index_col=0)
cofactors = pd.read_csv('data/original/cofactors_KEGG.csv')

# Create the container and graph instances.
data = Data()
graph = Graph(pairs=pairs)

# Build the cofactor lookup once. Testing membership against the raw column
# inside the loop rescans every cofactor entry for every compound row.
# NaN entries are dropped so that a missing 'Entry' can never match.
cofactor_entries = set(cofactors['Entry'].dropna())

# Create a Compound object for each row of the compounds table and register it.
for index, row in cpds.iterrows():
    entry = row['Entry']
    name = row['Names']
    formula = row['Formula']
    mw = row['mol_weight']
    smiles = row['SMILES']
    is_cofactor = entry in cofactor_entries
    is_toxic = row['toxic']

    compound = Compound(entry, name, formula, mw, smiles, is_cofactor, is_toxic)
    data.add_element('compound', compound)

# Create a Reaction object for each row of the reactions table and register it.
for index, row in rxns.iterrows():
    entry = row['Entry']
    name = row['Names']
    compounds = row['Compound']
    enzyme = row['EC Number']

    reaction = Reaction(entry, name, compounds, enzyme)
    data.add_element('reaction', reaction)

In [3]:
# CREATE GRAPH
# Build the graph from the loaded Data container and the pairs table.
# (Implemented in graph.py, which is not shown here.)
graph.create_graph(data=data, pairs=pairs)

# Judging by the method names, these attach per-edge molecular-weight and
# SMILES-similarity values using the loaded data -- TODO confirm in graph.py.
graph.calculate_edge_mol_weight(data)
graph.calculate_smiles_similarity(data)

# Sizes noted from a previous run:
# nodes: 8591
# edges: 30026


100%|██████████| 8591/8591 [00:14<00:00, 580.57it/s]
100%|██████████| 30026/30026 [00:00<00:00, 218319.35it/s]
100%|██████████| 30026/30026 [00:49<00:00, 604.14it/s] 


In [4]:
######### VALIDATION SET FROM nicepath ###########
test_cases = pd.read_csv('data/original/test_cases.csv')

# Split each comma-separated pathway string once and derive both endpoints
# from the resulting list. The column is addressed as 'Pathway ' (with a
# trailing space) everywhere in this notebook.
pathway_steps = test_cases['Pathway '].apply(lambda x: x.split(','))
test_cases['source'] = pathway_steps.str[0]
test_cases['target'] = pathway_steps.str[-1]
test_cases['paths_list'] = pathway_steps

# Validate on a single pathway for quick testing. The subset gets its own
# name so that `test_cases` keeps every row: a later cell indexes it by
# position (rows 20, 42, 47), which fails on a one-row frame.
test_cases_subset = test_cases.iloc[0:1]

paths = graph.validate(test_cases_subset, 'mol_weight')

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:02<00:00,  2.19s/it]

Correct pathway predictions: 1
Correct pathway predictions (%): 100.0





### Display pathway in BiGG style

In [9]:
# Take the first entry of the 'Pathway' column from the validation results
# and print it.
predicted_pathways = paths['Pathway']
path = predicted_pathways.iloc[0]
print(path)

['C00082', 'C00811', 'C01197', 'C01494', 'C05619', 'C00482', 'C05610', 'C02325', 'C01533']


In [13]:
# Print every KEGG compound ID of the pathway next to the result of the
# KEGG -> BiGG compound lookup.
for kegg_id in path:
    print(kegg_id, graph.kegg_to_bigg_compound(kegg_id))


C00082 tyr__L
C00811 T4hcinnm
C01197 34dhcinm
C01494 fer
C05619 Not found
C00482 Not found
C05610 Not found
C02325 Not found
C01533 Not found


In [None]:
# Show the raw pathway strings of three test cases, selected by row position.
for row_position in (20, 42, 47):
    print(test_cases.iloc[row_position]['Pathway '])

In [None]:
# Look up a single KEGG compound ID through the KEGG -> BiGG compound mapping.
kegg_cpd = 'C00084'
bigg_cpd = graph.kegg_to_bigg_compound(kegg_cpd)
print(bigg_cpd)

# NOTE(review): a KEGG reaction ID (R-prefix) is passed to the *compound*
# lookup here -- presumably a reaction-specific mapping was intended;
# confirm against graph.py.
kegg_rxn = 'R00084'
bigg_rxn = graph.kegg_to_bigg_compound(kegg_rxn)
print(bigg_rxn)

In [None]:
# NOTE(review): this always raises ZeroDivisionError -- presumably a deliberate
# stop so that "Run All" halts before the cells below; confirm, and consider
# an explicit, self-describing stop instead.
1/0

In [13]:
# Compute the candidate paths in this cell. The call used to be commented out,
# which left `p` undefined on a fresh kernel (NameError under Restart & Run All).
# NOTE(review): the signature and the 3-tuple return are taken from the
# previously commented-out line -- confirm against graph.py.
p, idx, idxcom = graph.constrained_shortest_path('C00082', 'C01533', weight='mol_weight')

for compound_list in p:
    print(compound_list)
    # Walk over consecutive compound pairs along the path.
    for cpd_a, cpd_b in zip(compound_list, compound_list[1:]):
        # Rows of graph.pairs going cpd_a -> cpd_b. The result is kept under its
        # own name so the `pairs` DataFrame loaded at the top of the notebook
        # is not overwritten.
        matching_pairs = graph.pairs[(graph.pairs['source'] == cpd_a) & (graph.pairs['target'] == cpd_b)]
        # Nothing in that direction: fall back to the reverse direction.
        if len(matching_pairs) == 0:
            matching_pairs = graph.pairs[(graph.pairs['source'] == cpd_b) & (graph.pairs['target'] == cpd_a)]
        print(len(matching_pairs))
    # Only the first path is inspected.
    break

['C00082', 'C00355', 'C10447', 'C01197', 'C01494', 'C05619', 'C00482', 'C05610', 'C02325', 'C01533']
7
1
1
1
1
1
2
1
1


### Study graph

### Try clustering graph nodes

### Try similarity based on SMILES

In [None]:
from rdkit import Chem
from rdkit import DataStructs


def smiles_similarity(smiles_a, smiles_b):
    """Return the RDKit fingerprint similarity of two SMILES strings.

    Both strings are parsed with ``Chem.MolFromSmiles``, turned into RDKit
    fingerprints with ``Chem.RDKFingerprint`` and compared with
    ``DataStructs.FingerprintSimilarity`` using its default metric.

    Parameters:
        smiles_a: SMILES string of the first compound.
        smiles_b: SMILES string of the second compound.

    Returns:
        The similarity value reported by RDKit.

    Raises:
        ValueError: if either SMILES string cannot be parsed.
    """
    mols = [Chem.MolFromSmiles(smiles_a), Chem.MolFromSmiles(smiles_b)]
    # MolFromSmiles signals a parse failure by returning None; fail with a
    # clear message instead of an opaque error from the fingerprint call.
    for smiles, mol in zip((smiles_a, smiles_b), mols):
        if mol is None:
            raise ValueError('Could not parse SMILES: ' + repr(smiles))
    fingerprints = [Chem.RDKFingerprint(mol) for mol in mols]
    return DataStructs.FingerprintSimilarity(fingerprints[0], fingerprints[1])


# Compare compound C00223 against two other compounds.
for other_id in ('C00323', 'C12096'):
    s = smiles_similarity(
        data.get_compound_by_id('C00223').smiles,
        data.get_compound_by_id(other_id).smiles,
    )
    print('Similarity: '+str(s))