### Main notebook

In [1]:
# add folder for imports
import sys
sys.path.append('../')

import json

import networkx as nx
import numpy as np
import pandas as pd
import rdkit
from rdkit import Chem
from rdkit import DataStructs
from rdkit import RDLogger
from tqdm import tqdm

from compound import Compound
from data import Data
from graph import Graph
from pathway import Pathway
from reaction import Reaction
from utils import create_compound, create_reaction
from utils import validate

# suppress rdkit warnings
RDLogger.DisableLog('rdApp.*')


# Load the input tables from CSV. For the compound, reaction and pair tables,
# index_col=0 makes the first CSV column the DataFrame index; the cofactor
# table is read with the default integer index.
cpds = pd.read_csv('../data/compounds_final.csv', index_col=0) # compound table, including toxicity
rxns = pd.read_csv('../data/reactions_final.csv', index_col=0)
pairs = pd.read_csv('../data/pairs_final_RPAIRS_pred.csv', index_col=0)
cofactors = pd.read_csv('../data/original/cofactors_KEGG.csv')

# Create the project-level containers used by the rest of the notebook.
# `pathway` is instantiated here but not used again in the visible cells.
data = Data()
graph = Graph(pairs=pairs)
pathway = Pathway()

# Fill `data` from the tables. Both helpers return the container, which is
# rebound to `data`; compounds are loaded before reactions.
# NOTE(review): presumably reactions reference already-registered compounds,
# so this order matters -- confirm in utils.create_reaction.
data = create_compound(data, cpds, cofactors)
data = create_reaction(data, rxns)

# Build the graph from the populated data and the pairs table.
# NOTE(review): `pairs` is passed both to the Graph constructor above and to
# create_graph here -- confirm whether both are required.
graph.create_graph(data=data, pairs=pairs)

# Compute the edge attributes used for path search (the validation cell
# searches with 'mol_weight').
# NOTE(review): assumed to annotate the graph in place, since the return
# values are discarded -- confirm in graph.py.
graph.calculate_edge_mol_weight(data)
graph.calculate_smiles_similarity(data)

In [3]:
######### VALIDATION SET FROM nicepath ###########
# Each row of test_cases.csv holds one reference pathway as a comma-separated
# list of compound IDs in the 'Pathway ' column (the column name is spelled
# with a trailing space in this code, so it must match the CSV header).
test_cases = pd.read_csv('../data/original/test_cases.csv')

# Split every pathway string once and derive the endpoints from the result,
# instead of re-splitting the same string for each derived column.
pathway_steps = test_cases['Pathway '].apply(lambda pathway: pathway.split(','))
test_cases['source'] = pathway_steps.str[0]    # first compound of the pathway
test_cases['target'] = pathway_steps.str[-1]   # last compound of the pathway
test_cases['paths_list'] = pathway_steps

# To debug on a single pathway, uncomment the slice below (row 11 as an
# example). The stop index must be greater than the start index: the previous
# `iloc[11:1]` would have selected zero rows.
# test_cases = test_cases.iloc[11:12]

# Run the path search on every test case using the 'mol_weight' edge weight;
# `validate` prints the number and percentage of correct predictions.
paths = validate(test_cases, graph, 'mol_weight')

  0%|          | 0/50 [00:00<?, ?it/s]

 24%|██▍       | 12/50 [00:04<00:15,  2.47it/s]

***** No path found between C00078 and C07576 *****
***** Node not found for C06160 or C20299 *****


 36%|███▌      | 18/50 [00:05<00:07,  4.02it/s]

***** Path with length 2 ['C00047', 'C00408'] *****


 48%|████▊     | 24/50 [00:08<00:10,  2.47it/s]

***** Node not found for C00043 or C21263 *****


 52%|█████▏    | 26/50 [00:08<00:08,  2.83it/s]

***** Node not found for C20953 or C20942 *****


100%|██████████| 50/50 [00:17<00:00,  2.84it/s]

Correct pathway predictions: 23
Correct pathway predictions (%): 46.0





### Study graph

### Try clustering graph nodes

### Try similarity based on SMILES

In [4]:
def fingerprint_similarity(compound_id_a, compound_id_b):
    """Return the RDKit fingerprint similarity between two compounds.

    Both compounds are looked up in the notebook-level ``data`` container,
    their SMILES strings are parsed with RDKit, and the resulting molecules
    are compared via ``RDKFingerprint`` and
    ``DataStructs.FingerprintSimilarity``.

    Parameters
    ----------
    compound_id_a, compound_id_b : str
        Compound identifiers understood by ``data.get_compound_by_id``.

    Returns
    -------
    float
        Similarity score of the two fingerprints.

    Raises
    ------
    ValueError
        If RDKit cannot parse the SMILES of either compound.
    """
    molecules = []
    for compound_id in (compound_id_a, compound_id_b):
        smiles = data.get_compound_by_id(compound_id).smiles
        molecule = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals a parse failure by returning None, and RDKit
        # logging is disabled in this notebook, so fail loudly here instead of
        # with an opaque error inside RDKFingerprint.
        if molecule is None:
            raise ValueError(
                'Could not parse SMILES for compound %s: %r' % (compound_id, smiles)
            )
        molecules.append(molecule)

    fingerprint_a, fingerprint_b = [Chem.RDKFingerprint(mol) for mol in molecules]
    return DataStructs.FingerprintSimilarity(fingerprint_a, fingerprint_b)


# Compare the reference compound against two other compounds.
REFERENCE_COMPOUND = 'C00223'
for other_compound in ('C00323', 'C12096'):
    similarity = fingerprint_similarity(REFERENCE_COMPOUND, other_compound)
    print('Similarity: '+str(similarity))

Similarity: 0.989145183175034
Similarity: 0.5102379634753735
