### Main notebook

In [1]:
# add folder for imports
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
import json
import networkx as nx
from tqdm import tqdm
from compound import Compound
from reaction import Reaction
from graph import Graph
from data import Data
from pathway import Pathway
from utils import create_compound, create_reaction
# suppres rdkit warnings
import rdkit
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')


# read data from csv
cpds = pd.read_csv('../data/compounds_final.csv', index_col=0) # containing toxicity
rxns = pd.read_csv('../data/reactions_final.csv', index_col=0)
pairs = pd.read_csv('../data/pairs_final_RPAIRS_pred.csv', index_col=0)
cofactors = pd.read_csv('../data/original/cofactors_KEGG.csv')

# create class instances
data = Data()
graph = Graph(pairs=pairs)
pathway = Pathway()

data = create_compound(data, cpds, cofactors)
data = create_reaction(data, rxns)

# CREATE GRAPH
graph.create_graph(data=data, pairs=pairs)

graph.calculate_edge_mol_weight(data)
graph.calculate_smiles_similarity(data)

# nodes: 7997 
# edges: 11783


100%|██████████| 7997/7997 [00:17<00:00, 451.15it/s]
100%|██████████| 11783/11783 [00:00<00:00, 260982.56it/s]
100%|██████████| 11783/11783 [00:52<00:00, 226.05it/s]


In [2]:
######### VALIDATION SET FROM nicepath ###########
test_cases = pd.read_csv('../data/original/test_cases.csv')
test_cases['source'] = test_cases['Pathway '].apply(lambda x: x.split(',')[0])
test_cases['target'] = test_cases['Pathway '].apply(lambda x: x.split(',')[len(x.split(','))-1])
test_cases['paths_list'] = test_cases['Pathway '].apply(lambda x: x.split(','))
print(f'Number of test cases: {len(test_cases)}')
      
from utils import validate

paths = validate(test_cases, graph, 'mol_weight')

Number of test cases: 50


 24%|██▍       | 12/50 [00:11<00:37,  1.02it/s]

***** No path found between C00078 and C07576 *****
***** Node not found for C06160 or C20299 *****


 34%|███▍      | 17/50 [00:13<00:20,  1.57it/s]

***** Path with length 2 ['C00047', 'C00408'] *****


 48%|████▊     | 24/50 [00:18<00:23,  1.09it/s]

***** Node not found for C00043 or C21263 *****


 52%|█████▏    | 26/50 [00:20<00:21,  1.14it/s]

***** Node not found for C20953 or C20942 *****


100%|██████████| 50/50 [01:05<00:00,  1.31s/it]

Correct pathway predictions: 23
Correct pathway predictions (%): 46.0





### Search for correct pathways in top-k predicted paths

In [3]:
import ast

def validate(test_cases: pd.DataFrame, G: Graph, method: str, i: int):
    correct_pathways = []
    paths = []
    for row in tqdm(range(len(test_cases))):
        source = test_cases['source'].iloc[row]
        target = test_cases['target'].iloc[row]
        try:
            pred_path, idx_smi, idx_com = G.constrained_shortest_path(source, target, weight=method)
            pred_path = pred_path[idx_smi[i]]
        except nx.NodeNotFound:
            print(f'***** Node not found for {source} or {target} *****')
            pred_path, idx_smi, idx_com = [], None, None
        except TypeError:
            pass
        except IndexError:
            pass
        
        correct_pathways.append((pred_path == test_cases['paths_list'].iloc[row]))
        paths.append(pred_path)
    
    print(f'Correct pathway predictions: {correct_pathways.count(True)}')
    print(f'Correct pathway predictions (%): {100 * correct_pathways.count(True) / len(correct_pathways)}')

    # return the DataFrame with the resulted pathways and correct or not
    paths = pd.DataFrame([str(p) for p in paths], columns=['Pathway'])
    paths['Pathway']  = paths['Pathway'].apply(lambda x: ast.literal_eval(x))
    paths['Correct'] = correct_pathways
    return paths

def search_for_correct_in_top_k_paths(idx, graph, i: int):
    test_cases = pd.read_csv('../data/original/test_cases.csv')
    test_cases['source'] = test_cases['Pathway '].apply(lambda x: x.split(',')[0])
    test_cases['target'] = test_cases['Pathway '].apply(lambda x: x.split(',')[len(x.split(','))-1])
    test_cases['paths_list'] = test_cases['Pathway '].apply(lambda x: x.split(','))
    
    test_cases = test_cases.iloc[idx]
    print(f'Shape of wrong predictions: {test_cases.shape}')
    paths = validate(test_cases, graph, 'mol_weight', i)

    return paths

for i in range(1, 9):
    print(f'Search in top {i+1} paths\n')
    idx = paths[paths['Correct'] == False].index
    paths = search_for_correct_in_top_k_paths(idx, graph, i)
    print()

Search in top 2 paths

Shape of wrong predictions: (27, 4)


 11%|█         | 3/27 [00:07<01:01,  2.58s/it]

***** No path found between C00078 and C07576 *****
***** Node not found for C06160 or C20299 *****


 26%|██▌       | 7/27 [00:11<00:28,  1.43s/it]

***** Path with length 2 ['C00047', 'C00408'] *****


 37%|███▋      | 10/27 [00:19<00:38,  2.24s/it]

***** Node not found for C00043 or C21263 *****
***** Node not found for C20953 or C20942 *****


100%|██████████| 27/27 [00:43<00:00,  1.60s/it]


Correct pathway predictions: 6
Correct pathway predictions (%): 22.22222222222222

Search in top 3 paths

Shape of wrong predictions: (21, 4)


 52%|█████▏    | 11/21 [00:08<00:07,  1.27it/s]

***** Node not found for C06160 or C20299 *****


100%|██████████| 21/21 [00:15<00:00,  1.39it/s]


Correct pathway predictions: 2
Correct pathway predictions (%): 9.523809523809524

Search in top 4 paths

Shape of wrong predictions: (19, 4)


 53%|█████▎    | 10/19 [00:07<00:06,  1.33it/s]

***** No path found between C00078 and C07576 *****
***** Node not found for C06160 or C20299 *****


 79%|███████▉  | 15/19 [00:10<00:02,  1.77it/s]

***** Path with length 2 ['C00047', 'C00408'] *****


100%|██████████| 19/19 [00:12<00:00,  1.54it/s]


Correct pathway predictions: 0
Correct pathway predictions (%): 0.0

Search in top 5 paths

Shape of wrong predictions: (19, 4)


 63%|██████▎   | 12/19 [00:09<00:05,  1.22it/s]

***** No path found between C00078 and C07576 *****
***** Node not found for C06160 or C20299 *****


 89%|████████▉ | 17/19 [00:11<00:01,  1.76it/s]

***** Path with length 2 ['C00047', 'C00408'] *****


100%|██████████| 19/19 [00:12<00:00,  1.53it/s]


Correct pathway predictions: 0
Correct pathway predictions (%): 0.0

Search in top 6 paths

Shape of wrong predictions: (19, 4)


 63%|██████▎   | 12/19 [00:09<00:05,  1.24it/s]

***** No path found between C00078 and C07576 *****
***** Node not found for C06160 or C20299 *****


 89%|████████▉ | 17/19 [00:11<00:01,  1.66it/s]

***** Path with length 2 ['C00047', 'C00408'] *****


100%|██████████| 19/19 [00:12<00:00,  1.52it/s]


Correct pathway predictions: 0
Correct pathway predictions (%): 0.0

Search in top 7 paths

Shape of wrong predictions: (19, 4)


 63%|██████▎   | 12/19 [00:09<00:06,  1.13it/s]

***** No path found between C00078 and C07576 *****
***** Node not found for C06160 or C20299 *****


 89%|████████▉ | 17/19 [00:12<00:01,  1.62it/s]

***** Path with length 2 ['C00047', 'C00408'] *****


100%|██████████| 19/19 [00:12<00:00,  1.46it/s]


Correct pathway predictions: 0
Correct pathway predictions (%): 0.0

Search in top 8 paths

Shape of wrong predictions: (19, 4)


 63%|██████▎   | 12/19 [00:12<00:06,  1.01it/s]

***** No path found between C00078 and C07576 *****
***** Node not found for C06160 or C20299 *****


 89%|████████▉ | 17/19 [00:15<00:01,  1.29it/s]

***** Path with length 2 ['C00047', 'C00408'] *****


100%|██████████| 19/19 [00:16<00:00,  1.15it/s]


Correct pathway predictions: 0
Correct pathway predictions (%): 0.0

Search in top 9 paths

Shape of wrong predictions: (19, 4)


 63%|██████▎   | 12/19 [00:10<00:06,  1.01it/s]

***** No path found between C00078 and C07576 *****
***** Node not found for C06160 or C20299 *****


 89%|████████▉ | 17/19 [00:13<00:01,  1.42it/s]

***** Path with length 2 ['C00047', 'C00408'] *****


100%|██████████| 19/19 [00:14<00:00,  1.31it/s]

Correct pathway predictions: 0
Correct pathway predictions (%): 0.0






In [4]:
print(f'Correct pathway predictions: {(50-19)/50 * 100}%')

Correct pathway predictions: 62.0%
