### Main notebook

In [1]:
import pandas as pd
import numpy as np
import json
import networkx as nx
from tqdm import tqdm
from compound import Compound
from reaction import Reaction
from graph import Graph
from data import Data

# Load the input tables from CSV.
# The compounds table is read from the GNN_toxic project because that copy
# includes the toxicity column used when building Compound objects.
cpds = pd.read_csv('../GNN_toxic/data/raw/compounds_final.csv', index_col=0)
rxns, pairs = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('data/reactions_final.csv', 'data/pairs_final.csv')
)
# The cofactor list is read with the default index (no index_col).
cofactors = pd.read_csv('data/original/cofactors_KEGG.csv')

# Instantiate the project containers: an empty Data store and a Graph
# constructed from the pairs table.
data = Data()
graph = Graph(pairs=pairs)

In [2]:
# Build the cofactor lookup once. The original re-materialised
# cofactors['Entry'].values and scanned it linearly for every compound row.
cofactor_entries = set(cofactors['Entry'])

# Create a Compound object for each row in the DataFrame and add it to the data
for index, row in cpds.iterrows():
    entry = row['Entry']
    compound = Compound(
        entry,
        row['Names'],
        row['Formula'],
        row['mol_weight'],
        row['SMILES'],
        entry in cofactor_entries,  # is_cofactor
        row['toxic'],               # is_toxic
    )
    data.add_element('compound', compound)

# Create a Reaction object for each row in the DataFrame and add it to the data
for index, row in rxns.iterrows():
    reaction = Reaction(
        row['Entry'],
        row['Names'],
        row['Compound'],
        row['EC Number'],
    )
    data.add_element('reaction', reaction)

In [3]:
# Populate `graph` through the project-defined Graph.create_graph, passing the
# Data container filled above and the pairs table.
# NOTE(review): the pairs table was already given to Graph(pairs=pairs) at
# construction; confirm whether passing it again here is required.
graph.create_graph(data=data, pairs=pairs)

# nodes: 8591 
# edges: 30026


100%|██████████| 8591/8591 [00:01<00:00, 5099.08it/s]


In [None]:
# Run the two project-defined Graph computations, each given the Data container.
# NOTE(review): the method names suggest they attach molecular-weight and
# SMILES-similarity values to the graph edges -- confirm in graph.py.
# The second call is the slow one (about a minute in the recorded output).
graph.calculate_edge_mol_weight(data)
graph.calculate_smiles_similarity(data)

  0%|          | 0/30026 [00:00<?, ?it/s]

100%|██████████| 30026/30026 [00:00<00:00, 215521.71it/s]
100%|██████████| 30026/30026 [00:55<00:00, 539.20it/s]


In [None]:
######### VALIDATION SET FROM nicepath ###########
test_cases = pd.read_csv('data/original/test_cases.csv')

# Split each comma-separated pathway once and reuse the pieces.
# Note that the CSV column name really is 'Pathway ' (with a trailing space).
pathway_steps = test_cases['Pathway '].apply(lambda pathway: pathway.split(','))

# First step is the source, last step is the target; the full list is kept too.
test_cases['source'] = pathway_steps.map(lambda steps: steps[0])
test_cases['target'] = pathway_steps.map(lambda steps: steps[-1])
test_cases['paths_list'] = pathway_steps

paths = graph.validate(test_cases, 'mol_weight')

  0%|          | 0/50 [00:00<?, ?it/s]

100%|██████████| 50/50 [01:18<00:00,  1.56s/it]

Correct pathway predictions: 19
Correct pathway predictions (%): 38.0





### Study graph

In [None]:
# Display the graph's `num_occurences` attribute (the spelling matches the
# attribute name used by the project class). In the recorded output it is a
# table of compound IDs with a count per compound.
graph.num_occurences

Unnamed: 0_level_0,0
0,Unnamed: 1_level_1
C00001,9968
C00080,6569
C00007,5235
C00006,3501
C00003,3248
...,...
C21281,1
C21282,1
C05571,1
C02200,1


In [None]:
import pickle
from pathlib import Path

from networkx.algorithms.community import greedy_modularity_communities

G = graph.G

# `communities` must exist before the following cells use it. Previously the
# only assignment was commented out, so a fresh kernel hit a NameError.
# Community detection is expensive, so reuse the pickled result when present
# and only recompute when the cache file is missing.
COMMUNITIES_CACHE = Path('./data/communities.pkl')

if COMMUNITIES_CACHE.exists():
    # NOTE: pickle.load can execute arbitrary code; only load a cache file
    # that this notebook produced.
    with COMMUNITIES_CACHE.open('rb') as f:
        communities = pickle.load(f)
else:
    # Edges are weighted by their "smiles_similarity" attribute.
    # NOTE(review): presumably set by graph.calculate_smiles_similarity -- confirm.
    communities = greedy_modularity_communities(G, weight="smiles_similarity")

In [None]:
import pickle

# Report how many communities there are.
n_communities = len(communities)
print(n_communities)

# Persist the communities to disk so they can be reloaded later without
# being recomputed.
with open('./data/communities.pkl', 'wb') as out_file:
    pickle.dump(communities, out_file)

83


In [18]:
import pickle

# Load the pickled communities.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project.
with open('./data/communities.pkl', 'rb') as f:
    communities = pickle.load(f)

# One record per (community index, compound) membership.
# Deliberately NOT named `data`: that name holds the Data container, and
# overwriting it with a list breaks every later `data.get_compound_by_id(...)`.
community_records = [
    {'community': idx, 'compound': cpd}
    for idx, community in enumerate(communities)
    for cpd in community
]

# Create the DataFrame
df = pd.DataFrame(community_records)
df

Unnamed: 0,community,compound
0,0,C01149
1,0,C04468
2,0,C05917
3,0,C06523
4,0,C20528
...,...,...
8586,80,C22078
8587,81,C20774
8588,81,C22270
8589,82,C22246


### Try clustering graph nodes

### Try similarity based on SMILES

In [None]:
from rdkit import Chem
from rdkit import DataStructs


def smiles_fingerprint_similarity(smiles_a, smiles_b):
    """Return the RDKit fingerprint similarity of two SMILES strings.

    Each SMILES is parsed with Chem.MolFromSmiles and fingerprinted with
    Chem.RDKFingerprint; the two fingerprints are then compared with
    DataStructs.FingerprintSimilarity.

    Raises:
        ValueError: if either SMILES string cannot be parsed by RDKit.
    """
    mols = []
    for smi in (smiles_a, smiles_b):
        mol = Chem.MolFromSmiles(smi)
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with a clear message instead of inside the fingerprint call.
        if mol is None:
            raise ValueError('Could not parse SMILES: ' + repr(smi))
        mols.append(mol)
    fingerprints = [Chem.RDKFingerprint(mol) for mol in mols]
    return DataStructs.FingerprintSimilarity(fingerprints[0], fingerprints[1])


# Compare C00223 against C00323.
smiles1 = data.get_compound_by_id('C00223').smiles
smiles2 = data.get_compound_by_id('C00323').smiles
s = smiles_fingerprint_similarity(smiles1, smiles2)
print('Similarity: '+str(s))

# Compare C00223 against C12096.
smiles1 = data.get_compound_by_id('C00223').smiles
smiles2 = data.get_compound_by_id('C12096').smiles
s = smiles_fingerprint_similarity(smiles1, smiles2)
print('Similarity: '+str(s))