In [6]:
import pandas as pd
import numpy as np
import os

# Load the raw compound table; the first CSV column is used as the row index.
df = pd.read_csv('./data/raw/compounds_final.csv', index_col=0)

# Build the frame that gets written out for the dataset: the SMILES column
# renamed to 'smiles', plus a 'label' column filled with NaN.
# The chain returns a new frame, so `df` itself is never mutated here.
data = (
    df[['SMILES']]
    .rename(columns={'SMILES': 'smiles'})
    .assign(label=np.nan)
)

# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('./data/kegg_data/raw', exist_ok=True)
os.makedirs('./data/kegg_data/processed', exist_ok=True)

# Write the two-column CSV without the index column.
data.to_csv('./data/kegg_data/raw/data.csv', index=False)

In [7]:
from dataset import MoleculeDataset

# Instantiate the dataset rooted at ./data/kegg_data/ from data.csv.
# NOTE(review): test=True is forwarded unchanged; presumably it selects the
# unlabeled/inference processing path — confirm against dataset.py.
kegg_dataset = MoleculeDataset(
    root='./data/kegg_data/',
    filename='data.csv',
    test=True,
)
print(f'Length of dataset: {len(kegg_dataset)}')

Processing...
  3%|▎         | 228/8591 [00:00<00:10, 794.00it/s]

100%|██████████| 8591/8591 [00:08<00:00, 970.84it/s] 

Length of dataset: 8591



Done!


In [31]:
from train import load_torch_model, predict

# feature_size is read from the second dimension of the first item's `x` attribute.
model = load_torch_model(feature_size=kegg_dataset[0].x.shape[1])

# Call `predict` once per dataset entry, in index order, and keep only the
# second element of each returned pair (the prediction); the first is unused.
y = [
    predict(model, kegg_dataset[idx])[1]
    for idx in range(len(kegg_dataset))
]




In [32]:
# Attach the collected predictions to the compound table as a 'toxic' column.
df['toxic'] = y

# Display only the rows whose 'toxic' entry compares equal to 1.
toxic_mask = df['toxic'] == 1
df[toxic_mask]

Unnamed: 0,Entry,Names,Formula,Exact Mass,Mol Weight,SMILES,Local Total In/Out-Degree(0.1 - 1.0),Reactions,ChEBI,PubChem,...,Br,Hg,C,Cl,O,I,F,polymer,mol_weight,toxic
69,C00078,"[""L-Tryptophan"", ""Tryptophan"", ""(S)-alpha-Amin...",C11H12N2O2,204.0899,204.2252,N[C@@H](Cc1c[nH]c2ccccc12)C(=O)O,"[[94, 94], [100, 100], [102, 102], [294, 294],...","[""R00673"", ""R00674"", ""R00675"", ""R00676"", ""R006...","[""16828""]","[""3378""]",...,0,0,11,0,2,0,0,0,204.226,[1]
81,C00090,"[""Catechol"", ""1,2-Benzenediol"", ""o-Benzenediol...",C6H6O2,110.0368,110.1106,Oc1ccccc1O,"[[100, 100], [106, 106], [109, 109], [109, 109...","[""R00058"", ""R00080"", ""R00812"", ""R00813"", ""R008...","[""18135""]","[""3390""]",...,0,0,6,0,2,0,0,0,110.108,[1]
95,C00106,"[""Uracil""]",C4H4N2O2,112.0273,112.0868,O=c1cc[nH]c(=O)[nH]1,"[[14, 14], [25, 25], [32, 32], [34, 34], [34, ...","[""R00966"", ""R00973"", ""R00974"", ""R00975"", ""R009...","[""17568""]","[""3406""]",...,0,0,4,0,2,0,0,0,112.092,[1]
96,C00108,"[""Anthranilate"", ""Anthranilic acid"", ""o-Aminob...",C7H7NO2,137.0477,137.136,Nc1ccccc1C(=O)O,"[[82, 82], [87, 87], [95, 95], [96, 96], [96, ...","[""R00823"", ""R00825"", ""R00980"", ""R00982"", ""R009...","[""16567"", ""30754""]","[""3408""]",...,0,0,7,0,2,0,0,0,137.136,[1]
129,C00143,"[""5,10-Methylenetetrahydrofolate"", ""(6R)-5,10-...",C20H23N7O6,457.171,457.4399,Nc1nc2c(c(=O)[nH]1)N1CN(c3ccc(C(=O)NC(CCC(=O)O...,"[[66, 66], [66, 66], [67, 67], [67, 67], [67, ...","[""R00945"", ""R01217"", ""R01218"", ""R01220"", ""R012...","[""1989""]","[""3443""]",...,0,0,20,0,6,0,0,0,457.454,[1]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8575,C22317,"[""2-Heptyl-1-methoxyquinolin-4(1H)-one"", ""HMOQ""]",C17H23NO2,273.1729,273.37,CCCCCCCc1cc(=O)c2ccccc2n1OC,"[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0...","[""R12667""]",[],[],...,0,0,17,0,2,0,0,0,273.364,[1]
8577,C22319,"[""1-Tuberculosinyladenosine""]",C30H46N5O4,540.355,540.7173,C/C(=C\C[n+]1cnc2c(ncn2[C@@H]2O[C@H](CO)[C@@H]...,"[[1, 1], [1, 1], [12, 12], [12, 12], [12, 12],...","[""R12669""]",[],[],...,0,0,30,0,4,0,0,0,540.718,[1]
8579,C22321,"[""N,N-Dimethyl-N'-phenylurea compound""]",C9H10N2OR2,"""""","""""",*c1ccc(NC(=O)N(C)C)cc1*,"[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0...","[""R12673""]",[],[],...,0,0,9,0,1,0,0,0,162.190,[1]
8580,C22322,"[""N-Methyl-N'-phenylurea compound""]",C8H8N2OR2,"""""","""""",*c1ccc(NC(=O)NC)cc1*,"[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0...","[""R12673""]",[],[],...,0,0,8,0,1,0,0,0,148.164,[1]
