To run predictions, first place the trained model checkpoint (a `.ckpt` file, e.g. `GraphormerLogP_SGNN_train_42.ckpt`) and the dataset file `SGNN_dataset.csv` in the current working directory.

In [1]:
import pytorch_lightning as pl
import pandas as pd
import torch.nn as nn
import torch
from chython import smiles
from chytorch.utils.data import MoleculeDataset, chained_collate, collate_molecules
from torch.utils.data import DataLoader, TensorDataset
from model_p import Model, early_stop_callback, FeatureExtractorFreezeUnfreeze
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from rdkit import Chem

In [2]:
def prepare_mol(mol_smi):
    """Parse a SMILES string with chython and kekulize the result.

    Args:
        mol_smi: SMILES string describing one molecule.

    Returns:
        The parsed molecule object after ``kekule()`` has been called on it,
        or ``None`` when the input is not a string or when parsing or
        kekulization raises any exception.
    """
    if not isinstance(mol_smi, str):
        # Non-string input (e.g. NaN from an empty CSV cell) cannot be parsed.
        return None
    try:
        mol = smiles(mol_smi)
        mol.kekule()
    except Exception:
        # Deliberate best-effort: every parsing/kekulization failure maps to
        # None. The previous inner handler named `InvalidAromaticRing`, which
        # is never imported in this notebook, so it only worked because the
        # resulting NameError was swallowed by this same broad handler.
        return None
    return mol

In [3]:
def read_smiles_from_sdf(sdf_path):
    """Collect the distinct isomeric SMILES of all molecules in an SDF file.

    Args:
        sdf_path: Path of the SDF file handed to ``Chem.SDMolSupplier``.

    Returns:
        A set of SMILES strings produced by ``Chem.MolToSmiles`` with
        ``isomericSmiles=True``; entries the supplier yields as ``None`` are
        skipped.
    """
    return {
        Chem.MolToSmiles(record, isomericSmiles=True)
        for record in Chem.SDMolSupplier(sdf_path)
        if record is not None
    }

def normalize_smiles(smiles):
    """Round-trip a SMILES string through RDKit.

    Note: the parameter name shadows the ``smiles`` parser imported from
    chython, but only inside this function.

    Args:
        smiles: SMILES string to re-write.

    Returns:
        The string produced by ``Chem.MolToSmiles`` (``isomericSmiles=True``),
        or ``None`` when ``Chem.MolFromSmiles`` returns ``None``.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        return None
    return Chem.MolToSmiles(parsed, isomericSmiles=True)

In [4]:
# Load the dataset and keep only the columns used below, restricted to the
# rows flagged as the test split.
df = pd.read_csv('SGNN_dataset.csv')
df = df[['molecule', 'logp', 'dataset_type']]
# .copy() makes the filtered frame independent of the full table, so the
# prediction columns added to `df` later do not trigger pandas'
# SettingWithCopyWarning (or write into a temporary slice).
df = df[df['dataset_type'] == 'test'].copy()

In [5]:
# Display the test-split frame (molecule SMILES, reference logp, split label).
df

Unnamed: 0,molecule,logp,dataset_type
11635,c1cc2ccc3ccc4ccc5ccc6ccc1c1c2c3c4c5c61,7.64,test
11636,CCC1(CC)C(=O)NC(=S)NC1=O,1.50,test
11637,COC(=O)C(C#N)=NNc1ccc(Cl)c(C(F)F)c1,3.80,test
11638,CC(=O)C=Cc1ccccc1,2.07,test
11639,Cc1ccc2cccc(O)c2n1,2.33,test
...,...,...,...
13683,N#CNc1ccccc1,1.87,test
13684,NS(=O)(=O)c1ccc(S(=O)(=O)CCO)s1,-0.54,test
13685,CCCCC(NC(Cc1ccccc1)C(=O)N1CCC(OCOC)CC1)C(=O)NC...,3.50,test
13686,O=C(COc1ccc2ccccc2c1)N1CCN(S(=O)(=O)c2ccc3c(c2...,4.47,test


In [6]:
# Parse every test-set SMILES into a molecule object; prepare_mol returns
# None for strings it cannot parse or kekulize.
X_test = df['molecule'].apply(prepare_mol)
y_test = df['logp']

# Fail early with a readable message instead of passing None entries on to
# MoleculeDataset (presumably it cannot encode None - TODO confirm). Dropping
# the rows here instead would break the row-for-row alignment with `df` that
# the prediction columns rely on.
failed_smiles = df.loc[X_test.isna(), 'molecule'].tolist()
if failed_smiles:
    raise ValueError(
        f"{len(failed_smiles)} SMILES could not be parsed, e.g. {failed_smiles[:5]}"
    )

In [7]:
# Display the reference logp values of the test split (still a pandas Series here).
y_test

11635    7.64
11636    1.50
11637    3.80
11638    2.07
11639    2.33
         ... 
13683    1.87
13684   -0.54
13685    3.50
13686    4.47
13687    0.57
Name: logp, Length: 2053, dtype: float64

In [8]:
# Wrap the parsed molecules in a chytorch dataset and turn the reference
# values into a tensor. Both names are rebound in place, so this cell is
# meant to run exactly once after the parsing cell.
mols = list(X_test)
X_test, y_test = MoleculeDataset(mols), torch.Tensor(y_test.to_numpy())

In [9]:
# Display the dataset object (X_test is now a MoleculeDataset, no longer a Series).
X_test

<chytorch.utils.data.molecule.encoder.MoleculeDataset at 0x7fe7f403a680>

In [10]:
# Check the number of target values in the tensor.
y_test.shape

torch.Size([2053])

In [11]:
# `seed` identifies which trained checkpoint is evaluated. In this notebook it
# is only used for naming (checkpoint file and prediction columns); it does
# not seed any random number generator here.
seed = 42
# Derive the checkpoint name from `seed` so that changing the value above
# loads the matching file instead of silently re-using the seed-42 weights.
model = Model.load_from_checkpoint(
    f"GraphormerLogP_SGNN_train_{seed}.ckpt",
    loss_function=nn.HuberLoss(),
    learning_rate=4e-4,
)

In [12]:
# Batch (molecule, target) pairs for inference. shuffle=False keeps the batch
# order identical to the row order of `df`, which the later column
# assignments rely on.
test = DataLoader(
    dataset=TensorDataset(X_test, y_test),
    collate_fn=chained_collate(collate_molecules, torch.stack),
    batch_size=32,
    shuffle=False,
    num_workers=7,
)

# NOTE(review): min_epochs/max_epochs, the callbacks and
# check_val_every_n_epoch look like training-time settings carried over from
# the training script; presumably they have no effect on predict() - confirm
# before removing them.
lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval='epoch')
trainer = pl.Trainer(
    accelerator='gpu',
    devices=1,
    min_epochs=50,
    max_epochs=500,
    callbacks=[FeatureExtractorFreezeUnfreeze(), early_stop_callback, lr_monitor],
    check_val_every_n_epoch=5,
    log_every_n_steps=38,
)

# Each element of `temp` is unpacked below as one (predictions, uncertainties)
# pair per batch.
temp = trainer.predict(model, test)
y_pred = []
y_uncert = []
for batch in temp:
    preds, uncerts = batch
    # reshape(-1) rather than squeeze(): squeeze() collapses a one-sample batch
    # to a 0-dim tensor whose tolist() is a bare float, which list.extend()
    # rejects with a TypeError (happens whenever len(dataset) % batch_size == 1).
    y_pred.extend(preds.reshape(-1).tolist())
    y_uncert.extend(uncerts.reshape(-1).tolist())

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/slavick/miniconda3/envs/logP_env/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
Missing logger folder: /home/slavick/Test_GLP/GraphormerLogP/SGNN_dataset/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 65/65 [00:02<00:00, 22.24it/s]


In [13]:
# Turn the accumulated Python lists into tensors, one per output head.
y_pred, y_uncert = (
    torch.Tensor(np.asarray(values)) for values in (y_pred, y_uncert)
)

In [14]:
# Score the predictions against the reference values; the cell displays
# (RMSE, R2, MAE), each rounded to three decimals.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
tuple(round(metric, 3) for metric in (rmse, r2, mae))

(0.449, 0.943, 0.269)

In [15]:
# Convert the tensors to NumPy arrays and store those in the frame. The
# conversions used to be computed and thrown away, leaving raw torch tensors
# to be written into the DataFrame.
pred_values = y_pred.detach().cpu().numpy()
uncert_values = y_uncert.detach().cpu().numpy()
df[f'pred_logp_{seed}'] = pred_values
# sqrt(exp(.)) is applied to the model's second output; presumably that
# output is a log-variance, which would make this column a standard
# deviation - TODO confirm against the model definition.
df[f'pred_uncert_{seed}'] = np.sqrt(np.exp(uncert_values))

In [16]:
# Write the test rows together with their predictions to CSV; index=False
# leaves the original row index out of the file. Then display the frame.
df.to_csv(path_or_buf='Seeds_GLP_PREDICT_SGNN_train_SGNN_test.csv', index=False)
df

Unnamed: 0,molecule,logp,dataset_type,pred_logp_42,pred_uncert_42
11635,c1cc2ccc3ccc4ccc5ccc6ccc1c1c2c3c4c5c61,7.64,test,7.816697,0.422802
11636,CCC1(CC)C(=O)NC(=S)NC1=O,1.50,test,1.432274,0.086676
11637,COC(=O)C(C#N)=NNc1ccc(Cl)c(C(F)F)c1,3.80,test,4.633056,0.213877
11638,CC(=O)C=Cc1ccccc1,2.07,test,2.000278,0.083664
11639,Cc1ccc2cccc(O)c2n1,2.33,test,2.300541,0.106845
...,...,...,...,...,...
13683,N#CNc1ccccc1,1.87,test,1.424028,0.081981
13684,NS(=O)(=O)c1ccc(S(=O)(=O)CCO)s1,-0.54,test,-0.552294,0.111033
13685,CCCCC(NC(Cc1ccccc1)C(=O)N1CCC(OCOC)CC1)C(=O)NC...,3.50,test,3.916647,0.199682
13686,O=C(COc1ccc2ccccc2c1)N1CCN(S(=O)(=O)c2ccc3c(c2...,4.47,test,4.558833,0.210158


In [17]:
# Recompute the metrics from the prediction column stored in `df`, as a check
# that the saved values match the in-memory predictions. The column name is
# built from `seed`, the same way it was when the column was created, instead
# of hard-coding 42.
y_pred_mean = torch.Tensor(np.array(df[f'pred_logp_{seed}']))
r2 = r2_score(y_test, y_pred_mean)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_mean))
mae = mean_absolute_error(y_test, y_pred_mean)
round(rmse, 3), round(r2, 3), round(mae, 3)

(0.449, 0.943, 0.269)