To run predictions, first place the trained model file (`.ckpt` format, loaded below as `GraphormerLogP_GLP_train_42.ckpt`) in the current directory, alongside the `GLP_dataset.csv` dataset file.

In [1]:
import pytorch_lightning as pl
import pandas as pd
import torch.nn as nn
import torch
from chython import smiles
from chytorch.utils.data import MoleculeDataset, chained_collate, collate_molecules
from torch.utils.data import DataLoader, TensorDataset
from model_p import Model, early_stop_callback, FeatureExtractorFreezeUnfreeze
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from rdkit import Chem

In [2]:
def prepare_mol(mol_smi):
    """Parse a SMILES string with chython and kekulize the result.

    This is a best-effort helper: any exception raised while parsing or
    kekulizing is swallowed and reported as ``None`` so that it can be mapped
    over a whole column without aborting.

    Args:
        mol_smi: SMILES string handed to ``chython.smiles``.

    Returns:
        The object returned by ``smiles(mol_smi)`` after ``kekule()`` has been
        called on it, or ``None`` if either step raised.
    """
    # The previous version wrapped ``kekule()`` in ``except InvalidAromaticRing``,
    # but that name is never imported in this notebook. A kekulization error
    # therefore raised NameError, which the outer ``except Exception`` caught.
    # A single handler gives the same result without the undefined name.
    try:
        mol = smiles(mol_smi)
        mol.kekule()
    except Exception:
        return None
    return mol

In [3]:
def read_smiles_from_sdf(sdf_path):
    """Collect the distinct isomeric SMILES of the readable records in an SDF file.

    Args:
        sdf_path: Path passed to ``Chem.SDMolSupplier``.

    Returns:
        set: One ``Chem.MolToSmiles(..., isomericSmiles=True)`` string per
        record; entries the supplier yields as ``None`` are skipped.
    """
    return {
        Chem.MolToSmiles(record, isomericSmiles=True)
        for record in Chem.SDMolSupplier(sdf_path)
        if record is not None
    }

def normalize_smiles(smiles):
    """Round-trip a SMILES string through RDKit to get its isomeric SMILES.

    Note that the parameter name shadows the ``smiles`` function imported from
    chython for the duration of this function.

    Args:
        smiles: SMILES string passed to ``Chem.MolFromSmiles``.

    Returns:
        The string produced by ``Chem.MolToSmiles(..., isomericSmiles=True)``,
        or ``None`` when ``Chem.MolFromSmiles`` returns ``None``.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        return None
    return Chem.MolToSmiles(parsed, isomericSmiles=True)

In [4]:
# Load the dataset and keep only the columns used in this notebook.
df = pd.read_csv('GLP_dataset.csv')
df = df[['molecule', 'logp', 'dataset_type']]
# Keep the rows flagged as the test split. The explicit .copy() makes `df` an
# independent frame, so the prediction columns added later in the notebook do
# not trigger pandas' SettingWithCopyWarning.
df = df[df['dataset_type'] == 'test'].copy()

In [5]:
# Display the frame after it has been restricted to the 'test' rows.
df

Unnamed: 0,molecule,logp,dataset_type
4,BrC(Br)C(Br)Br,2.55,test
40,BrCCCCCCCBr,4.47,test
53,BrCCN(CCBr)c1ccccc1,3.84,test
80,Brc1cc(Br)c(Br)c(Br)c1,5.55,test
83,Brc1cc(Br)c(Br)s1,4.48,test
...,...,...,...
41929,c1ccc2nonc2c1,1.69,test
41958,c1cnc2cscc2c1,1.74,test
41963,c1cncc(-c2ccnc(-c3cccnc3)c2)c1,1.96,test
41968,c1cncc(CCN2CCCCC2)c1,1.66,test


In [6]:
# Parse every test-set SMILES into a molecule object and keep the matching
# logP targets as a Series that shares the same index.
X_test = df['molecule'].apply(prepare_mol)
y_test = df['logp']

# prepare_mol returns None when parsing or kekulization fails. Those entries
# would otherwise be passed on to the dataset unchanged, so stop here with a
# message that names the offending SMILES instead of failing later on.
unparsed = X_test.isna()
if unparsed.any():
    raise ValueError(
        f"{int(unparsed.sum())} molecule(s) could not be parsed, e.g. "
        f"{df.loc[unparsed, 'molecule'].head().tolist()}"
    )

In [7]:
# Display the logP target values taken from the test rows.
y_test

4        2.55
40       4.47
53       3.84
80       5.55
83       4.48
         ... 
41929    1.69
41958    1.74
41963    1.96
41968    1.66
41995    2.11
Name: logp, Length: 6301, dtype: float64

In [8]:
# Materialise the parsed molecules as a plain list for the chytorch dataset.
mols = X_test.tolist()
# Targets become a float32 tensor (the same dtype torch.Tensor(...) produces
# under the default tensor type).
y_test = torch.tensor(y_test.to_numpy(), dtype=torch.float32)
# From here on X_test refers to the MoleculeDataset, not the pandas Series.
X_test = MoleculeDataset(mols)

In [9]:
# Display the MoleculeDataset built from the parsed molecules.
X_test

<chytorch.utils.data.molecule.encoder.MoleculeDataset at 0x7fa3162eba00>

In [10]:
# Check the size of the target tensor.
y_test.shape

torch.Size([6301])

In [11]:
# Tag used further down to name the prediction columns (pred_logp_<seed>, ...).
seed = 42

# Restore the trained model from the checkpoint in the working directory.
# The loss function and learning rate are passed through as keyword arguments
# to load_from_checkpoint.
checkpoint_file = "GraphormerLogP_GLP_train_42.ckpt"
model = Model.load_from_checkpoint(
    checkpoint_file,
    loss_function=nn.HuberLoss(),
    learning_rate=4e-4,
)

In [12]:
# Pair each molecule with its target and batch them. shuffle=False keeps the
# batches in dataset order, which matters because the predictions are later
# written back to `df` positionally.
test = DataLoader(
            dataset=TensorDataset(X_test, y_test),
            collate_fn=chained_collate(collate_molecules, torch.stack),
            batch_size=32,
            shuffle=False, num_workers=7)


# NOTE(review): min_epochs/max_epochs/check_val_every_n_epoch and the three
# callbacks look like training-time settings; only trainer.predict() is called
# in this cell. Confirm against model_p whether they can be dropped here.
lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval='epoch')
trainer = pl.Trainer(accelerator='gpu', devices=1, min_epochs=50, max_epochs=500,
                         callbacks=[FeatureExtractorFreezeUnfreeze(),
                                    early_stop_callback, lr_monitor],
                         check_val_every_n_epoch=5, log_every_n_steps=38)

# trainer.predict returns one entry per batch; each entry is unpacked here as
# a (predictions, uncertainties) pair.
temp = trainer.predict(model, test)
y_pred = []
y_uncert = []
for preds, uncerts in temp:
    # reshape(-1) always yields a 1-D tensor. The previous squeeze() collapsed
    # a single-sample batch to a 0-d tensor, whose tolist() is a bare float,
    # and list.extend() then raised TypeError whenever the dataset size left
    # a final batch of exactly one molecule.
    y_pred.extend(preds.reshape(-1).tolist())
    y_uncert.extend(uncerts.reshape(-1).tolist())

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/slavick/miniconda3/envs/logP_env/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
Missing logger folder: /home/slavick/Test_GLP/GraphormerLogP/GLP_dataset/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 197/197 [00:07<00:00, 27.24it/s]


In [13]:
# Turn the collected Python lists into float32 tensors for the metric cells.
y_pred = torch.tensor(np.array(y_pred), dtype=torch.float32)
y_uncert = torch.tensor(np.array(y_uncert), dtype=torch.float32)

In [14]:
# Regression metrics of the predictions against the experimental logP values.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
# Displayed as (RMSE, R^2, MAE), each rounded to three decimals.
tuple(round(metric, 3) for metric in (rmse, r2, mae))

(0.436, 0.958, 0.253)

In [15]:
# Convert the tensors to NumPy arrays and actually keep the result. The
# previous version called .cpu().detach().numpy() but discarded the return
# value, so the raw tensors were what got assigned to the frame.
pred_values = y_pred.detach().cpu().numpy()
uncert_values = y_uncert.detach().cpu().numpy()

# assign() returns a new frame, so re-running this cell is safe and no
# SettingWithCopyWarning is raised on the filtered test frame.
# NOTE(review): sqrt(exp(.)) presumably turns a predicted log-variance into a
# standard deviation; confirm against the model's uncertainty head.
df = df.assign(**{
    f'pred_logp_{seed}': pred_values,
    f'pred_uncert_{seed}': np.sqrt(np.exp(uncert_values)),
})

In [16]:
# Persist the test rows together with their predictions. index=False leaves
# the original row index out of the file.
output_csv = 'Seeds_GLP_PREDICT_GLP_train_GLP_test.csv'
df.to_csv(output_csv, index=False)
df

Unnamed: 0,molecule,logp,dataset_type,pred_logp_42,pred_uncert_42
4,BrC(Br)C(Br)Br,2.55,test,2.596970,0.121303
40,BrCCCCCCCBr,4.47,test,4.367506,0.177158
53,BrCCN(CCBr)c1ccccc1,3.84,test,3.583178,0.149329
80,Brc1cc(Br)c(Br)c(Br)c1,5.55,test,5.420713,0.251250
83,Brc1cc(Br)c(Br)s1,4.48,test,4.362076,0.186974
...,...,...,...,...,...
41929,c1ccc2nonc2c1,1.69,test,1.549164,0.089958
41958,c1cnc2cscc2c1,1.74,test,1.987525,0.110805
41963,c1cncc(-c2ccnc(-c3cccnc3)c2)c1,1.96,test,2.053985,0.122008
41968,c1cncc(CCN2CCCCC2)c1,1.66,test,1.926371,0.102949


In [17]:
# Recompute the metrics from the column stored in `df`, as a sanity check that
# the saved predictions match the in-memory ones. The column name is derived
# from `seed`, exactly as it was when the column was written, instead of
# hard-coding 'pred_logp_42'; the two can no longer drift apart.
y_pred_mean = torch.tensor(df[f'pred_logp_{seed}'].to_numpy(), dtype=torch.float32)
r2 = r2_score(y_test, y_pred_mean)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_mean))
mae = mean_absolute_error(y_test, y_pred_mean)
# Displayed as (RMSE, R^2, MAE), each rounded to three decimals.
round(rmse, 3), round(r2, 3), round(mae, 3)

(0.436, 0.958, 0.253)