To run predictions, first place the trained model file (`.joblib` format, named `RF_RDKit_GLP_train_Seed_<seed>.joblib`) in the current directory.

In [1]:
from sklearn.preprocessing import FunctionTransformer
import pandas as pd
from matplotlib import rcParams
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

from rdkit.Chem import Descriptors
from rdkit import Chem
from sklearn.compose import ColumnTransformer
from numpy import array
from joblib import load

# Set matplotlib's default figure resolution to 900 DPI.
rcParams['figure.dpi'] = 900


In [2]:
# Seed identifier: selects which saved model file is loaded and tags the prediction column name.
seed = 42

In [3]:
# Load the saved model for this seed; the .joblib file must be in the current directory.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load model files that come from a trusted source.
grid = load(f'RF_RDKit_GLP_train_Seed_{seed}.joblib')
# Display the loaded estimator.
grid

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [4]:
# Read the full dataset and keep only the rows flagged as the training split.
glp_table = pd.read_csv('GLP_dataset.csv')
train_rows = glp_table.loc[glp_table['dataset_type'] == 'train']

# Target values for the training split.
y_train = train_rows['logp']

# Parse each SMILES string into an RDKit Mol and keep the result as a
# single-column DataFrame (column name stays 'molecule').
train_data = train_rows['molecule'].map(Chem.MolFromSmiles).to_frame()
train_data.head()

Unnamed: 0,molecule
0,<rdkit.Chem.rdchem.Mol object at 0x0000021D280...
2,<rdkit.Chem.rdchem.Mol object at 0x0000021D280...
3,<rdkit.Chem.rdchem.Mol object at 0x0000021D280...
5,<rdkit.Chem.rdchem.Mol object at 0x0000021D280...
8,<rdkit.Chem.rdchem.Mol object at 0x0000021D280...


In [5]:
# Read the full dataset and keep only the rows flagged as the test split.
glp_table = pd.read_csv('GLP_dataset.csv')
test_rows = glp_table.loc[glp_table['dataset_type'] == 'test']

# Target values for the test split.
y_test = test_rows['logp']

# Parse each SMILES string into an RDKit Mol and keep the result as a
# single-column DataFrame (column name stays 'molecule').
test_data = test_rows['molecule'].map(Chem.MolFromSmiles).to_frame()
test_data.head()

Unnamed: 0,molecule
4,<rdkit.Chem.rdchem.Mol object at 0x0000021D27A...
40,<rdkit.Chem.rdchem.Mol object at 0x0000021D27A...
53,<rdkit.Chem.rdchem.Mol object at 0x0000021D27A...
80,<rdkit.Chem.rdchem.Mol object at 0x0000021D27A...
83,<rdkit.Chem.rdchem.Mol object at 0x0000021D27A...


In [6]:
# Descriptors whose column name is identical to the function name in
# rdkit.Chem.Descriptors; each is looked up by that name.
ConstDescriptors = {
    name: getattr(Descriptors, name)
    for name in (
        "HeavyAtomCount",
        "NHOHCount",
        "NOCount",
        "NumHAcceptors",
        "NumHDonors",
        "NumHeteroatoms",
        "NumRotatableBonds",
        "NumValenceElectrons",
        "NumAromaticRings",
        "NumAliphaticHeterocycles",
        "RingCount",
    )
}

# Descriptors exposed under a short column name that differs from the RDKit function name.
PhisChemDescriptors = dict(
    MW=Descriptors.MolWt,
    MR=Descriptors.MolMR,
    TPSA=Descriptors.TPSA,
)

# Combined mapping: column name -> descriptor function. Insertion order
# (ConstDescriptors first, then PhisChemDescriptors) fixes the feature order.
descriptors = {**ConstDescriptors, **PhisChemDescriptors}
descriptors_names_list = list(descriptors)

def mol_dsc_calc(mols):
    """Evaluate every function in ``descriptors`` on each molecule.

    ``mols`` is converted to a NumPy array and flattened, so a one-column
    DataFrame, a 2-D array or a flat sequence are all handled the same way.

    Returns a DataFrame with one row per molecule (in input order) and one
    column per key of ``descriptors``.
    """
    records = []
    for mol in array(mols).ravel():
        record = {}
        for name, func in descriptors.items():
            record[name] = func(mol)
        records.append(record)
    return pd.DataFrame(records)

def descriptors_names(transformer, mol_dsc_calc_obj):
    """Return the output column names as a list, in ``descriptors`` key order.

    Passed to ``FunctionTransformer`` as ``feature_names_out``. Both
    arguments are ignored; the names always come from ``descriptors``.
    """
    return [format(name) for name in descriptors]

# Wrap the descriptor calculator as a scikit-learn transformer. validate=False
# skips input validation so the RDKit Mol objects reach mol_dsc_calc as they are.
descriptors_transformer = FunctionTransformer(mol_dsc_calc, validate=False, feature_names_out=descriptors_names)

# Apply the descriptor transformer to the first column of the input frame.
features = ColumnTransformer([('descriptors', descriptors_transformer, [0])])

# Fit on the training split only, then reuse the fitted transformer on the test
# split. The test data must never be passed to fit/fit_transform.
X_train = features.fit_transform(train_data)
X_test = features.transform(test_data)
print(X_train.shape)
print(X_test.shape)

(29404, 14)
(6301, 14)


In [7]:
# Predict on the test-set descriptor matrix with the loaded model.
y_pred = grid.predict(X_test)
# Display the prediction array.
y_pred

array([3.17771733, 4.55152   , 4.28341834, ..., 2.33494133, 2.11674   ,
       1.44364257])

In [8]:
# Build the results table as a NEW frame: test molecules plus their reference
# logp values (aligned on the row index). Using assign() instead of aliasing
# test_data keeps the input frame unchanged, so re-running earlier cells is safe.
df = test_data.assign(logp=y_test)

In [9]:
# Store the predictions in a seed-tagged column (logP_pred_42 for seed 42).
df[f'logP_pred_{seed}'] = y_pred

In [10]:
# Display the results table: molecule, reference logp and the model prediction.
df

Unnamed: 0,molecule,logp,logP_pred_42
4,<rdkit.Chem.rdchem.Mol object at 0x0000021D27A...,2.55,3.177717
40,<rdkit.Chem.rdchem.Mol object at 0x0000021D27A...,4.47,4.551520
53,<rdkit.Chem.rdchem.Mol object at 0x0000021D27A...,3.84,4.283418
80,<rdkit.Chem.rdchem.Mol object at 0x0000021D27A...,5.55,5.296022
83,<rdkit.Chem.rdchem.Mol object at 0x0000021D27A...,4.48,3.787075
...,...,...,...
41929,<rdkit.Chem.rdchem.Mol object at 0x0000021D288...,1.69,0.617183
41958,<rdkit.Chem.rdchem.Mol object at 0x0000021D288...,1.74,1.960103
41963,<rdkit.Chem.rdchem.Mol object at 0x0000021D288...,1.96,2.334941
41968,<rdkit.Chem.rdchem.Mol object at 0x0000021D288...,1.66,2.116740


In [11]:
# Score the predictions against the reference logp values of the test split.
y_true = df['logp']
y_hat = df[f'logP_pred_{seed}']

r2 = r2_score(y_true, y_hat)
rmse = np.sqrt(mean_squared_error(y_true, y_hat))  # RMSE = sqrt(MSE)
mae = mean_absolute_error(y_true, y_hat)

# Displayed in the order (RMSE, R2, MAE), each rounded to three decimals.
tuple(round(metric, 3) for metric in (rmse, r2, mae))

(0.861, 0.835, 0.56)

In [12]:
# The 'molecule' column holds RDKit Mol objects, which would be written as
# opaque "<rdkit.Chem.rdchem.Mol object at 0x...>" strings. Export SMILES
# (as generated by RDKit) so each row in the CSV identifies its molecule.
# The conversion happens on a copy, so df keeps its Mol objects.
df.assign(molecule=df['molecule'].map(Chem.MolToSmiles)).to_csv(
    'Seeds_Predict_RF_GLP_train_GLP_test.csv', index=False
)