This is the code and procedure if you want to predict with trained XGBOOST models on new data

In [None]:
!pip install rdkit-pypi
!pip install mordred
!pip install xgboost

In [216]:
# import pandas for data wrangling
import pandas as pd
import numpy as np

import xgboost as xgb
from sklearn.metrics import accuracy_score


import matplotlib.pyplot as plt
import seaborn as sns
import random

from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import StratifiedKFold, cross_val_score

from rdkit import Chem
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from mordred import Calculator, descriptors
import mordred
# Set the seed
seed_value = 42
np.random.seed(seed_value)
random.seed(seed_value)

''' A workaround for a rdkit issue that sometimes occures when you calculate the descriptors due to numpy versions'''
np.float = float    
np.int = int   #module 'numpy' has no attribute 'int'
np.object = object    #module 'numpy' has no attribute 'object'
np.bool = bool    #module 'numpy' has no attribute 'bool'


out='results/xgboost/'#'path_to/out_folder/'

In [245]:
data_path= 'data/data_from_6_filtered.csv'#'path_to/data/your_data'
df = pd.read_csv(data_path,encoding='unicode_escape',index_col=0)
df.reset_index(inplace=True,drop=True)
df

Unnamed: 0,Name,Smile,total,H,P,D
0,3-METHYLPENTANE,CCC(C)CC,14.67,0.0,0.0,14.67
1,"2,4-DIMETHYLHEXANE",CCC(C)CC(C)C,14.65,0.0,0.0,14.65
2,benzene,C1=CC=CC=C1,18.41,2.05,1.02,18.267052
3,toluene,CC1=CC=CC=C1,18.32,2.0,1.4,18.156608
4,STYRENE,C=CC1=CC=CC=C1,19.07,4.1,1.0,18.597175
5,O-XYLENE,CC1=CC=CC=C1C,18.2,3.1,1.0,17.906144
6,tetralin,C1CCC2=CC=CC=C2C1,19.8,2.9,2.0,19.484096
7,acetone,CC(=O)C,19.95,6.95,10.43,15.52144
8,ACETOPHENONE,CC(=O)C1=CC=CC=C1,21.73,3.68,8.59,19.61791
9,ETHYL ACETATE,CCOC(=O)C,18.48,9.2,5.85,14.921391


Calculate the descriptors:

In [246]:
smiles=df['Smile'].tolist()
mols = [Chem.MolFromSmiles(smile) for smile in smiles]

calc = Calculator(descriptors,ignore_3D=True) 
descs = calc.pandas(mols)

descs

100%|██████████| 31/31 [00:00<00:00, 215.75it/s]


Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,3.644924,4.097495,0,0,6.898979,1.931852,3.863703,6.898979,1.14983,2.5958,...,7.280008,29.753427,86.10955,4.305478,31,4,20.0,19.0,3.611111,1.666667
1,5.277917,5.655215,0,0,8.565187,2.042079,4.084158,8.565187,1.070648,2.899769,...,7.8842,34.080836,114.140851,4.390033,71,6,30.0,29.0,4.722222,2.0
2,4.242641,4.0,0,0,8.0,2.0,4.0,8.0,1.333333,2.687624,...,7.627057,30.941317,78.04695,6.503913,27,3,24.0,24.0,1.5,1.5
3,5.059137,4.785854,0,0,8.720566,2.101003,4.202006,8.720566,1.245795,2.844305,...,8.124151,33.544698,92.0626,6.137507,42,5,30.0,31.0,2.361111,1.666667
4,5.656854,5.42766,0,0,10.424292,2.135779,4.271558,10.424292,1.303037,2.969338,...,8.298291,35.247635,104.0626,6.503913,64,7,34.0,36.0,2.611111,2.0
5,5.835194,5.606207,0,0,9.95396,2.193527,4.387054,9.95396,1.244245,2.980241,...,8.555837,35.908899,106.07825,5.893236,60,8,36.0,39.0,3.222222,1.861111
6,7.737734,6.634763,0,0,13.683239,2.302776,4.605551,13.683239,1.368324,3.249407,...,9.066816,39.856521,132.0939,6.004268,109,12,50.0,57.0,2.222222,2.277778
7,2.44949,2.44949,0,0,3.464102,1.732051,3.464102,3.464102,0.866025,2.178059,...,6.188264,24.179697,58.041865,5.804186,9,0,12.0,9.0,3.111111,1.0
8,6.542301,6.236096,0,0,11.189957,2.193993,4.387987,11.189957,1.243329,3.089765,...,8.590258,37.289972,120.057515,7.062207,88,9,40.0,43.0,3.472222,2.111111
9,3.754314,4.057055,0,0,6.155367,1.902113,3.804226,6.155367,1.025895,2.5951,...,7.131699,29.439488,88.052429,6.289459,32,3,20.0,18.0,3.611111,1.583333


Choose the model (trained on 10, 30 or 50 features), select the descriptors

In [219]:
nfeats=10
out = out + str(nfeats)+'/'

In [220]:

D = f'trained_models/xgboost/{nfeats}/MODEL_D_XGBOOST.json'
P = f'trained_models/xgboost/{nfeats}/MODEL_P_XGBOOST.json'
H = f'trained_models/xgboost/{nfeats}/MODEL_H_XGBOOST.json'

D_model = xgb.XGBRegressor(enable_categorical=True) #for some reason without this it doesn't work
D_model.load_model(D)

P_model = xgb.XGBRegressor(enable_categorical=True)
P_model.load_model(P)

H_model = xgb.XGBRegressor(enable_categorical=True)
H_model.load_model(H)

D_features = D_model.feature_names_in_.tolist()
P_features = P_model.feature_names_in_.tolist()
H_features = H_model.feature_names_in_.tolist()

D_descs=descs[D_features]
P_descs=descs[P_features]
H_descs=descs[H_features]


Just in case,check for errors and remove molecules if they have any errors for their descriptors

In [221]:
ERRS = [mordred.error.MissingValueBase,
        mordred.error.Missing,
        mordred.error.Error,
        mordred.error.MultipleFragments,
        mordred.error.Missing3DCoordinate,
        mordred.error.Timeout]


In [222]:
# Function to check if a cell contains an error of specified types
def contains_error(cell):
    return any(isinstance(cell, error_type) for error_type in ERRS)

def errors(dataframe):
    # Iterate over rows and filter out rows with errors
    filtered_rows = []
    err_rows = []
    for index, row in dataframe.iterrows():
        if not any(row.apply(contains_error)):
            filtered_rows.append(row)
        else:
            err_rows.append(index)
            print(index)

    return err_rows
errs = list(set(errors(D_descs)+errors(P_descs)+errors(H_descs)))
print(errs)            
D_descs.drop(errs,inplace=True)
P_descs.drop(errs,inplace=True)
H_descs.drop(errs,inplace=True)

[]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  D_descs.drop(errs,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  P_descs.drop(errs,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  H_descs.drop(errs,inplace=True)


In [223]:
D_descs

Unnamed: 0,fMF,GATS1v,AATS2p,AATS1d,ETA_psi_1,SMR_VSA5,nHetero,ABCGG,SM1_DzZ,VSA_EState8
0,0.000000,1.417230,1.218726,2.625000,0.579937,39.530761,2,8.698271,0.500000,8.191183
1,0.000000,1.513365,1.052109,2.500000,0.418478,19.889315,3,4.244375,0.642857,3.083333
2,0.400000,0.817796,1.721912,3.600000,0.531915,0.000000,2,6.236096,0.500000,0.000000
3,0.348837,1.019023,1.522656,3.590909,0.565217,26.372262,4,14.098377,1.000000,2.532094
4,0.240000,1.036366,1.508772,3.240000,0.579937,13.530619,2,8.658929,0.500000,5.437097
...,...,...,...,...,...,...,...,...,...,...
69,0.237288,1.326503,1.292225,3.575000,0.504342,179.177436,14,32.313871,3.392857,16.331180
70,0.395833,1.017913,1.575161,4.058824,0.599303,24.987450,4,14.851657,0.785714,6.068442
71,0.187500,0.812974,1.666130,4.093750,0.469650,16.982077,9,13.291655,2.829832,-0.551198
72,0.314815,1.028654,1.619472,4.368421,0.552194,57.656600,5,17.255118,1.250000,3.341813


For some reason, some descriptors are not returned as float, even though they appear to be. If that is the case, then the code wont work - so you have to do this:

In [224]:
from pandas.api.types import is_object_dtype

for col in D_descs.columns:
    if is_object_dtype(D_descs[col]):
        D_descs[col]=D_descs[col].astype(float)
        
for col in P_descs.columns:
    if is_object_dtype(P_descs[col]):
        P_descs[col]=P_descs[col].astype(float)
        
for col in H_descs.columns:
    if is_object_dtype(H_descs[col]):
        H_descs[col]=H_descs[col].astype(float)

In [225]:
if len(errs)>0:
    df.drop(errs,inplace=True)

y_D = D_model.predict(D_descs)
y_P = P_model.predict(P_descs)
y_H = H_model.predict(H_descs)

df['D predicted'] = y_D
df['P predicted'] = y_P
df['H predicted'] = y_H

df['PREDICTED SOL^2'] = y_D**2+y_P**2+y_H**2
df['PREDICTED SOL'] = np.sqrt(df['PREDICTED SOL^2'])
df

Unnamed: 0,key,Type,Î´d,Î´p,Î´h,smiles,ID_type,ID,D predicted,P predicted,H predicted,PREDICTED SOL^2,PREDICTED SOL
0,2-Ethyl Hexyl Acrylate,Solvent,14.80,4.70,3.40,C=CC(=O)OCC(CC)CCCC,CAS,103-11-7,15.643625,4.161947,5.260853,289.721375,17.021204
1,2-Nitropropane,Solvent,16.20,12.10,4.10,CC(C)[N+](=O)[O-],CAS,79-46-9,16.129251,13.788355,5.766659,483.525848,21.989222
2,Benzoic Acid,Solvent,17.63,10.10,10.74,O=C(O)c1ccccc1,CAS,65-85-0,19.768663,7.465450,12.118399,593.388550,24.359568
3,Benzyl Butyl Phthalate,Solvent,19.00,11.20,3.10,CCCCOC(=O)c1ccccc1C(=O)OCc1ccccc1,CAS,85-68-7,18.524000,5.929254,4.455266,398.144043,19.953547
4,Benzyl Methacrylate,Solvent,16.80,4.10,4.10,C=C(C)C(=O)OCc1ccccc1,CAS,2495-37-6,17.935057,5.500400,5.252326,379.507568,19.480953
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,Erythromycin,Solvent,18.09,3.35,15.65,CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(...,,,15.305136,2.809570,9.100919,324.967590,18.026857
70,Quinidine,Solvent,20.72,5.53,11.97,C=C[C@H]1CN2CC[C@H]1C[C@@H]2[C@@H](O)c1ccnc2cc...,,,19.047729,6.971541,9.680942,505.139038,22.475298
71,Chloramphenicol,Solvent,23.06,9.50,18.68,O=C(N[C@H](CO)[C@H](O)c1ccc([N+](=O)[O-])cc1)C...,,,17.634207,6.601490,14.537505,565.883972,23.788317
72,Propranolol,Solvent,19.57,3.35,11.04,C[C@]12C[C@H](O)[C@H]3[C@@H](CCC4=CC(=O)C=C[C@...,,,18.159029,6.739834,11.107193,498.545410,22.328131


In [226]:
#Save the prediction data
df.to_csv(out+'data_from_6_filtered_results.csv')