This notebook contains the code and procedure for predicting with the trained XGBoost models on new data.

In [None]:
!pip install rdkit-pypi
!pip install mordred
!pip install xgboost

In [63]:
# Standard library
import os
import random

# Data wrangling and numerics
import numpy as np
import pandas as pd

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Modelling
import xgboost as xgb
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import StratifiedKFold, cross_val_score
from xgboost import XGBRegressor
from xgboost import plot_importance

# Cheminformatics
import mordred
from mordred import Calculator, descriptors
from rdkit import Chem
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator

pd.options.mode.chained_assignment = None  # default='warn'
# Set the seed of both NumPy's and Python's global random generators
seed_value = 42
np.random.seed(seed_value)
random.seed(seed_value)

# Workaround for an RDKit issue that sometimes occurs when the descriptors are
# calculated: newer NumPy versions no longer provide the aliases np.float, np.int,
# np.object and np.bool ("module 'numpy' has no attribute 'int'").
# Only the aliases that are actually missing are restored, so an attribute that the
# installed NumPy still defines itself is never overwritten.
for _alias, _builtin in (('float', float), ('int', int), ('object', object), ('bool', bool)):
    if not hasattr(np, _alias):
        setattr(np, _alias, _builtin)


# Base folder for the results; the feature-count subfolder is appended further down.
out='results/xgboost/'#'path_to/out_folder/'

In [64]:
# Location of the input table ('path_to/data/your_data')
data_path = 'data/data_from_42_filtered.csv'

# The first CSV column is read as the index and then replaced by a fresh 0..n-1 index.
# NOTE(review): 'unicode_escape' is an unusual codec for a CSV file - confirm the
# file's real encoding.
df = pd.read_csv(
    data_path,
    encoding='unicode_escape',
    index_col=0,
).reset_index(drop=True)
df

Unnamed: 0,Name,Smile,avg.exp.value(cal/cc)1/2
0,"1,3- butadien",C=CC=CÃÂ ÃÂ,7.10
1,"1,3-BUTANEDIOL",CC(CCO)O,14.14
2,1-Chlorobutane,ÃÂ CCCCClÃÂ ÃÂ,8.44
3,1-Pentanol,CCCCCOÃÂ ÃÂ,10.60
4,2-Ethyl-1-hexanol,CCCCC(CC)COÃÂ ÃÂ,9.85
...,...,...,...
57,Succinic anhydride,C1CC(=O)OC1=OÃÂ,15.40
58,Tetrahydrofuran,C1CCOC1ÃÂ ÃÂ,9.10
59,Tetrahydronaphthalene,C1CCC2=CC=CC=C2C1ÃÂ ÃÂ,9.60
60,Toluene,CC1=CC=CC=C1ÃÂ ÃÂ,8.94


Calculate the descriptors:

In [65]:
# SMILES strings are plain ASCII. The loaded 'Smile' column carries stray non-ASCII
# characters and padding around the actual SMILES, so strip every non-ASCII
# character and the surrounding whitespace before parsing.
smiles = (
    df['Smile']
    .astype(str)
    .str.encode('ascii', errors='ignore')
    .str.decode('ascii')
    .str.strip()
    .tolist()
)
mols = [Chem.MolFromSmiles(smile) for smile in smiles]

# RDKit returns None for a string it cannot parse; stop here with the offending
# rows instead of failing later inside the descriptor calculation.
unparsed = [(idx, smile) for idx, smile, mol in zip(df.index, smiles, mols) if mol is None]
if unparsed:
    raise ValueError(f'RDKit could not parse these SMILES (row, SMILES): {unparsed}')

# All mordred descriptors, skipping those that need 3D coordinates
calc = Calculator(descriptors, ignore_3D=True)
descs = calc.pandas(mols)

descs

 60%|█████▉    | 37/62 [00:00<00:00, 85.15it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 62/62 [00:00<00:00, 86.30it/s]


Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,2.121320,2.340100,0,0,4.472136,1.618034,3.236068,4.472136,1.118034,2.155909,...,5.509388,22.328143,54.046950,5.404695,10,1,10.0,8.0,2.5,1.250000
1,3.754314,4.057055,0,0,6.155367,1.902113,3.804226,6.155367,1.025895,2.5951,...,7.131699,29.439488,90.068080,5.629255,32,3,20.0,18.0,3.611111,1.583333
2,2.828427,3.146264,0,0,5.464102,1.732051,3.464102,5.464102,1.09282,2.390167,...,6.192362,25.583106,92.039278,6.574234,20,2,14.0,12.0,2.75,1.500000
3,3.535534,3.869735,0,0,6.987918,1.801938,3.603875,6.987918,1.164653,2.57983,...,6.608001,28.105124,88.088815,4.893823,35,3,18.0,16.0,3.0,1.750000
4,5.656854,6.142781,0,0,10.565187,2.042079,4.084158,10.565187,1.17391,3.008457,...,7.884953,35.263065,130.135765,4.819843,104,8,32.0,32.0,4.361111,2.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,5.168527,5.239955,0,0,8.318418,2.198691,4.11092,8.318418,1.188345,2.861501,...,8.206584,47.382536,100.016044,9.092368,41,4,32.0,34.0,2.972222,1.583333
58,3.535534,3.535534,0,0,6.472136,2.0,3.618034,6.472136,1.294427,2.525424,...,7.147559,41.004802,72.057515,5.542886,15,0,20.0,20.0,1.25,1.250000
59,7.737734,6.634763,0,0,13.683239,2.302776,4.605551,13.683239,1.368324,3.249407,...,9.066816,39.856521,132.093900,6.004268,109,12,50.0,57.0,2.222222,2.277778
60,5.059137,4.785854,0,0,8.720566,2.101003,4.202006,8.720566,1.245795,2.844305,...,8.124151,33.544698,92.062600,6.137507,42,5,30.0,31.0,2.361111,1.666667


Choose the model (trained on 10, 30 or 50 features) and select the corresponding descriptors.

In [66]:
nfeats = 50  # number of features the model was trained on: 10, 30 or 50

# Point `out` at the subfolder for this feature count. A feature-count subfolder
# appended by an earlier run of this cell is removed first, so re-running the cell
# (or switching nfeats) gives e.g. 'results/xgboost/30/' rather than
# 'results/xgboost/50/30/'.
_out_root = out.rstrip('/')
_parent, _, _last = _out_root.rpartition('/')
if _last in {'10', '30', '50'}:
    _out_root = _parent
out = f'{_out_root}/{nfeats}/' if _out_root else f'{nfeats}/'

In [67]:

def _load_regressor(model_path):
    """Return an XGBRegressor restored from the JSON model file at `model_path`."""
    # enable_categorical=True is kept from the original notebook, which notes that
    # loading does not work without it.
    model = xgb.XGBRegressor(enable_categorical=True)
    model.load_model(model_path)
    return model


def _descriptors_for(model, label):
    """Return (feature names, independent copy of those columns of `descs`) for `model`."""
    features = model.feature_names_in_.tolist()
    missing = [name for name in features if name not in descs.columns]
    if missing:
        raise KeyError(f'{label} model needs descriptors that were not calculated: {missing}')
    # .copy() makes the subset an independent table, so the in-place clean-up done in
    # later cells neither aliases `descs` nor relies on suppressed pandas warnings.
    return features, descs[features].copy()


# Model files for the chosen feature count
D = f'trained_models/xgboost/{nfeats}/MODEL_D_XGBOOST.json'
P = f'trained_models/xgboost/{nfeats}/MODEL_P_XGBOOST.json'
H = f'trained_models/xgboost/{nfeats}/MODEL_H_XGBOOST.json'

D_model = _load_regressor(D)
P_model = _load_regressor(P)
H_model = _load_regressor(H)

# Each model gets exactly the descriptor columns it was trained on
D_features, D_descs = _descriptors_for(D_model, 'D')
P_features, P_descs = _descriptors_for(P_model, 'P')
H_features, H_descs = _descriptors_for(H_model, 'H')


Just in case, check the descriptors for errors and remove every molecule that has an error in any of its descriptors.

In [68]:
# Classes from mordred.error that the error check treats as a failed descriptor value
ERRS = [
    getattr(mordred.error, class_name)
    for class_name in (
        'MissingValueBase',
        'Missing',
        'Error',
        'MultipleFragments',
        'Missing3DCoordinate',
        'Timeout',
    )
]


In [69]:
def contains_error(cell):
    """Return True if `cell` is an instance of any of the classes listed in ERRS."""
    # isinstance accepts a tuple of classes and is True when any of them matches
    return isinstance(cell, tuple(ERRS))

def errors(dataframe, verbose=True):
    """Return the index labels of the rows of `dataframe` that contain an error value.

    A cell counts as an error when `contains_error` is true for it.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to scan row by row.
    verbose : bool, default True
        Print each offending index label as it is found (the original behaviour).

    Returns
    -------
    list
        Index labels of the rows with at least one error cell, in row order.
    """
    err_rows = []
    for index, row in dataframe.iterrows():
        # Generator + any() stops at the first error cell of the row. The clean rows
        # are not collected: the original stored them in a list that was never used.
        if any(contains_error(cell) for cell in row):
            err_rows.append(index)
            if verbose:
                print(index)

    return err_rows
# Rows that fail for at least one of the three models, sorted so the list is
# reproducible. The scan always starts from the full descriptor table `descs`, so
# re-running this cell yields the same `errs` instead of an empty list.
errs = sorted(set(
    errors(descs[D_features])
    + errors(descs[P_features])
    + errors(descs[H_features])
))
print(errs)

# Rebuild the per-model tables without the failing rows. drop() returns a new table,
# so nothing is modified in place and the cell can be executed repeatedly.
D_descs = descs[D_features].drop(index=errs)
P_descs = descs[P_features].drop(index=errs)
H_descs = descs[H_features].drop(index=errs)

11
16
17
24
39
16
17
24
11
16
17
24
[39, 11, 16, 17, 24]


For some reason, some descriptors are not returned as floats even though they appear to be. When that happens the prediction code will not work, so the affected columns have to be converted first:

In [70]:
from pandas.api.types import is_object_dtype

# Convert every object-typed column of the three descriptor tables to float,
# modifying the tables in place.
for frame in (D_descs, P_descs, H_descs):
    for col in frame.columns:
        if is_object_dtype(frame[col]):
            frame[col] = frame[col].astype(float)

In [71]:
# Keep only the molecules that still have descriptors. Selecting by the descriptor
# index gives the same rows as dropping `errs`, but stays correct when the cell is
# executed again (a second in-place drop of the same labels raises KeyError).
df = df.loc[D_descs.index].copy()

y_D = D_model.predict(D_descs)
y_P = P_model.predict(P_descs)
y_H = H_model.predict(H_descs)

df['D predicted'] = y_D
df['P predicted'] = y_P
df['H predicted'] = y_H

# Total value = sqrt(D^2 + P^2 + H^2)
df['PREDICTED SOL^2'] = y_D**2+y_P**2+y_H**2
df['PREDICTED SOL'] = np.sqrt(df['PREDICTED SOL^2'])

# For the ref 42 data (the data set configured in this notebook, whose experimental
# column is in (cal/cc)^1/2) the prediction is also reported divided by 2.0455.
# The stored output of this notebook contains this column, so it is computed by
# default; set the flag to False for data where the conversion is not wanted.
# NOTE(review): 2.0455 is presumably the MPa^1/2 per (cal/cc)^1/2 factor - confirm.
CONVERT_TO_CAL_CC = True
if CONVERT_TO_CAL_CC:
    df['PREDICTED SOL (cal/cc)1/2'] = df['PREDICTED SOL']/2.0455
df

Unnamed: 0,Name,Smile,avg.exp.value(cal/cc)1/2,D predicted,P predicted,H predicted,PREDICTED SOL^2,PREDICTED SOL,PREDICTED SOL (cal/cc)1/2
0,"1,3- butadien",C=CC=CÃÂ ÃÂ,7.1,15.136312,2.267608,5.500339,264.503693,16.263569,7.950902
1,"1,3-BUTANEDIOL",CC(CCO)O,14.14,16.237263,8.501196,21.084248,780.464539,27.936796,13.657685
2,1-Chlorobutane,ÃÂ CCCCClÃÂ ÃÂ,8.44,15.783636,6.045918,2.512587,291.98938,17.087696,8.353799
3,1-Pentanol,CCCCCOÃÂ ÃÂ,10.6,15.625136,5.926874,14.453806,488.185211,22.094913,10.801718
4,2-Ethyl-1-hexanol,CCCCC(CC)COÃÂ ÃÂ,9.85,15.689551,5.207491,11.702744,410.234222,20.25424,9.901853
5,2-Ethylhexyl acrylate,CCCCC(CC)COC(=O)C=CÃÂ ÃÂ,8.64,15.510829,3.926084,4.540293,276.614197,16.631723,8.130884
6,Acetic acid,CC(=O)OÃÂ ÃÂ,10.35,15.358447,9.90042,23.988178,909.332886,30.155148,14.742188
7,Acetone,ÃÂ CC(=O)CÃÂ ÃÂ,9.77,15.481897,10.467478,5.78583,382.733063,19.563564,9.564197
8,Acetonitrile,ÃÂ CC#NÃÂ ÃÂ,11.92,15.008001,16.032125,6.816784,528.737732,22.994297,11.241406
9,Acrylic acid,C=CC(=O)O,12.3,16.439789,7.439268,12.406708,479.535767,21.898306,10.705601


In [72]:
# Save the prediction data
# to_csv / to_excel do not create missing folders, so make sure the target exists.
if out:
    os.makedirs(out, exist_ok=True)

# Name the result files after the input file, so predictions for a different data set
# do not overwrite these. For 'data/data_from_42_filtered.csv' this gives the
# original names 'data_from_42_filtered_results.csv' / '.xlsx'.
_results_stem = os.path.splitext(os.path.basename(data_path))[0] + '_results'
df.to_csv(out + _results_stem + '.csv')
df.to_excel(out + _results_stem + '.xlsx')