This is the code and procedure if you want to predict with trained XGBOOST models on new data

In [None]:
!pip install rdkit-pypi
!pip install mordred
!pip install xgboost

In [182]:
# import pandas for data wrangling
import pandas as pd
import numpy as np

import xgboost as xgb
from sklearn.metrics import accuracy_score


import matplotlib.pyplot as plt
import seaborn as sns
import random

from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import StratifiedKFold, cross_val_score

from rdkit import Chem
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from mordred import Calculator, descriptors
import mordred
# Set the seed
seed_value = 42
np.random.seed(seed_value)
random.seed(seed_value)

''' A workaround for a rdkit issue that sometimes occures when you calculate the descriptors due to numpy versions'''
np.float = float    
np.int = int   #module 'numpy' has no attribute 'int'
np.object = object    #module 'numpy' has no attribute 'object'
np.bool = bool    #module 'numpy' has no attribute 'bool'


out='results/xgboost/'#'path_to/out_folder/'

In [183]:
data_path= 'data/data_from_42_filtered.csv'#'path_to/data/your_data'
df = pd.read_csv(data_path,encoding='unicode_escape',index_col=0)
df.reset_index(inplace=True,drop=True)
df

Unnamed: 0,Name,Smile
0,"1,3- butadien",C=CC=CÃÂ ÃÂ
1,"1,3-BUTANEDIOL",CC(CCO)O
2,1-Chlorobutane,ÃÂ CCCCClÃÂ ÃÂ
3,1-Pentanol,CCCCCOÃÂ ÃÂ
4,2-Ethyl-1-hexanol,CCCCC(CC)COÃÂ ÃÂ
...,...,...
79,N-Methylacetamide,CC(=O)NCÃÂ ÃÂ
80,N-Methyl N-vinylacetamide,CC(=O)N(C)C=CÃÂ ÃÂ
81,N-Methyl Pyrrolidinone,CN1CCCC1=OÃÂ ÃÂ
82,"N,N dimethylacrylamide",CN(C)C(=O)C=CÃÂ


Calculate the descriptors:

In [184]:
smiles=df['Smile'].tolist()
mols = [Chem.MolFromSmiles(smile) for smile in smiles]

calc = Calculator(descriptors,ignore_3D=True) 
descs = calc.pandas(mols)

descs

 25%|██▌       | 21/84 [00:00<00:00, 117.03it/s]



100%|██████████| 84/84 [00:00<00:00, 189.82it/s]


Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,2.121320,2.340100,0,0,4.472136,1.618034,3.236068,4.472136,1.118034,2.155909,...,5.509388,22.328143,54.046950,5.404695,10,1,10.0,8.0,2.5,1.250000
1,3.754314,4.057055,0,0,6.155367,1.902113,3.804226,6.155367,1.025895,2.5951,...,7.131699,29.439488,90.068080,5.629255,32,3,20.0,18.0,3.611111,1.583333
2,2.828427,3.146264,0,0,5.464102,1.732051,3.464102,5.464102,1.09282,2.390167,...,6.192362,25.583106,92.039278,6.574234,20,2,14.0,12.0,2.75,1.500000
3,3.535534,3.869735,0,0,6.987918,1.801938,3.603875,6.987918,1.164653,2.57983,...,6.608001,28.105124,88.088815,4.893823,35,3,18.0,16.0,3.0,1.750000
4,5.656854,6.142781,0,0,10.565187,2.042079,4.084158,10.565187,1.17391,3.008457,...,7.884953,35.263065,130.135765,4.819843,104,8,32.0,32.0,4.361111,2.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,3.047207,3.305183,0,0,5.226252,1.847759,3.695518,5.226252,1.04525,2.408576,...,6.834109,27.254130,73.052764,6.087730,18,2,16.0,14.0,3.361111,1.333333
80,4.530370,5.004088,0,0,7.662988,2.052881,4.105762,7.662988,1.094713,2.766317,...,7.890957,32.688753,99.068414,6.191776,46,6,26.0,26.0,4.472222,1.777778
81,5.128087,5.320836,0,0,8.428639,2.21432,4.21432,8.428639,1.204091,2.862063,...,8.302762,47.572444,99.068414,6.191776,40,5,32.0,35.0,2.972222,1.611111
82,4.530370,5.004088,0,0,7.662988,2.052881,4.105762,7.662988,1.094713,2.766317,...,7.890957,32.688753,99.068414,6.191776,46,6,26.0,26.0,4.472222,1.777778


Choose the model (trained on 10, 30 or 50 features), select the descriptors

In [185]:
nfeats=50
out = out + str(nfeats)+'/'

In [186]:

D = f'trained_models/xgboost/{nfeats}/MODEL_D_XGBOOST.json'
P = f'trained_models/xgboost/{nfeats}/MODEL_P_XGBOOST.json'
H = f'trained_models/xgboost/{nfeats}/MODEL_H_XGBOOST.json'

D_model = xgb.XGBRegressor(enable_categorical=True) #for some reason without this it doesn't work
D_model.load_model(D)

P_model = xgb.XGBRegressor(enable_categorical=True)
P_model.load_model(P)

H_model = xgb.XGBRegressor(enable_categorical=True)
H_model.load_model(H)

D_features = D_model.feature_names_in_.tolist()
P_features = P_model.feature_names_in_.tolist()
H_features = H_model.feature_names_in_.tolist()

D_descs=descs[D_features]
P_descs=descs[P_features]
H_descs=descs[H_features]


Just in case,check for errors and remove molecules if they have any errors for their descriptors

In [187]:
ERRS = [mordred.error.MissingValueBase,
        mordred.error.Missing,
        mordred.error.Error,
        mordred.error.MultipleFragments,
        mordred.error.Missing3DCoordinate,
        mordred.error.Timeout]


In [188]:
# Function to check if a cell contains an error of specified types
def contains_error(cell):
    return any(isinstance(cell, error_type) for error_type in ERRS)

def errors(dataframe):
    # Iterate over rows and filter out rows with errors
    filtered_rows = []
    err_rows = []
    for index, row in dataframe.iterrows():
        if not any(row.apply(contains_error)):
            filtered_rows.append(row)
        else:
            err_rows.append(index)
            print(index)

    return err_rows
errs = list(set(errors(D_descs)+errors(P_descs)+errors(H_descs)))
print(errs)            
D_descs.drop(errs,inplace=True)
P_descs.drop(errs,inplace=True)
H_descs.drop(errs,inplace=True)

11
16
17
24
39
64
16
17
24
11
16
17
24
64
[64, 39, 11, 16, 17, 24]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  D_descs.drop(errs,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  P_descs.drop(errs,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  H_descs.drop(errs,inplace=True)


In [189]:
D_descs

Unnamed: 0,naRing,nAromAtom,ETA_epsilon_3,fMF,AMID,BCUTp-1h,AATS2p,Mv,Xch-6d,nHRing,...,ATSC0d,ATSC1p,MZ,SpMAD_Dzp,ETA_dEpsilon_D,NsOH,ETA_beta_s,ATSC0s,VSA_EState9,VE1_DzZ
0,0,0,0.414286,0.0000,1.71783,1.937229,1.281282,0.562548,0.000000,0,...,1.600000,-3.623127e-01,0.500000,1.780776,0.0,0,1.5,6.4,0.000000,1.985015
1,0,0,0.42,0.0000,1.779863,1.843941,1.073049,0.508671,0.000000,0,...,5.000000,-7.623493e-02,0.520833,4.987263,0.163889,2,3.0,42.104167,0.000000,2.413803
2,0,0,0.417647,0.0000,1.769975,2.20365,1.219557,0.537792,0.000000,0,...,2.357143,-3.702603e-01,0.595238,3.157039,0.0,0,2.25,1.291073,0.000000,2.20578
3,0,0,0.42,0.0000,1.806525,1.849369,1.094570,0.498098,0.000000,0,...,3.111111,-2.657232e-01,0.462963,4.704207,0.075397,1,2.75,23.444444,0.000000,2.413452
4,0,0,0.424138,0.0000,1.851579,1.876194,1.183697,0.503380,0.000000,0,...,7.185185,-2.289782e-01,0.456790,5.846702,0.048519,1,4.25,24.779835,0.000000,2.947101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,0,0,0.417647,0.0000,1.73936,1.814371,0.960181,0.530763,0.000000,0,...,4.250000,-7.289924e-01,0.555556,3.883245,0.105556,0,2.75,32.081019,1.599537,2.199621
80,0,0,0.421739,0.0000,1.786463,1.888686,1.124570,0.556937,0.000000,0,...,7.437500,-1.173714e+00,0.562500,4.840324,0.0,0,4.0,34.416667,1.662037,2.606824
81,0,0,0.433333,0.3125,1.92199,1.851822,1.243684,0.556937,0.235702,1,...,7.937500,-7.457640e-01,0.562500,4.242646,0.0,0,4.5,32.734375,1.842593,2.622777
82,0,0,0.421739,0.0000,1.786463,1.909794,1.132560,0.556937,0.000000,0,...,7.437500,-1.173714e+00,0.562500,4.71551,0.0,0,4.0,34.416667,3.369074,2.607111


For some reason, some descriptors are not returned as float, even though they appear to be. If that is the case, then the code wont work - so you have to do this:

In [190]:
from pandas.api.types import is_object_dtype

for col in D_descs.columns:
    if is_object_dtype(D_descs[col]):
        D_descs[col]=D_descs[col].astype(float)
        
for col in P_descs.columns:
    if is_object_dtype(P_descs[col]):
        P_descs[col]=P_descs[col].astype(float)
        
for col in H_descs.columns:
    if is_object_dtype(H_descs[col]):
        H_descs[col]=H_descs[col].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  D_descs[col]=D_descs[col].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  D_descs[col]=D_descs[col].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  D_descs[col]=D_descs[col].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row

In [191]:
if len(errs)>0:
    df.drop(errs,inplace=True)

y_D = D_model.predict(D_descs)
y_P = P_model.predict(P_descs)
y_H = H_model.predict(H_descs)

df['D predicted'] = y_D
df['P predicted'] = y_P
df['H predicted'] = y_H

df['PREDICTED SOL^2'] = y_D**2+y_P**2+y_H**2
df['PREDICTED SOL'] = np.sqrt(df['PREDICTED SOL^2'])
df

Unnamed: 0,Name,Smile,D predicted,P predicted,H predicted,PREDICTED SOL^2,PREDICTED SOL
0,"1,3- butadien",C=CC=CÃÂ ÃÂ,15.136312,2.267608,5.500339,264.503693,16.263569
1,"1,3-BUTANEDIOL",CC(CCO)O,16.237263,8.501196,21.084248,780.464539,27.936796
2,1-Chlorobutane,ÃÂ CCCCClÃÂ ÃÂ,15.783636,6.045918,2.512587,291.989380,17.087696
3,1-Pentanol,CCCCCOÃÂ ÃÂ,15.625136,5.926874,14.453806,488.185211,22.094913
4,2-Ethyl-1-hexanol,CCCCC(CC)COÃÂ ÃÂ,15.689551,5.207491,11.702744,410.234222,20.254240
...,...,...,...,...,...,...,...
79,N-Methylacetamide,CC(=O)NCÃÂ ÃÂ,16.675545,13.863833,10.825989,587.481689,24.238022
80,N-Methyl N-vinylacetamide,CC(=O)N(C)C=CÃÂ ÃÂ,15.720427,13.521367,7.394744,484.641418,22.014572
81,N-Methyl Pyrrolidinone,CN1CCCC1=OÃÂ ÃÂ,18.115820,12.472983,7.074144,533.801758,23.104151
82,"N,N dimethylacrylamide",CN(C)C(=O)C=CÃÂ,15.720427,13.082963,7.394744,472.977966,21.748056


In [192]:
#Save the prediction data
df.to_csv(out+'data_from_42_filtered_results.csv')