In [None]:
import numpy as np
import pandas as pd
from rdkit import Chem
# Load the full dataset (the spreadsheet is not provided with the notebook)
# and keep only the rows whose contaminant type is 'Q'.
datasets = pd.read_excel('ML-improve.xlsx')
is_type_q = datasets['Types of contaminants'] == 'Q'
Dipyrone = datasets[is_type_q]

# Substructure patterns, read without a header row; column 0 is used later
# as the SMARTS strings to match against each molecule.
substructure = pd.read_csv('New Match1.txt', header=None)

# Positional columns 7..26 of the 'Q' subset; they are later stacked next to
# the substructure counts to form the feature matrix.
extra_feature_columns = Dipyrone.iloc[:, 7:27]
concatenated = pd.concat([extra_feature_columns], axis=1)
arr3 = concatenated.to_numpy()

In [None]:
#Match all substructures of SMILES (and record the quantity)
# Compile every SMARTS pattern once up front instead of once per molecule.
compiled_patterns = []
for sub in substructure[0].values:
    subMol = Chem.MolFromSmarts(sub)
    if subMol is None:
        # RDKit signals an unparsable pattern by returning None.
        raise ValueError(f"Could not parse substructure SMARTS: {sub!r}")
    compiled_patterns.append((sub, subMol))

# arr1[i][j] = number of matches of substructure j in molecule i
# (one row per entry of datasets.SMILES, one column per substructure).
smis = datasets.SMILES
arr1 = []
for smi in smis:
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # Fail with a clear message instead of an AttributeError further down.
        raise ValueError(f"Could not parse SMILES: {smi!r}")
    counts = []
    for sub, subMol in compiled_patterns:
        n_matches = len(mol.GetSubstructMatches(subMol))
        if n_matches and sub == 'c':
            # The 'c' pattern matches single atoms; the count is divided by 6.
            # NOTE(review): presumably to express it in six-membered aromatic
            # rings -- confirm.
            n_matches = n_matches / 6
        counts.append(n_matches)
    arr1.append(counts)

# Substructure counts first, then the extra dataset columns.
# NOTE(review): arr1 has one row per row of `datasets`, while arr3 was taken
# from the 'Q'-filtered subset; np.hstack requires both to have the same
# number of rows, so this only works when every row is of type 'Q'.
arr = np.hstack((arr1, arr3))

In [None]:
#Containing phenyl group
# NOTE: this cell overwrites `datasets` and `arr1`; run it only once per
# kernel session, otherwise row_numbers is computed against already-filtered
# data while `arr` still holds the unfiltered rows.

# Positions of the molecules with a non-zero count in substructure column 1.
# NOTE(review): the heading implies column 1 is the phenyl pattern -- confirm
# against the order of the substructure file.
row_numbers = [index for index, row in enumerate(arr1) if row[1] != 0]

# row_numbers are positions (they come from enumerate), so select
# positionally with .iloc rather than by label with .loc.
selected_rows = datasets.iloc[row_numbers]
datasets = selected_rows

# The substructure counts of the retained molecules were already computed;
# reuse those rows instead of re-running the RDKit matching on the same SMILES.
arr1 = [list(arr1[index]) for index in row_numbers]
smis = datasets.SMILES

# Keep the matching rows of the combined feature matrix.
arr3 = arr[row_numbers]

In [None]:
# Feature matrix: rows of the combined array for the retained molecules.
X = arr3
# Target: positional column 27 of the 'Q' subset, restricted to the same
# positional rows as X so that features and targets stay aligned (taking
# every row would give a length mismatch whenever molecules were filtered out).
y = Dipyrone.iloc[row_numbers, 27]


import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Hold out 20 % of the samples as a test set, using a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Base estimator; the hyperparameters under study come from the grid below.
model = xgb.XGBRegressor()

# Candidate values that the grid search evaluates exhaustively.
param_grid = {
    'colsample_bytree': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'learning_rate': [0.1, 0.01],
    'max_depth': list(range(3, 11)),           # 3 .. 10
    'alpha': list(range(10, 80, 10)),          # 10 .. 70
    'n_estimators': list(range(10, 101, 10)),  # 10 .. 100
}

# 5-fold cross-validated search scored by R2, using all available cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination.
print("Best parameters found: ", grid_search.best_params_)

# Score the refitted best estimator on both partitions.
best_model = grid_search.best_estimator_
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)
train_r2 = metrics.r2_score(y_train, y_train_pred)
test_r2 = metrics.r2_score(y_test, y_test_pred)
print('training R2:', train_r2)
print('testing R2:', test_r2)

In [None]:
#Find the optimal random_state

import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import metrics

#Parameter candidate values - obtained from the above training, the following is a randomly given set of parameters
param_grid = {
    'colsample_bytree': [0.6],
    'learning_rate': [0.1],
    'max_depth': [7],
    'alpha': [10],
    'n_estimators': [100]
}

#Initialize variables to store the best model and the highest R2 score
best_r2 = -np.inf
best_model = None
best_random_state = None
best_params = None

# NOTE(review): choosing the split seed by its test-set R2 makes the reported
# test score optimistic, because the test data takes part in the selection.
for random_state in range(160):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

    model = xgb.XGBRegressor()
    grid_search = GridSearchCV(model, param_grid, scoring='r2', cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    y_test_pred = grid_search.best_estimator_.predict(X_test)
    test_r2 = metrics.r2_score(y_test, y_test_pred)

    #If the R2 score of the current model is higher than the previous best score, update the optimal model and score
    if test_r2 > best_r2:
        best_r2 = test_r2
        best_model = grid_search.best_estimator_
        best_random_state = random_state
        best_params = grid_search.best_params_
        y_train_pred = best_model.predict(X_train)
        print('Best random_state:', best_random_state)
        print('training R2:', metrics.r2_score(y_train, y_train_pred))
        print('test R2:', test_r2)

if best_model is None:
    # Only reachable when no split produced a comparable (non-NaN) R2 score.
    raise RuntimeError("No random_state produced a valid test R2 score")

# Rebuild the split the best model was actually trained on. After the loop the
# X_train/X_test variables would otherwise hold the split of the LAST seed,
# which would mix that model's training samples into its "test" evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=best_random_state)
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Output the training and testing R2 scores
print("Best parameters found: ", best_params)
print('Best random_state:', best_random_state)
print('training R2:', metrics.r2_score(y_train, y_train_pred))
print('testing R2:', metrics.r2_score(y_test, y_test_pred))

In [None]:
import shap
# Wrap the selected model in a SHAP tree explainer and compute the SHAP
# values for the training samples.
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_train)

# Feature labels follow the column order of the feature matrix:
# the substructure patterns first, then the original dataset columns.
substructure_names = substructure[0].tolist()
feature_names = substructure_names + list(concatenated.columns)

# Summary plot over all features.
shap.summary_plot(shap_values, X_train, feature_names=feature_names)

In [None]:
#Molecular fingerprint analysis
# The substructure-count features are the leading columns of the feature
# matrix (they are stacked before the dataset columns), so derive the slice
# width from the number of substructure names rather than hard-coding it.
# This keeps the sliced columns and their labels the same length.
n_substructures = len(substructure_names)
sub = shap_values[:, :n_substructures]
shap.summary_plot(sub, X_train[:, :n_substructures], feature_names=substructure_names)