# Libraries

In [1]:
# import pandas for data wrangling
import pandas as pd
import numpy as np

#xgboost and sklearn for learning and validation
import xgboost as xgb
from sklearn.metrics import accuracy_score
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import StratifiedKFold, cross_val_score

# import packages for hyperparameters tuning
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, STATUS_OK, space_eval
from hyperopt.pyll.base import scope

import matplotlib.pyplot as plt
import seaborn as sns
import random

In [2]:
# Seed both Python's and NumPy's global random generators with the same value
seed_value = 42
random.seed(seed_value)
np.random.seed(seed_value)

# Root folder for the files written by this notebook
out = 'results/xgboost/'

# Dataset

In [3]:
# Read the descriptor table; the first CSV column becomes the row index
dataset = pd.read_csv(
    'data/HSP_descriptors_NO_ERRS_ZEROS.csv',
    index_col=[0],
)
dataset


Unnamed: 0,al,CAS,smiles,ABC,ABCGG,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,...,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,δD,δP,δH
0,"1,1,1,2-Tetrachloroethane",b'630-20-6',ClCC(Cl)(Cl)Cl,4.012290,4.284816,5.818626,2.074313,4.148627,5.818626,0.969771,...,20.736383,28,3,24.0,22.0,4.312500,1.375000,18.0,4.4,4.2
1,"1,1,1-Trichloroethane",b'71-55-6',CC(Cl)(Cl)Cl,3.464102,3.464102,4.000000,2.000000,4.000000,4.000000,0.800000,...,16.491254,16,0,20.0,16.0,4.062500,1.000000,16.8,4.3,2.0
2,"1,1,1-Trifluoroethane",b'420-46-2',CC(F)(F)F,3.464102,3.464102,4.000000,2.000000,4.000000,4.000000,0.800000,...,10.502336,16,0,20.0,16.0,4.062500,1.000000,14.6,10.0,0.0
3,"1,1,2,2-Tetrabromoethane",b'79-27-6',BrC(Br)C(Br)Br,3.932653,4.244375,6.000000,2.000000,4.000000,6.000000,1.000000,...,42.711125,29,4,22.0,21.0,4.222222,1.444444,21.0,7.0,8.2
4,"1,1,2,2-Tetrachloroethane",b'79-34-5',ClC(Cl)C(Cl)Cl,3.932653,4.244375,6.000000,2.000000,4.000000,6.000000,1.000000,...,20.736383,29,4,22.0,21.0,4.222222,1.444444,18.8,5.1,5.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1187,Quinine,b'130-95-0',[H][C@@]1([C@@H](C2=CC=NC3=CC=C(C=C23)OC)O)C[C...,18.958632,14.851657,32.595631,2.494948,4.989896,32.595631,1.358151,...,6.753829,1286,42,132.0,161.0,7.000000,5.333333,19.0,6.6,11.0
1188,Sulfur Dioxide,b'9/5/7446',O=S=O,1.414214,1.414214,2.828427,1.414214,2.828427,2.828427,0.942809,...,21.320633,4,0,6.0,4.0,2.250000,1.000000,15.8,8.4,10.0
1189,Thionyl Chloride,b'9/7/7719',O=S(Cl)Cl,2.449490,2.449490,3.464102,1.732051,3.464102,3.464102,0.866025,...,29.476173,9,0,12.0,9.0,3.111111,1.000000,16.9,6.4,6.1
1190,Triethylene Glycol Monooleyl Ether,b'5274-66-8',COCCOCCOCCO,7.071068,6.765664,13.191508,1.931852,3.863703,13.191508,1.199228,...,6.077958,220,8,38.0,36.0,4.250000,3.000000,16.0,3.1,8.4


In [8]:
# Positional split of the table: the first three columns are skipped,
# the last three columns are the targets, everything in between is a feature.
y = dataset.iloc[:, -3:]
X = dataset.iloc[:, 3:-3]

# One Series per Hansen solubility parameter
y_D, y_P, y_H = (y[col] for col in ('δD', 'δP', 'δH'))

# Train, test, val split

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test split; the rest is used for training/validation
(X_train_val, X_test,
 y_train_val, y_test) = train_test_split(X, y, test_size=0.2, random_state=seed_value)

# Per-parameter target Series for both splits
y_train_val_D, y_train_val_P, y_train_val_H = (y_train_val[col] for col in ('δD', 'δP', 'δH'))
y_test_D, y_test_P, y_test_H = (y_test[col] for col in ('δD', 'δP', 'δH'))


# Additional functions

In [68]:
import os

# number of features (descriptors) to keep
n_features=10
out_f=out+str(n_features)+'/' # output folder: results/xgboost/{n_features}/

# Create the output folder up front: later cells write CSV and model files into it,
# and those writes fail if the folder does not exist.
os.makedirs(out_f, exist_ok=True)
print(out_f)

# Pick the highest-ranked descriptors according to a fitted search object
def to_keep(X,model,n_features=n_features):
    """Return the names of the ``n_features`` most important columns of ``X``.

    Parameters
    ----------
    X : DataFrame whose columns were used to fit ``model``.
    model : fitted search object exposing ``best_estimator_.feature_importances_``.
    n_features : number of column names to return (defaults to the notebook-level
        ``n_features`` at the time this function was defined).

    Returns
    -------
    Series named ``'columns'`` holding the selected column names, ordered by
    decreasing importance.
    """
    ranking = pd.DataFrame({
        'columns': X.columns,
        'importances': model.best_estimator_.feature_importances_,
    })
    ranking = ranking.sort_values(by='importances', ascending=False)

    return ranking.iloc[:n_features]['columns']

#function to retrieve best model and its statistics
def get_best(best,X,y,X_train,y_train,X_test,y_test):
    """Refit an XGBRegressor with the given hyperparameters and score it.

    Parameters
    ----------
    best : dict of XGBRegressor keyword arguments; must contain ``'max_depth'``
        (it is cast to ``int`` before use).
    X, y : features/targets used for the "all" scores.
    X_train, y_train : data the model is fitted on.
    X_test, y_test : data used for the "test" scores.

    Returns
    -------
    dict with keys ``"best"`` (the parameters actually used), ``"RMSE all"``,
    ``"R2 all"``, ``"RMSE test"`` and ``"R2 test"``.
    """
    # Work on a copy so the caller's dict is left untouched
    params = dict(best)
    params['max_depth'] = int(params['max_depth'])

    best_model = XGBRegressor(**params)
    best_model.fit(X_train,y_train)

    predictions = best_model.predict(X)
    test_predictions = best_model.predict(X_test)

    # RMSE is computed as sqrt(MSE) so the code does not rely on the `squared`
    # keyword of mean_squared_error, which newer scikit-learn releases deprecate.
    results = { "best": params,
                "RMSE all": np.sqrt(mean_squared_error(y,predictions)),
                "R2 all": r2_score(y,predictions),
                "RMSE test": np.sqrt(mean_squared_error(y_test,test_predictions)),
                "R2 test": r2_score(y_test,test_predictions) }

    return results

/notebooks/NEW_FILTERED/70/


# Training

In [None]:
from sklearn.model_selection import RandomizedSearchCV, KFold

# Search space for the weaker XGBoost model used for feature selection
model1_params = {
    # Maximum depth of the tree, increasing it increases the model complexity.
    "max_depth": [4,5,6],
    # Percentage of columns to be randomly sampled for each tree.
    "colsample_bytree": [0.3,0.5,0.8],
    # reg_alpha provides l1 regularization to the weight, higher values result in more conservative models
    "reg_alpha": [0.1, 1, 10, 100],
    # reg_lambda provides l2 regularization to the weight, higher values result in more conservative models
    "reg_lambda": [0.1, 1, 10, 100]}

# Both CV splitters and the randomized searches are seeded explicitly, so re-running
# this cell gives the same folds and candidates regardless of the global NumPy RNG state.
nsplits1=5
# define the outer cross-validation loop
outer_cv = KFold(n_splits=nsplits1, shuffle=True, random_state=seed_value)

nsplits2=3
# define the inner cross-validation loop
inner_cv = KFold(n_splits=nsplits2, shuffle=True, random_state=seed_value)

# define the weaker models for each HSP parameter
fmd=XGBRegressor(tree_method='gpu_hist',seed=seed_value)
fmp=XGBRegressor(tree_method='gpu_hist',seed=seed_value)
fmh=XGBRegressor(tree_method='gpu_hist',seed=seed_value)

# Randomized search over model1_params for each weaker model (inner cross validation)
gsd = RandomizedSearchCV(fmd, param_distributions=model1_params, cv=inner_cv,
                         scoring='neg_mean_squared_error', random_state=seed_value)
gsp = RandomizedSearchCV(fmp, param_distributions=model1_params, cv=inner_cv,
                         scoring='neg_mean_squared_error', random_state=seed_value)
gsh = RandomizedSearchCV(fmh, param_distributions=model1_params, cv=inner_cv,
                         scoring='neg_mean_squared_error', random_state=seed_value)

# "stronger" model search space (hyperopt); n_estimators and seed are fixed, not searched
space={'max_depth': scope.int(hp.quniform("max_depth", 3, 20, 1)),
        'gamma': hp.uniform ('gamma', 0,20),
        'reg_alpha' : hp.quniform('reg_alpha', 0,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1), #fraction of columns selected for each tree
        'min_child_weight' : hp.quniform('min_child_weight', 0, 20, 1),
        'n_estimators': 1000,
        'seed':seed_value
    }

# Per-parameter list of results, one entry appended per outer fold
outer_scores = {"D":[],"P":[],"H":[]}

max_evals=500 # maximum number of evaluations for the hyperopt search

# fold counter used by the training loop's progress message
i=1

###### TRAINING LOOP #######

def _make_objective(X_fit, y_fit, X_eval, y_eval):
    """Build the hyperopt objective for one HSP parameter on one outer fold.

    The returned function trains an XGBRegressor with the sampled hyperparameters
    on ``(X_fit, y_fit)``, using ``(X_fit, y_fit)`` and ``(X_eval, y_eval)`` as
    evaluation sets with early stopping, and reports the RMSE on ``X_eval`` as loss.
    """
    def objective(params):
        model = xgb.XGBRegressor(**params, tree_method='gpu_hist', eval_metric="rmse",
                                 early_stopping_rounds=100)
        model.fit(X_fit, y_fit,
                  eval_set=[(X_fit, y_fit), (X_eval, y_eval)], verbose=False)

        pred = model.predict(X_eval)
        # sqrt(MSE) instead of squared=False, which newer scikit-learn deprecates
        rmse = np.sqrt(mean_squared_error(y_eval, pred))

        return {'loss': rmse, 'status': STATUS_OK, 'model': model}

    return objective


# (short name, target column, feature-selection search) for each HSP parameter
hsp_targets = (("D", 'δD', gsd), ("P", 'δP', gsp), ("H", 'δH', gsh))

for i, (train_ix, val_ix) in enumerate(outer_cv.split(X_train_val), start=1):

    print(f" CV:{i}/{nsplits1}")

    # split data into train and val sets
    X_train, X_val = X_train_val.iloc[train_ix], X_train_val.iloc[val_ix]
    y_train, y_val = y_train_val.iloc[train_ix], y_train_val.iloc[val_ix]

    # 1) fit the feature-selection search on the current training fold and
    #    keep the n_features most important descriptors per parameter
    kept = {}
    for key, col, search in hsp_targets:
        search.fit(X_train, y_train[col])
        kept[key] = to_keep(X_train, search)

    # Kept as notebook-level names because other cells may refer to them;
    # after the loop they hold the selection of the LAST fold only.
    D_to_keep, P_to_keep, H_to_keep = kept["D"], kept["P"], kept["H"]

    # 2) hyperparameter optimisation of the stronger model on the selected features
    # NOTE(review): fmin is called without `rstate`, so the TPE search is presumably
    # not tied to seed_value - confirm against the installed hyperopt version.
    tuned = {}
    for key, col, _ in hsp_targets:
        cols = kept[key]
        trials = Trials()
        raw_best = fmin(fn=_make_objective(X_train[cols], y_train[col], X_val[cols], y_val[col]),
                        space=space,
                        algo=tpe.suggest,
                        max_evals=max_evals,
                        trials=trials)
        # fmin only returns the hp-labelled values; space_eval rebuilds the full
        # configuration, including the fixed n_estimators and seed entries and the
        # integer cast of max_depth, so the refit uses the configuration that was tuned.
        tuned[key] = space_eval(space, raw_best)

    # 3) refit with the best configuration and store the fold statistics
    for key, col, _ in hsp_targets:
        cols = kept[key]
        res = get_best(tuned[key], X_train_val[cols], y_train_val[col],
                       X_train[cols], y_train[col], X_val[cols], y_val[col])
        outer_scores[key].append({"res": res, "features": cols})

# print the cross-validation scores
print('Nested cross-validation scores:', outer_scores)


# Make dataframes with performance statistics (optional)

In [None]:
# Column name used in `r`  ->  key under which get_best stored the value.
# The "test" scores of get_best were computed on the validation fold, hence "val" here.
metric_map = {"RMSE all": 'RMSE all',
              "RMSE val": 'RMSE test',
              "R2 all": 'R2 all',
              "R2 val": 'R2 test'}

# r[parameter][metric] is the list of that metric over the outer folds
r = {
    par: {
        name: [fold['res'][source] for fold in outer_scores[par]]
        for name, source in metric_map.items()
    }
    for par in ("D", "P", "H")
}
    

Cross-validation results:

In [None]:
# Mean and standard deviation over the outer folds, one row per HSP parameter
hsp_rows = ["D", "P", "H"]

cv_columns = {}
for metric in ("RMSE all", "R2 all", "RMSE val", "R2 val"):
    cv_columns[f"<{metric}>"] = [np.mean(r[row][metric]) for row in hsp_rows]
    cv_columns[f"std({metric})"] = [np.std(r[row][metric]) for row in hsp_rows]

cv_df = pd.DataFrame(cv_columns, index=hsp_rows)

cv_df.to_csv(out_f+f"CV_{nsplits1}fold_results.csv")

# Final training and validation

In [None]:
# Split the train+validation data once more (80/20) for the final fit with early stopping
(X_TRAIN, X_VAL,
 y_TRAIN, y_VAL) = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=seed_value)

# Per-parameter target Series for both parts
y_TRAIN_D, y_TRAIN_P, y_TRAIN_H = (y_TRAIN[col] for col in ('δD', 'δP', 'δH'))
y_VAL_D, y_VAL_P, y_VAL_H = (y_VAL[col] for col in ('δD', 'δP', 'δH'))

In [None]:
# Find the best model and features for each HSP parameter, refit it and report its scores.

def _fit_final_model(label, fold_results, fold_val_r2, y_fit, y_stop, y_all, y_holdout, y_dev):
    """Refit the best cross-validated configuration for one HSP parameter.

    The outer fold with the highest validation R2 supplies the hyperparameters and
    the descriptor subset. The model is fitted on ``X_TRAIN`` with ``X_TRAIN`` and
    ``X_VAL`` as evaluation sets (early stopping), then scored on the whole dataset
    (``X``), the test split (``X_test``) and the train+validation split
    (``X_train_val``). The scores are printed under the heading ``label``.

    Parameters
    ----------
    label : heading printed above the scores.
    fold_results : list of per-fold dicts with ``'res'`` and ``'features'`` entries.
    fold_val_r2 : validation R2 of each fold, aligned with ``fold_results``.
    y_fit, y_stop : targets matching ``X_TRAIN`` and ``X_VAL``.
    y_all, y_holdout, y_dev : targets matching ``X``, ``X_test`` and ``X_train_val``.

    Returns
    -------
    dict with ``"params"``, ``"features"``, ``"model"`` and the 3-tuples
    ``"predictions"``, ``"rmse"``, ``"r2"`` ordered as (all, test, train-val).
    """
    winner = fold_results[np.argmax(fold_val_r2)]
    params = winner['res']['best']
    features = winner['features']

    X_fit, X_stop = X_TRAIN[features], X_VAL[features]

    model = XGBRegressor(**params, tree_method='gpu_hist', eval_metric="rmse", early_stopping_rounds=100)
    model.fit(X_fit, y_fit, eval_set=[(X_fit, y_fit), (X_stop, y_stop)], verbose=False)

    predictions, rmse, r2 = [], [], []
    for frame, truth in ((X, y_all), (X_test, y_holdout), (X_train_val, y_dev)):
        pred = model.predict(frame[features])
        predictions.append(pred)
        # sqrt(MSE) instead of squared=False, which newer scikit-learn deprecates
        rmse.append(np.sqrt(mean_squared_error(truth, pred)))
        r2.append(r2_score(truth, pred))

    print(f"{label}:")
    for split, split_rmse, split_r2 in zip(("all", "test", "train-val"), rmse, r2):
        print(f"RMSE {split}:", split_rmse)
        print(f"R2 {split}:", split_r2)

    return {"params": params, "features": features, "model": model,
            "predictions": tuple(predictions), "rmse": tuple(rmse), "r2": tuple(r2)}


# --- δD ---
final_D = _fit_final_model("D", outer_scores["D"], r["D"]["R2 val"],
                           y_TRAIN_D, y_VAL_D, y_D, y_test_D, y_train_val_D)
best_D, D_features, best_D_model = final_D["params"], final_D["features"], final_D["model"]
predictions_D, test_predictions_D, train_val_predictions_D = final_D["predictions"]
best_D_RMSE_all, best_D_RMSE_test, best_D_RMSE_train_val = final_D["rmse"]
best_D_r2_all, best_D_r2_test, best_D_r2_train_val = final_D["r2"]

# --- δP ---
final_P = _fit_final_model("P", outer_scores["P"], r["P"]["R2 val"],
                           y_TRAIN_P, y_VAL_P, y_P, y_test_P, y_train_val_P)
best_P, P_features, best_P_model = final_P["params"], final_P["features"], final_P["model"]
predictions_P, test_predictions_P, train_val_predictions_P = final_P["predictions"]
best_P_RMSE_all, best_P_RMSE_test, best_P_RMSE_train_val = final_P["rmse"]
best_P_r2_all, best_P_r2_test, best_P_r2_train_val = final_P["r2"]

# --- δH (printed under its own "H:" heading) ---
final_H = _fit_final_model("H", outer_scores["H"], r["H"]["R2 val"],
                           y_TRAIN_H, y_VAL_H, y_H, y_test_H, y_train_val_H)
best_H, H_features, best_H_model = final_H["params"], final_H["features"], final_H["model"]
predictions_H, test_predictions_H, train_val_predictions_H = final_H["predictions"]
best_H_RMSE_all, best_H_RMSE_test, best_H_RMSE_train_val = final_H["rmse"]
best_H_r2_all, best_H_r2_test, best_H_r2_train_val = final_H["r2"]




# Save the models:

In [None]:

# Write one JSON model file per HSP parameter into the output folder
for tag, fitted in (("P", best_P_model), ("D", best_D_model), ("H", best_H_model)):
    fitted.save_model(out_f + f"MODEL_{tag}_XGBOOST.json")


# Optional dataframes

In [None]:
# Scores of the final models: one row per HSP parameter, one column per metric/split
best_cv_df = pd.DataFrame(
    [
        [best_D_RMSE_all, best_D_r2_all, best_D_RMSE_train_val, best_D_r2_train_val, best_D_RMSE_test, best_D_r2_test],
        [best_P_RMSE_all, best_P_r2_all, best_P_RMSE_train_val, best_P_r2_train_val, best_P_RMSE_test, best_P_r2_test],
        [best_H_RMSE_all, best_H_r2_all, best_H_RMSE_train_val, best_H_r2_train_val, best_H_RMSE_test, best_H_r2_test],
    ],
    index=["D", "P", "H"],
    columns=["RMSE all", "R2 all", "RMSE train-val", "R2 train-val", "RMSE test", "R2 test"],
)

best_cv_df.to_csv(out_f+"best_CV_results.csv")

In [None]:
def _true_pred_frame(by_target):
    """Flatten ``{target: {'true': ..., 'pred': ...}}`` into a DataFrame.

    Each (target, kind) pair becomes one entry keyed by that tuple; the frame is
    transposed so that every pair ends up as a column.
    """
    flat = {(target, kind): values
            for target, parts in by_target.items()
            for kind, values in parts.items()}
    return pd.DataFrame.from_dict(flat, orient='index').T


# NOTE: from here on `y_train` and `y_test` are dicts of true/predicted values;
# they replace the objects that previously carried these names.
y_train = {'D':{'true':y_train_val_D.values,'pred':train_val_predictions_D},
           'P':{'true':y_train_val_P.values,'pred':train_val_predictions_P},
           'H':{'true':y_train_val_H.values,'pred':train_val_predictions_H}
           }
y_test = {'D':{'true':y_test_D.values,'pred':test_predictions_D},
           'P':{'true':y_test_P.values,'pred':test_predictions_P},
           'H':{'true':y_test_H.values,'pred':test_predictions_H}
           }

y_train_df = _true_pred_frame(y_train)
# f-string so that the file name carries the actual number of features
y_train_df.to_csv(out_f+f"y_train_XGBOOST_xgboost_nfeats{n_features}.csv")

y_test_df = _true_pred_frame(y_test)
y_test_df.to_csv(out_f+f"y_test_XGBOOST_xgboost_nfeats{n_features}.csv")

Feature importances:

In [None]:
# Pair each final model's importances with the descriptors that model was trained on
# (D_features / P_features / H_features), not with the selection left over from the
# last cross-validation fold.
D_IMP=pd.DataFrame({'Name': D_features,'Importance':best_D_model.feature_importances_})
P_IMP=pd.DataFrame({'Name': P_features,'Importance':best_P_model.feature_importances_})
H_IMP=pd.DataFrame({'Name': H_features,'Importance':best_H_model.feature_importances_})

D_IMP.to_csv(out_f+f'D_XGBOOST_NONONE_xgboost_feature_importances_nfeats{n_features}.csv')
P_IMP.to_csv(out_f+f'P_XGBOOST_NONONE_xgboost_feature_importances_nfeats{n_features}.csv')
H_IMP.to_csv(out_f+f'H_XGBOOST_NONONE_xgboost_feature_importances_nfeats{n_features}.csv')

In [None]:

def _mean_std_table(by_target):
    """Mean and standard deviation of the true and predicted values per HSP parameter."""
    rows = ["D", "P", "H"]
    return pd.DataFrame(
        {
            "<TRUE>": [np.mean(by_target[row]['true']) for row in rows],
            "std(True)": [np.std(by_target[row]['true']) for row in rows],
            "<PREDICTED>": [np.mean(by_target[row]['pred']) for row in rows],
            "std(PREDICTED)": [np.std(by_target[row]['pred']) for row in rows],
        },
        index=rows,
    )


# Test split
y_test_means_stds_df = _mean_std_table(y_test)
y_test_means_stds_df.to_csv(out_f+'y_test_means_stds.csv')

# Train+validation split
y_train_means_stds_df = _mean_std_table(y_train)
y_train_means_stds_df.to_csv(out_f+'y_train_means_stds.csv')

Optional figure:

In [None]:
# Predicted vs. true value for every sample, one panel per HSP parameter
sns.set_style('whitegrid')
sns.set_context("paper")

ynp = y.to_numpy()
predictions = [predictions_D, predictions_P, predictions_H]
colors = ['b', 'y', 'g']

fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))
fig.suptitle('Hansen solubility parameters XGBOOST predictions and true values')

for ax, name, true_vals, pred_vals, color in zip(axs, y.columns, ynp.T, predictions, colors):
    sns.scatterplot(x=true_vals, y=pred_vals, ax=ax, color=color)
    # red identity line: points on it are predicted exactly
    sns.lineplot(x=true_vals, y=true_vals, ax=ax, color='r', alpha=0.8, label=f'{name}-{name}')
    ax.set_xlabel(name)
    ax.set_ylabel(f'{name} XGBOOST prediction')

#fig.savefig(out_f+f'XGBOOST_xgboost_NONONE_1_{n_features}.png')