In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor as XGBR
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from bayes_opt import BayesianOptimization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the four screening tables; the file names map one-to-one, in order,
# onto the frame names on the left-hand side.
data_ABX, data_ABBX, data_startegy_1, data_startegy_2 = (
    pd.read_csv(csv_name)
    for csv_name in (
        'ABX3_Data_screen.csv',
        'AB1B2X6_Data_screen.csv',
        'Startegy1.csv',
        'Startegy2.csv',
    )
)

In [None]:
# For every table: the feature matrix is everything except the two targets and
# the formula label; the targets are pulled out as separate Series.

# ABX3 table
X_ABX = data_ABX.drop(['formation_energy_per_atom', 'band_gap', 'pretty_formula'], axis=1)
Y_ABX_formation_e = data_ABX['formation_energy_per_atom']
Y_ABX_bandgap = data_ABX['band_gap']

# AB1B2X6 table
X_ABBX = data_ABBX.drop(['formation_energy_per_atom', 'band_gap', 'pretty_formula'], axis=1)
Y_ABBX_formation_e = data_ABBX['formation_energy_per_atom']
Y_ABBX_bandgap = data_ABBX['band_gap']

# Startegy1 table
X_startegy1 = data_startegy_1.drop(['formation_energy_per_atom', 'band_gap', 'pretty_formula'], axis=1)
Y_startegy1_formation_e = data_startegy_1['formation_energy_per_atom']
Y_startegy1_bandgap = data_startegy_1['band_gap']

# Startegy2 table
X_startegy2 = data_startegy_2.drop(['formation_energy_per_atom', 'band_gap', 'pretty_formula'], axis=1)
Y_startegy2_formation_e = data_startegy_2['formation_energy_per_atom']
Y_startegy2_bandgap = data_startegy_2['band_gap']

In [None]:
# ridge
# Exhaustive grid handed to GridSearchCV: 3 alpha values x 7 solvers.
param_grid_Ridge = [
    {
        'alpha': [0.1, 1.0, 10],
        'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
    }
]

# DT
def dt_cv(max_depth, max_features, max_leaf_nodes, min_samples_split, min_samples_leaf):
    """Objective for the decision-tree search: mean R^2 over 10-fold CV.

    Each argument is truncated to an integer with ``int()`` before it is
    passed to ``DecisionTreeRegressor``. The data come from the notebook-level
    ``X_sta`` and ``Y``.

    Returns:
        Mean of the ten per-fold R^2 scores.
    """
    raw_settings = {
        'max_depth': max_depth,
        'max_features': max_features,
        'max_leaf_nodes': max_leaf_nodes,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
    }
    tree = DecisionTreeRegressor(
        random_state=30,
        **{name: int(value) for name, value in raw_settings.items()},
    )
    fold_scores = cross_val_score(tree, X_sta, Y, scoring='r2', cv=10)
    return fold_scores.mean()


# (lower, upper) search interval for each dt_cv argument.
pbounds_DT = {
    'max_depth': (10, 110),
    'max_features': (20, 280),
    'max_leaf_nodes': (30, 300),
    'min_samples_split': (2, 5),
    'min_samples_leaf': (1, 10),
}

#RFR
def rfr_cv(n_estimators, max_depth, max_features, min_samples_split, min_samples_leaf):
    """Objective for the random-forest search: mean R^2 over 10-fold CV.

    ``max_features`` is a numeric index: its integer part selects an entry of
    the notebook-level list ``mf``. The remaining arguments are truncated with
    ``int()``. The data come from the notebook-level ``X_sta`` and ``Y``.

    Returns:
        Mean of the ten per-fold R^2 scores.
    """
    feature_rule = mf[int(max_features)]
    forest = RandomForestRegressor(
        random_state=30,
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        max_features=feature_rule,
        min_samples_split=int(min_samples_split),
        min_samples_leaf=int(min_samples_leaf),
        bootstrap=True,
    )
    return cross_val_score(forest, X_sta, Y, scoring='r2', cv=10).mean()


# Lookup table indexed by int(max_features) inside rfr_cv.
mf = ['log2', 'sqrt']

# (lower, upper) search interval for each rfr_cv argument; the max_features
# interval stays below 2 so that int() yields a valid index into mf.
pbounds_RFR = {
    'n_estimators': (100, 2000),
    'max_depth': (10, 110),
    'max_features': (0, 1.99),
    'min_samples_split': (2, 5),
    'min_samples_leaf': (1, 11),
}




#SVR    BayesianOptimization
from bayes_opt import BayesianOptimization
def svr_cv(epsilon, gamma, C):
    """Objective for the RBF-kernel SVR search: mean R^2 over 10-fold CV.

    ``epsilon`` and ``gamma`` are passed through unchanged; ``C`` is truncated
    with ``int()``. The data come from the notebook-level ``X_sta`` and ``Y``.

    Returns:
        Mean of the ten per-fold R^2 scores.
    """
    regressor = SVR(kernel='rbf', epsilon=epsilon, gamma=gamma, C=int(C))
    fold_scores = cross_val_score(regressor, X_sta, Y, scoring='r2', cv=10)
    return fold_scores.mean()

# (lower, upper) search interval for each svr_cv argument.
pbounds_SVR = {
    'epsilon': (0.005, 0.5),
    'gamma': (1e-6, 0.01),
    'C': (1, 200),
}


# XGBoost
def xgb_cv(n_estimators,eta,reg_alpha,reg_lambda,gamma,max_depth,colsample_bytree,colsample_bylevel,colsample_bynode,min_child_weight):
    """Objective for the XGBoost search: mean R^2 over 10-fold CV.

    ``n_estimators`` and ``max_depth`` are truncated with ``int()``; every
    other argument is forwarded to the regressor unchanged. The data come from
    the notebook-level ``X_sta`` and ``Y``.

    Returns:
        Mean of the ten per-fold R^2 scores.
    """
    xgb = XGBR(
        random_state=30,
        n_estimators=int(n_estimators),
        eta=eta,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        # Forward the sampled value: pbounds_XGB searches 'gamma', so pinning
        # it to a constant here would make that search dimension a no-op.
        gamma=gamma,
        max_depth=int(max_depth),
        colsample_bytree=colsample_bytree,
        colsample_bylevel=colsample_bylevel,
        colsample_bynode=colsample_bynode,
        min_child_weight=min_child_weight,
    )
    r2 = cross_val_score(xgb, X_sta, Y, scoring='r2', cv=10)
    return r2.mean()

# (lower, upper) search interval for each xgb_cv argument.
pbounds_XGB = {
    'n_estimators': (10, 500),
    'eta': (0.01, 0.2),
    'reg_alpha': (0, 5),
    'reg_lambda': (0, 2),
    'gamma': (0, 5),
    'max_depth': (3, 10),
    'colsample_bytree': (0.5, 1),
    'colsample_bylevel': (0.5, 1),
    'colsample_bynode': (0.5, 1),
    'min_child_weight': (1, 10),
}

# MLPR
# (1 hidden layer)
def mlpr_1_cv(hidden_layer_sizes,activation,alpha,learning_rate_init):
    """Objective for the one-hidden-layer MLP search: mean R^2 over 10-fold CV.

    ``hidden_layer_sizes`` is truncated with ``int()`` and wrapped in a
    1-tuple. The integer part of ``activation`` indexes the notebook-level
    list ``act``. ``alpha`` and ``learning_rate_init`` are passed through
    unchanged. The data come from the notebook-level ``X_sta`` and ``Y``.

    Returns:
        Mean of the ten per-fold R^2 scores.
    """
    mlpr = MLPRegressor(
        random_state=30,
        hidden_layer_sizes=(int(hidden_layer_sizes),),
        activation=act[int(activation)],
        learning_rate_init=learning_rate_init,
        max_iter=2500,
        alpha=alpha,
    )
    # Name the metric explicitly, as the other *_cv objectives in this
    # notebook do, rather than relying on the estimator's default scorer.
    r2 = cross_val_score(mlpr, X_sta, Y, scoring='r2', cv=10)
    return r2.mean()

# Lookup table indexed by int(activation) inside the mlpr_*_cv objectives.
act = ['tanh', 'relu', 'identity', 'logistic']


# (lower, upper) search interval for each mlpr_1_cv argument; the activation
# interval stays below 4 so that int() yields a valid index into act.
pbounds_MLPR_1 = {
    'hidden_layer_sizes': (10, 200),
    'activation': (0, 3.99),
    'learning_rate_init': (0.001, 0.01),
    'alpha': (0.00001, 0.1),
}

# (2 hidden layers)
def mlpr_2_cv(hidden_layer_sizes_1,hidden_layer_sizes_2,activation,alpha,learning_rate_init):
    """Objective for the two-hidden-layer MLP search: mean R^2 over 10-fold CV.

    The two layer widths are truncated with ``int()`` and combined into a
    2-tuple. The integer part of ``activation`` indexes the notebook-level
    list ``act``. ``alpha`` and ``learning_rate_init`` are passed through
    unchanged. The data come from the notebook-level ``X_sta`` and ``Y``.

    Returns:
        Mean of the ten per-fold R^2 scores.
    """
    mlpr = MLPRegressor(
        random_state=30,
        hidden_layer_sizes=(int(hidden_layer_sizes_1), int(hidden_layer_sizes_2)),
        activation=act[int(activation)],
        learning_rate_init=learning_rate_init,
        max_iter=2500,
        alpha=alpha,
    )
    # Name the metric explicitly, as the other *_cv objectives in this
    # notebook do, rather than relying on the estimator's default scorer.
    r2 = cross_val_score(mlpr, X_sta, Y, scoring='r2', cv=10)
    return r2.mean()


# (lower, upper) search interval for each mlpr_2_cv argument.
pbounds_MLPR_2 = {
    'hidden_layer_sizes_1': (10, 200),
    'hidden_layer_sizes_2': (10, 200),
    'activation': (0, 3.99),
    'learning_rate_init': (0.001, 0.01),
    'alpha': (0.00001, 0.1),
}

# (3 hidden layers)
def mlpr_3_cv(hidden_layer_sizes_1,hidden_layer_sizes_2,hidden_layer_sizes_3,activation,alpha,learning_rate_init):
    """Objective for the three-hidden-layer MLP search: mean R^2 over 10-fold CV.

    The three layer widths are truncated with ``int()`` and combined into a
    3-tuple. The integer part of ``activation`` indexes the notebook-level
    list ``act``. ``alpha`` and ``learning_rate_init`` are passed through
    unchanged. The data come from the notebook-level ``X_sta`` and ``Y``.

    Returns:
        Mean of the ten per-fold R^2 scores.
    """
    mlpr = MLPRegressor(
        random_state=30,
        hidden_layer_sizes=(
            int(hidden_layer_sizes_1),
            int(hidden_layer_sizes_2),
            int(hidden_layer_sizes_3),
        ),
        activation=act[int(activation)],
        learning_rate_init=learning_rate_init,
        max_iter=2500,
        alpha=alpha,
    )
    # Name the metric explicitly, as the other *_cv objectives in this
    # notebook do, rather than relying on the estimator's default scorer.
    r2 = cross_val_score(mlpr, X_sta, Y, scoring='r2', cv=10)
    return r2.mean()

# (lower, upper) search interval for each mlpr_3_cv argument.
pbounds_MLPR_3 = {
    'hidden_layer_sizes_1': (10, 200),
    'hidden_layer_sizes_2': (10, 200),
    'hidden_layer_sizes_3': (10, 200),
    'activation': (0, 3.99),
    'learning_rate_init': (0.001, 0.01),
    'alpha': (0.00001, 0.1),
}

In [None]:
from sklearn.preprocessing import StandardScaler

# Choose the feature matrix / target pair that every search below runs on.
# These two assignments were left blank, which is a SyntaxError on Run All.
# NOTE(review): the ABX3 formation-energy pair is only a placeholder default;
# swap in X_ABBX / X_startegy1 / X_startegy2 and the matching Y_*_formation_e
# or Y_*_bandgap series as needed.
X = X_ABX
Y = Y_ABX_formation_e

col = X.columns.tolist()

# Standardise every feature column and rebuild a DataFrame that carries the
# original column names (the row index is a fresh 0..n-1 range).
# NOTE(review): the scaler is fitted on all rows before cross-validation, so
# each held-out fold contributes to the scaling statistics; wrapping the
# scaler and estimator in a Pipeline would avoid that leak.
X_sta = pd.DataFrame(StandardScaler().fit_transform(X), columns=col)

# Replace '[', ']' and '<' in column names with '{', '}' and '^'
# (presumably because XGBoost rejects those characters in feature names;
# confirm).
X_sta.columns = X_sta.columns.str.translate("".maketrans({"[":"{", "]":"}","<":"^"}))

In [None]:
# ridge

# Exhaustive 10-fold grid search over param_grid_Ridge, scored by R^2.
clf = GridSearchCV(Ridge(random_state=30),param_grid_Ridge,scoring='r2',cv=10,n_jobs=-1)
clf.fit(X_sta,Y)

# A notebook cell only echoes its final expression, so return the best score
# and the best parameters together rather than on two separate lines.
clf.best_score_, clf.best_params_

In [None]:
# Bayesian search over the decision-tree bounds, maximising dt_cv.
# NOTE(review): the cell labelled "# DT" that follows repeats this exact search
# (same objective, bounds and seed), so running both does the work twice.
optimizer = BayesianOptimization(f=dt_cv, pbounds=pbounds_DT, random_state=30)
optimizer.maximize(n_iter=100)

# Evaluation history first, then the best point found.
print(optimizer.res, optimizer.max, sep='\n')

In [None]:
# DT

# Bayesian search over the decision-tree bounds, maximising dt_cv.
optimizer = BayesianOptimization(f=dt_cv, pbounds=pbounds_DT, random_state=30)
optimizer.maximize(n_iter=100)

# Evaluation history first, then the best point found.
print(optimizer.res, optimizer.max, sep='\n')

In [None]:
#RFR

# Bayesian search over the random-forest bounds, maximising rfr_cv.
optimizer = BayesianOptimization(f=rfr_cv, pbounds=pbounds_RFR, random_state=30)
optimizer.maximize(n_iter=100)

# Evaluation history first, then the best point found.
print(optimizer.res, optimizer.max, sep='\n')

In [None]:
#SVR    BayesianOptimization

# Bayesian search over the SVR bounds, maximising svr_cv.
optimizer = BayesianOptimization(f=svr_cv, pbounds=pbounds_SVR, random_state=30)
optimizer.maximize(n_iter=100)

# Evaluation history first, then the best point found.
print(optimizer.res, optimizer.max, sep='\n')

In [None]:
#MLPR

# Bayesian search over the one-hidden-layer MLP bounds, maximising mlpr_1_cv.
optimizer = BayesianOptimization(f=mlpr_1_cv, pbounds=pbounds_MLPR_1, random_state=30)
optimizer.maximize(n_iter=100)

# Evaluation history first, then the best point found.
print(optimizer.res, optimizer.max, sep='\n')

In [None]:
# Bayesian search over the two-hidden-layer MLP bounds, maximising mlpr_2_cv.
optimizer = BayesianOptimization(f=mlpr_2_cv, pbounds=pbounds_MLPR_2, random_state=30)
optimizer.maximize(n_iter=100)

# Evaluation history first, then the best point found.
print(optimizer.res, optimizer.max, sep='\n')

In [None]:
# Bayesian search over the three-hidden-layer MLP bounds, maximising mlpr_3_cv.
optimizer = BayesianOptimization(f=mlpr_3_cv, pbounds=pbounds_MLPR_3, random_state=30)
optimizer.maximize(n_iter=100)

# Evaluation history first, then the best point found.
print(optimizer.res, optimizer.max, sep='\n')