In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor as XGBR
from sklearn.neural_network import MLPRegressor

In [None]:
data = pd.read_csv('Startegy1.csv')    # after this run finishes, change the file name to Startegy2 and run again

In [None]:
# `df` is just a second name for the loaded table (no copy is made).
df = data

# Feature matrix: every column except the formula label and the two targets.
X = df.drop(
    columns=[
        'pretty_formula',
        'formation_energy_per_atom',
        'band_gap',
    ]
)

# Regression targets, one Series per property.
Y_bg = df['band_gap']
Y_fe = df['formation_energy_per_atom']

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column with StandardScaler and wrap the result in a
# fresh DataFrame.  The frame is built directly from the scaler output rather
# than by copying X into an array and overwriting it through `.loc`, so the
# result is always a float frame and never depends on pandas implicitly
# upcasting integer-typed columns during assignment.
# The result carries a default RangeIndex (X's own index is not propagated).
# NOTE(review): the scaler is fitted on every row of X, i.e. before any
# cross-validation split performed further down -- confirm this is intended.
col = X.columns.tolist()
X_sta = pd.DataFrame(np.asarray(StandardScaler().fit_transform(X)), columns=col)

# Replace "[", "]" and "<" in the column names with "{", "}" and "^".
# NOTE(review): presumably required because XGBoost rejects these characters
# in feature names -- confirm.
X_sta.columns = X_sta.columns.str.translate(str.maketrans({"[": "{", "]": "}", "<": "^"}))

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Filter the standardised features with a VarianceThreshold left at its
# default settings, then rebuild a labelled DataFrame from the surviving
# columns (fit_transform's output is re-wrapped because the column names are
# re-attached by hand below).
selector = VarianceThreshold()
X_var0 = selector.fit_transform(X_sta)

# Positions of the retained columns, mapped back to their names in X_sta.
selector_name_index = selector.get_support(indices=True)
fea_name = X_sta.columns.values.tolist()
selector_name = [fea_name[kept] for kept in selector_name_index]

X_var0 = pd.DataFrame(X_var0, columns=selector_name)
X_var0

In [None]:
# Models used for the Startegy1 data set (hyperparameters are fixed here).
# Every estimator is seeded with random_state=30.

from sklearn.feature_selection import SelectFromModel
from sklearn.inspection import permutation_importance


def _rank_features(importances, names):
    """Return a two-column frame sorted by importance, highest first.

    Column 0 holds the importance value and column 1 the feature name.
    Ties on importance are ordered by feature name (descending), because the
    (importance, name) tuples are sorted as a whole.
    """
    return pd.DataFrame(sorted(zip(importances, names), reverse=True))


# XGBoost regressor
xgb = XGBR(
    random_state=30,
    n_estimators=382,
    eta=0.0263,
    reg_alpha=0,
    reg_lambda=0,
    gamma=0,
    max_depth=10,
    colsample_bytree=1,
    colsample_bylevel=0.5,
    colsample_bynode=1,
    min_child_weight=9.998,
)

# Random forest regressor
rfr = RandomForestRegressor(
    random_state=30,
    n_estimators=944,
    max_depth=17,
    max_features='sqrt',
    min_samples_split=3,
    min_samples_leaf=1,
    bootstrap=True,
)

# Multi-layer perceptron regressor
mlpr = MLPRegressor(
    random_state=30,
    hidden_layer_sizes=(78, 57, 69),
    activation='logistic',
    learning_rate_init=0.01,
    learning_rate='constant',
    solver='adam',
    max_iter=2000,
    alpha=0.1,
)

# Fit each model on the full variance-filtered feature set against the band
# gap target and rank the features by that model's importance measure.
# Note that the importances are computed on the same rows the models were
# fitted on.

# Tree models expose importances directly after fitting.
fea_imp1 = xgb.fit(X_var0, Y_bg).feature_importances_
features1 = _rank_features(fea_imp1, X_var0.columns)

fea_imp2 = rfr.fit(X_var0, Y_bg).feature_importances_
features2 = _rank_features(fea_imp2, X_var0.columns)

# The MLP has no built-in importances, so permutation importance is used.
# The shuffling is seeded so the ranking is reproducible between runs, in
# line with the seeded estimators above.
mlpr.fit(X_var0, Y_bg)
fea_imp3 = permutation_importance(mlpr, X_var0, Y_bg, n_repeats=20, random_state=30)
features3 = _rank_features(fea_imp3.importances_mean, X_var0.columns)

In [None]:
# Re-order the variance-filtered features by each model's importance ranking.
# Column 1 of features1/2/3 holds the feature names, most important first, so
# selecting the columns by that list yields the same frames as inserting them
# one by one, without growing a DataFrame column-by-column inside a loop.
X_xgb = X_var0.loc[:, features1[1].tolist()]
X_rfr = X_var0.loc[:, features2[1].tolist()]
X_mlpr = X_var0.loc[:, features3[1].tolist()]

In [None]:
# Cross-validate each model on growing subsets of its own importance-ordered
# features: the top 20, the top 40, the top 60, ...
# Only whole multiples of 20 are evaluated; if the number of columns in X_var0
# is not a multiple of 20, the remaining trailing features are never scored.
Y = Y_bg
score_xgb = []
score_mlpr = []
score_rfr = []

# (estimator, importance-ordered features, list collecting its mean CV score);
# evaluated in this order for every subset size.
runs = (
    (xgb, X_xgb, score_xgb),
    (rfr, X_rfr, score_rfr),
    (mlpr, X_mlpr, score_mlpr),
)

n_steps = X_var0.shape[1] // 20
for step in range(1, n_steps + 1):
    top_k = step * 20
    for model, ranked, scores in runs:
        # Mean of the 10-fold scores, using the estimator's default scorer.
        fold_scores = cross_val_score(model, ranked.iloc[:, :top_k], Y, cv=10)
        scores.append(fold_scores.mean())