In [2]:
import pickle as pkl
import warnings

import numpy as np
import pandas as pd
import xgboost
from scipy.stats import randint
from scipy.stats import uniform
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

In [2]:
# Read the nanoparticle table from a CSV expected in the notebook's working
# directory, then preview the first rows to check the column layout.
data = pd.read_csv(filepath_or_buffer='Pt_nanoparticle_subset.csv')
data.head(n=5)

Unnamed: 0,ID,N_total,N_bulk,N_surface,R_min,R_max,R_diff,R_avg,R_std,R_skew,...,q6q6_T8,q6q6_T9,q6q6_T10,q6q6_T11,q6q6_T12,Surf_defects_mol,Surf_micros_mol,Surf_facets_mol,Total_E,Formation_E
0,1,1732,1129,603,15.118,18.9872,3.8692,17.2471,0.6262,-0.1372,...,64,88,52,43,40,0.0,121.5956,267.1222,-9237.3055,756.3345
1,2,1755,1150,605,15.3043,20.0506,4.7463,17.3469,0.6597,0.2938,...,69,74,44,30,50,0.6383,117.4488,266.1747,-9335.3608,790.9892
2,3,4672,3456,1216,21.5357,27.5179,5.9822,24.5075,0.9193,-0.0729,...,261,258,243,182,696,0.4796,84.8806,205.7275,-25287.565,1669.875
3,4,6823,5122,1701,23.3351,35.982,12.6469,28.0471,1.9581,0.4768,...,417,488,419,400,1730,0.6567,87.3462,189.9616,-37049.171,2319.539
4,5,10733,8349,2384,25.7149,40.8339,15.119,32.7271,2.122,-0.3342,...,683,782,685,713,4006,0.6262,83.9156,163.6563,-58602.25,3327.16


In [3]:
# Features: every column except the first one and the last five.
# In the preview printed by data.head(), the first column is ID and the last
# five are Surf_defects_mol, Surf_micros_mol, Surf_facets_mol, Total_E and
# Formation_E. The selection is positional, so it silently changes meaning if
# the CSV column order changes.
X = data.iloc[:, 1:-5]

# Candidate targets: the last five columns, taken in file order
# (y_ba is column -5, ..., y_be is column -1).
y_ba, y_bb, y_bc, y_bd, y_be = (data.iloc[:, pos] for pos in range(-5, 0))

In [4]:
# Min-max scale every feature column and rebuild a DataFrame that keeps the
# original column names (the result gets a fresh default row index).
# NOTE(review): the scaler is fitted on the full feature matrix, i.e. before
# the train/test split in a later cell, so test rows contribute to the fitted
# minima/maxima. Consider fitting on the training rows only if that matters.
min_max_scaler = preprocessing.MinMaxScaler()
X_scaled = pd.DataFrame(
    data=min_max_scaler.fit_transform(X.to_numpy()),
    columns=X.columns,
)
X_scaled.head()

Unnamed: 0,N_total,N_bulk,N_surface,R_min,R_max,R_diff,R_avg,R_std,R_skew,R_kurt,...,q6q6_T3,q6q6_T4,q6q6_T5,q6q6_T6,q6q6_T7,q6q6_T8,q6q6_T9,q6q6_T10,q6q6_T11,q6q6_T12
0,0.099987,0.082954,0.159792,0.384908,0.183569,0.046382,0.32301,0.016219,0.529445,0.089511,...,0.422857,0.376771,0.252465,0.134104,0.065704,0.032016,0.030705,0.052632,0.033541,0.003384
1,0.101455,0.084574,0.160442,0.390694,0.203226,0.06919,0.326296,0.020016,0.631094,0.105145,...,0.405714,0.427762,0.22288,0.128324,0.07509,0.034517,0.02582,0.044534,0.023401,0.00423
2,0.287583,0.26252,0.358883,0.584235,0.341265,0.101329,0.56209,0.049439,0.54461,0.090103,...,0.897143,0.827195,0.546351,0.301734,0.204332,0.130565,0.090021,0.245951,0.141966,0.058883
3,0.424834,0.39108,0.516401,0.640123,0.49773,0.274641,0.678646,0.167177,0.674253,0.094952,...,1.0,1.0,0.690335,0.517919,0.358845,0.208604,0.170272,0.424089,0.312012,0.146362
4,0.674324,0.640096,0.738227,0.714037,0.587421,0.338927,0.832755,0.185753,0.482984,0.098917,...,0.937143,0.963173,0.863905,0.590751,0.519134,0.341671,0.272854,0.69332,0.556162,0.338917


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# partition repeatable. The target used here is y_be (the last CSV column).
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_be,
    test_size=0.2,
    random_state=0,
)

In [6]:
# Persist the split to disk as a single pickled list.
# `pkl` comes from the notebook's import cell.
# Stored order: X_train, X_test, y_train, y_test. Whatever reads the file back
# must unpack the four objects in this same order.
with open("train_test.pkl", "wb") as f:
    pkl.dump([X_train, X_test, y_train, y_test], f)

In [4]:
# Restore a previously saved split from disk.
# `pkl` comes from the notebook's import cell.
# The file is expected to hold a four-item sequence in the order
# X_train, X_test, y_train, y_test.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only load a train_test.pkl that was produced by this notebook or another
# trusted source.
with open("train_test.pkl", "rb") as f:
    X_train, X_test, y_train, y_test = pkl.load(f)

In [6]:
# Disabled alternative search spaces, kept for reference only; nothing in this
# cell executes. Both sample learning_rate and colsample_bytree over [0, 1] and
# use deeper max_depth ranges; the first one also samples gamma.
# hps = dict(learning_rate=uniform(loc=0,scale=1), colsample_bytree=uniform(loc=0,scale=1), 
#            max_depth=np.arange(5,51), alpha=list(np.arange(0,11)), gamma=uniform(loc=0,scale=10))
# hps = dict(learning_rate=uniform(loc=0,scale=1), colsample_bytree=uniform(loc=0,scale=1), 
#            max_depth=np.arange(10,31), alpha=list(np.arange(0,11)))

In [5]:
# Suppress UserWarning messages from here on. The filter has no module
# restriction, so it hides UserWarnings raised by any library.
warnings.filterwarnings(action='ignore', category=UserWarning)

# Hyper-parameter search space used by the search cells that follow.
# scipy's uniform(loc, scale) is continuous on [loc, loc + scale].
hps = {
    'learning_rate': uniform(loc=0.1, scale=0.5),     # 0.1 to 0.6
    'colsample_bytree': uniform(loc=0.1, scale=0.9),  # 0.1 to 1.0
    'max_depth': [3, 4, 5, 6, 7, 8],
    'alpha': list(np.arange(0, 11)),                  # integers 0..10
    'n_estimators': np.arange(50, 251),               # integers 50..250
}

In [19]:
# Randomized hyper-parameter search over `hps` for an XGBoost regressor with a
# squared-error objective: 500 sampled settings, seeded for repeatability.
# Cross-validation and scoring are left at the search's defaults.
xg_reg = xgboost.XGBRegressor(objective='reg:squarederror')
xgb = RandomizedSearchCV(
    estimator=xg_reg,
    param_distributions=hps,
    n_iter=500,
    random_state=0,
)
xgb.fit(X_train, y_train)

# Predict on both partitions with the fitted search object.
y_pred = xgb.predict(X_train)
y_test_pred = xgb.predict(X_test)

# Root-mean-squared error, in the units of the target.
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

# Report the selected settings plus train/test error and score.
# NOTE(review): score() uses the search's default scorer, presumably R^2 for a
# regressor; confirm.
print(f'best parameters: {xgb.best_params_}')
print(f'RMSE_train: {rmse_train}')
print(f'RMSE_test: {rmse_test}')
print(f'train score: {xgb.score(X_train, y_train)}')
print(f'test score: {xgb.score(X_test, y_test)}')

best parameters: {'alpha': 6, 'colsample_bytree': 0.4317261455166558, 'learning_rate': 0.16845013584279947, 'max_depth': 4, 'n_estimators': 198}
RMSE_train: 5.602268548572958
RMSE_test: 58.13968965317432
train score: 0.9999794125837332
test score: 0.9978686361948852


In [18]:
# NOTE(review): SequenceSearchCV does not appear to be part of upstream
# scikit-learn; presumably this import relies on a patched or forked build.
# Confirm the environment before re-running. The import stays in this cell so
# that a failure here does not break the notebook's main import cell.
from sklearn.model_selection import SequenceSearchCV

# Search the `hps` space with SequenceSearchCV (500 iterations) for an XGBoost
# regressor with a squared-error objective. No random_state is passed here.
xg_reg = xgboost.XGBRegressor(objective='reg:squarederror')
xgb = SequenceSearchCV(xg_reg, hps, n_iter=500)
xgb.fit(X_train, y_train)

# Predict on both partitions with the fitted search object.
y_pred = xgb.predict(X_train)
y_test_pred = xgb.predict(X_test)

# Root-mean-squared error, in the units of the target.
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

# Report the selected settings plus train/test error and score.
print(f'best parameters: {xgb.best_params_}')
print(f'RMSE_train: {rmse_train}')
print(f'RMSE_test: {rmse_test}')
print(f'train score: {xgb.score(X_train, y_train)}')
print(f'test score: {xgb.score(X_test, y_test)}')

best parameters: {'learning_rate': 0.117578125, 'colsample_bytree': 0.5183593750000001, 'max_depth': 3, 'alpha': 6, 'n_estimators': 241}
RMSE_train: 13.65881216444451
RMSE_test: 54.847085363824824
train score: 0.9998776229824589
test score: 0.9981032098960253


In [15]:
# NOTE(review): HaltonSearchCV does not appear to be part of upstream
# scikit-learn; presumably this import relies on a patched or forked build.
# Confirm the environment before re-running. The import stays in this cell so
# that a failure here does not break the notebook's main import cell.
from sklearn.model_selection import HaltonSearchCV

# Search the `hps` space with HaltonSearchCV (500 iterations) for an XGBoost
# regressor with a squared-error objective. No random_state is passed here.
xg_reg = xgboost.XGBRegressor(objective='reg:squarederror')
xgb = HaltonSearchCV(xg_reg, hps, n_iter=500)
xgb.fit(X_train, y_train)

# Predict on both partitions with the fitted search object.
y_pred = xgb.predict(X_train)
y_test_pred = xgb.predict(X_test)

# Root-mean-squared error, in the units of the target.
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

# Report the selected settings plus train/test error and score.
print(f'best parameters: {xgb.best_params_}')
print(f'RMSE_train: {rmse_train}')
print(f'RMSE_test: {rmse_test}')
print(f'train score: {xgb.score(X_train, y_train)}')
print(f'test score: {xgb.score(X_test, y_test)}')

best parameters: {'learning_rate': 0.1390625, 'colsample_bytree': 0.5444444444444444, 'max_depth': 3, 'alpha': 8, 'n_estimators': 182}
RMSE_train: 15.072459327747035
RMSE_test: 55.44259087169345
train score: 0.9998509807955089
test score: 0.9980617972660085
