In [1]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor as XGBR
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold

warnings.filterwarnings("ignore")
plt.rcParams.update({'figure.max_open_warning': 0})

In [2]:
# read data
features = pd.read_excel(r"/opt/jupyter_data/model/feature/final_features.xlsx")
target = pd.read_excel(r"/opt/jupyter_data/model/feature/final_data.xlsx")

In [3]:
# extract features and target data
X = features.values
y = target['logVDss'].values
# split the data, 80% is the training set and 20% is the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
def train_model(model, param_grid):
    """This function trains the model and optimizes hyperparameters."""
    if len(param_grid) > 0:
        # setup grid search parameters
        gsearch = GridSearchCV(model,
                               param_grid,
                               cv=10,
                               scoring='neg_mean_squared_error',
                               verbose=1,
                               return_train_score=True)
        # search the grid
        gsearch.fit(X_train, y_train)
        # extract best model from the grid
        model = gsearch.best_estimator_
        best_idx = gsearch.best_index_
        # get cv scores for best model
        grid_results = pd.DataFrame(gsearch.cv_results_)
        cv_mean = abs(grid_results.loc[best_idx, 'mean_test_score'])
        cv_std = grid_results.loc[best_idx, 'std_test_score']
        print("Best parameters found by grid search are:", gsearch.best_params_)
    else:
        grid_results = []
        # model training
        model = model.fit(X_train, y_train)
        cv_results = cross_val_score(model,
                                     X_train,
                                     y_train,
                                     cv=10,
                                     scoring='neg_mean_squared_error')
        cv_mean = abs(np.mean(cv_results))
        cv_std = np.std(cv_results)
    # calculate the MSE and R2 of the train and test sets
    MSE_train_score = mean_squared_error(y_train, model.predict(X_train))
    MSE_test_score = mean_squared_error(y_test, model.predict(X_test))
    R2_train_score = model.score(X_train, y_train)
    R2_test_score = model.score(X_test, y_test)
    test_score = pd.Series({'R2': R2_test_score, 'MSE': MSE_test_score})

    # print stats on model performance
    print('----------------')
    print(model)
    print('----------------')
    print("GridSearchCV train R2:  ", R2_train_score)
    print("GridSearchCV train MSE:  ", MSE_train_score)
    print("GridSearchCV test R2:  ", R2_test_score)
    print("GridSearchCV test MSE:  ", MSE_test_score)
    print('cross_val: mean=', cv_mean, ', std=', cv_std)

    return model, test_score, grid_results

In [5]:
def fold_n(n, fold_error):
    # count the number of fold_error less than n
    number = sum[abs(fold_error["FE_score"]) < n]
    # calculate n-fold fold error
    n_fold_error = number / len(fold_error)
    
    return n_fold_error

In [6]:
def cross_validation(folds, lgb_reg):
    """This function gets the MSE, R2, MAE, RMSE values of the model's ten-fold cross-validation."""
    FE_score = []
    n_fold_error = []
    # ten-fold cross-validation
    kf = KFold(n_splits=folds, shuffle=True, random_state=42)
    # record training and prediction MSE, R2, MAE, RMSE values
    score_dict = {'train_mse': [], 'test_mse': [], 'train_r2': [], 'test_r2': [], 'train_mae': [], 'test_mae': [],
                  'train_rmse': [], 'test_rmse': []}

    for i, (train_index, test_index) in enumerate(kf.split(X)):
        # split training and test set
        X_train_KFold, X_test_KFold = X[train_index], X[test_index]
        y_train_KFold, y_test_KFold = y[train_index], y[test_index]
        # train model
        lgb_reg.fit(X=X_train_KFold,
                    y=y_train_KFold)

        # make predictions
        y_train_KFold_predict = lgb_reg.predict(X_train_KFold)
        y_test_KFold_predict = lgb_reg.predict(X_test_KFold)

        # calculate train and test set MSE, R2, MAE, RMSE
        train_r2 = lgb_reg.score(X_train_KFold, y_train_KFold)
        test_r2 = lgb_reg.score(X_test_KFold, y_test_KFold)
        train_mse = mean_squared_error(y_train_KFold_predict, y_train_KFold)
        test_mse = mean_squared_error(y_test_KFold_predict, y_test_KFold)
        train_mae = mean_absolute_error(y_train_KFold_predict, y_train_KFold)
        test_mae = mean_absolute_error(y_test_KFold_predict, y_test_KFold)
        train_rmse = train_mse ** 0.5
        test_rmse = test_mse ** 0.5

        # calculate fold error
        for y_pred, y_exp in zip(y_test_KFold_predict, y_test_KFold):
            if y_pred > y_exp:
                z = y_pred / y_exp
            else:
                z = y_exp / y_pred
            FE_score.append(z)

        # merge training and prediction MSE, R2, MAE, RMSE values
        score_dict['train_mse'].append(train_mse)
        score_dict['test_mse'].append(test_mse)
        score_dict['train_r2'].append(train_r2)
        score_dict['test_r2'].append(test_r2)
        score_dict['train_mae'].append(train_mae)
        score_dict['test_mae'].append(test_mae)
        score_dict['train_rmse'].append(train_rmse)
        score_dict['test_rmse'].append(test_rmse)
    score_dict = pd.DataFrame(score_dict)

    FE_score = pd.DataFrame(FE_score, columns=["FE_score"])
    FE_score["FE_score"].astype(float)

    # calculate 2-, 3- and 4-fold error
    for m in (2, 3, 4):
        n_fold_error.append(fold_n(m, FE_score))

    score_dict.loc[0:2, 'fold_error'] = n_fold_error

    return score_dict

In [7]:
# places to store optimal models and scores
opt_models = dict()
model_name = 'XGboost'
opt_models[model_name], origin_score, origin_results = train_model(XGBR(random_state=42), [])

----------------
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=40, num_parallel_tree=1, random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)
----------------
GridSearchCV train R2:   0.9965316620296611
GridSearchCV train MSE:   0.0013416053332027014
GridSearchCV test R2:   0.8088156345054106
GridSearchCV test MSE:   0.07511599286784623
cross_val: mean= 0.0821398995189653 , std= 0.01818841235362537


In [8]:
# n_estimators and learning_rate together for hyperparameter optimization
opt_models1 = dict()
opt_models1[model_name], score1, grid_results1 = train_model(XGBR(random_state=42),
                                                             {'n_estimators': np.arange(100, 300, 10),
                                                              'learning_rate': np.arange(0.1, 1, 0.1)}
                                                             )

Fitting 10 folds for each of 180 candidates, totalling 1800 fits
Best parameters found by grid search are: {'learning_rate': 0.1, 'n_estimators': 290}
----------------
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=290, n_jobs=40, num_parallel_tree=1, random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)
----------------
GridSearchCV train R2:   0.9965211854833468
GridSearchCV train MSE:   0.0013456578190126317
GridSearchCV test R2:   0.811799228905939
GridSearchCV test MSE:   0.07394374400151839
cross_val: mean= 0.07556773931180298 , std= 0.019749704358747896


In [9]:
# n_estimators and learning_rate together for hyperparameter optimization
opt_models2 = dict()
opt_models2[model_name], score2, grid_results2 = train_model(XGBR(random_state=42),
                                                             {'n_estimators': np.arange(280, 300, 1),
                                                              'learning_rate': np.linspace(0.01, 0.1, 10)}
                                                             )

Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Best parameters found by grid search are: {'learning_rate': 0.1, 'n_estimators': 295}
----------------
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=295, n_jobs=40, num_parallel_tree=1, random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)
----------------
GridSearchCV train R2:   0.9965430364361609
GridSearchCV train MSE:   0.0013372055415581988
GridSearchCV test R2:   0.8118439845120871
GridSearchCV test MSE:   0.07392615960447048
cross_val: mean= 0.07555193154599735 , std= 0.019744866588559343


In [11]:
# hyperparameter optimization for max_depth
opt_models3 = dict()
opt_models3[model_name], score3, grid_results3 = train_model(XGBR(random_state=42, n_estimators=295, learning_rate=0.1),
                                                             {'max_depth': np.arange(1, 10, 1)}
                                                             )

Fitting 10 folds for each of 9 candidates, totalling 90 fits
Best parameters found by grid search are: {'max_depth': 6}
----------------
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=295, n_jobs=40, num_parallel_tree=1, random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)
----------------
GridSearchCV train R2:   0.9965430364361609
GridSearchCV train MSE:   0.0013372055415581988
GridSearchCV test R2:   0.8118439845120871
GridSearchCV test MSE:   0.07392615960447048
cross_val: mean= 0.07555193154599735 , std= 0.019744866588559343


In [13]:
# hyperparameter optimization for gamma
opt_models4 = dict()
opt_models4[model_name], score4, grid_results4 = train_model(XGBR(random_state=42, n_estimators=295, learning_rate=0.1),
                                                             {'gamma': np.linspace(0, 1, 20)}
                                                             )

Fitting 10 folds for each of 20 candidates, totalling 200 fits
Best parameters found by grid search are: {'gamma': 0.0}
----------------
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0.0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=295, n_jobs=40, num_parallel_tree=1, random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)
----------------
GridSearchCV train R2:   0.9965430364361609
GridSearchCV train MSE:   0.0013372055415581988
GridSearchCV test R2:   0.8118439845120871
GridSearchCV test MSE:   0.07392615960447048
cross_val: mean= 0.07555193154599735 , std= 0.019744866588559343


In [14]:
# hyperparameter optimization for subsample
opt_models5 = dict()
opt_models5[model_name], score5, grid_results5 = train_model(XGBR(random_state=42, n_estimators=295, learning_rate=0.1),
                                                             {'subsample': np.arange(1, 10, 1)}
                                                             )

Fitting 10 folds for each of 9 candidates, totalling 90 fits
Best parameters found by grid search are: {'subsample': 1}
----------------
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=295, n_jobs=40, num_parallel_tree=1, random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)
----------------
GridSearchCV train R2:   0.9965430364361609
GridSearchCV train MSE:   0.0013372055415581988
GridSearchCV test R2:   0.8118439845120871
GridSearchCV test MSE:   0.07392615960447048
cross_val: mean= 0.07555193154599735 , std= 0.019744866588559343


In [15]:
# get the 10-fold cross validation score for the model
score = cross_validation(folds=10, lgb_reg=XGBR(random_state=42, n_estimators=295, learning_rate=0.1))

In [16]:
# add score information before optimization and
# after hyperparameter optimization to the table
score.loc[0:1, 'origin_score'] = [origin_score['R2'], origin_score['MSE']]
score.loc[0:1, 'final_score'] = [score5['R2'], score5['MSE']]

In [17]:
# save file
score.to_excel("XGboost_score.xlsx", index=False)

In [18]:
# 10-fold cross validation mean of test set with respect to R2
score['test_r2'].mean()

0.8330800721534777