In [1]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold

warnings.filterwarnings("ignore")
plt.rcParams.update({'figure.max_open_warning': 0})

In [2]:
# read data
# Directory holding the input spreadsheets; change this single constant to run
# the notebook against a different data location.
DATA_DIR = "/opt/jupyter_data/model/feature"
# feature table: all of its columns are used as model inputs (see X below)
features = pd.read_excel(DATA_DIR + "/final_features.xlsx")
# target table: supplies the 'logVDss' column used as the regression target
target = pd.read_excel(DATA_DIR + "/final_data.xlsx")

In [3]:
# extract features and target data as plain arrays
# (every column of `features` becomes a model input; the target is the 'logVDss' column)
# NOTE(review): assumes the rows of `features` and `target` are in the same order — confirm
X = features.values
y = target['logVDss'].values
# split the data, 80% is the training set and 20% is the test set;
# random_state=42 fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
def train_model(model, param_grid):
    """Fit ``model`` on the training split and report train/test/CV metrics.

    A non-empty ``param_grid`` triggers a 10-fold grid search scored by negative
    MSE, and the best estimator found replaces ``model``. An empty ``param_grid``
    fits ``model`` as given and cross-validates it with the same settings.

    The module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test`` are used
    for fitting and evaluation.

    Returns
    -------
    tuple
        ``(model, test_score, grid_results)`` where ``test_score`` is a Series
        with the test-set 'R2' and 'MSE', and ``grid_results`` is the
        ``cv_results_`` table of the search (an empty list when no search ran).
    """
    no_search = len(param_grid) == 0
    if no_search:
        grid_results = []
        # plain fit, followed by a 10-fold CV estimate on the training split
        model = model.fit(X_train, y_train)
        fold_scores = cross_val_score(model, X_train, y_train,
                                      cv=10, scoring='neg_mean_squared_error')
        cv_mean = abs(np.mean(fold_scores))
        cv_std = np.std(fold_scores)
    else:
        # exhaustive search over the grid
        gsearch = GridSearchCV(estimator=model,
                               param_grid=param_grid,
                               cv=10,
                               scoring='neg_mean_squared_error',
                               verbose=1,
                               return_train_score=True)
        gsearch.fit(X_train, y_train)
        # keep the winning estimator and look up its CV statistics
        model = gsearch.best_estimator_
        winner = gsearch.best_index_
        grid_results = pd.DataFrame(gsearch.cv_results_)
        # scores are negated MSE values, hence the abs() on the mean
        cv_mean = abs(grid_results.at[winner, 'mean_test_score'])
        cv_std = grid_results.at[winner, 'std_test_score']
        print("Best parameters found by grid search are:", gsearch.best_params_)

    # MSE on both splits from explicit predictions, R2 via the estimator's score()
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    MSE_train_score = mean_squared_error(y_train, train_pred)
    MSE_test_score = mean_squared_error(y_test, test_pred)
    R2_train_score = model.score(X_train, y_train)
    R2_test_score = model.score(X_test, y_test)
    test_score = pd.Series({'R2': R2_test_score, 'MSE': MSE_test_score})

    # performance report
    print('----------------')
    print(model)
    print('----------------')
    report = (("GridSearchCV train R2:  ", R2_train_score),
              ("GridSearchCV train MSE:  ", MSE_train_score),
              ("GridSearchCV test R2:  ", R2_test_score),
              ("GridSearchCV test MSE:  ", MSE_test_score))
    for label, value in report:
        print(label, value)
    print('cross_val: mean=', cv_mean, ', std=', cv_std)

    return model, test_score, grid_results

In [5]:
def fold_n(n, fold_error):
    """Return the fraction of predictions whose absolute fold error is below ``n``.

    Parameters
    ----------
    n : int or float
        Fold-error threshold (e.g. 2 for the "2-fold" criterion).
    fold_error : pandas.DataFrame
        Table with an ``"FE_score"`` column, one fold-error value per row.

    Returns
    -------
    float
        Share of rows with ``abs(FE_score) < n``; ``nan`` when ``fold_error``
        has no rows (instead of dividing by zero).
    """
    total = len(fold_error)
    if total == 0:
        return float("nan")
    # count the number of fold_error values less than n:
    # summing the boolean mask counts its True entries
    number = (abs(fold_error["FE_score"]) < n).sum()
    # calculate n-fold fold error
    n_fold_error = number / total

    return n_fold_error

In [6]:
def cross_validation(folds, lgb_reg):
    """Run shuffled K-fold cross-validation and collect per-fold metrics.

    The estimator is refit on each fold of the module-level ``X`` / ``y`` arrays
    and scored on both the fold's training part and its held-out part.

    Parameters
    ----------
    folds : int
        Number of KFold splits.
    lgb_reg : estimator
        Regressor exposing ``fit``, ``predict`` and ``score``; it is refit in
        place on every fold.

    Returns
    -------
    pandas.DataFrame
        One row per fold with the columns ``train_mse``, ``test_mse``,
        ``train_r2``, ``test_r2`` (values of ``lgb_reg.score``), ``train_mae``,
        ``test_mae``, ``train_rmse`` and ``test_rmse``. An extra ``fold_error``
        column holds, in rows 0, 1 and 2, the fraction of pooled held-out
        predictions whose fold error is below 2, 3 and 4 (NaN in other rows).
        When ``folds`` is smaller than 3 the table is padded with NaN rows so
        these three values still fit.
    """
    FE_score = []
    # shuffled K-fold split with a fixed seed for reproducibility
    kf = KFold(n_splits=folds, shuffle=True, random_state=42)
    # record training and prediction MSE, R2, MAE, RMSE values
    score_dict = {'train_mse': [], 'test_mse': [], 'train_r2': [], 'test_r2': [], 'train_mae': [], 'test_mae': [],
                  'train_rmse': [], 'test_rmse': []}

    for train_index, test_index in kf.split(X):
        # split training and test set
        X_train_KFold, X_test_KFold = X[train_index], X[test_index]
        y_train_KFold, y_test_KFold = y[train_index], y[test_index]
        # train model
        lgb_reg.fit(X=X_train_KFold,
                    y=y_train_KFold)

        # make predictions
        y_train_KFold_predict = lgb_reg.predict(X_train_KFold)
        y_test_KFold_predict = lgb_reg.predict(X_test_KFold)

        # calculate train and test set MSE, R2, MAE, RMSE
        # (metrics are called in the documented (y_true, y_pred) order)
        train_mse = mean_squared_error(y_train_KFold, y_train_KFold_predict)
        test_mse = mean_squared_error(y_test_KFold, y_test_KFold_predict)
        fold_scores = {
            'train_mse': train_mse,
            'test_mse': test_mse,
            'train_r2': lgb_reg.score(X_train_KFold, y_train_KFold),
            'test_r2': lgb_reg.score(X_test_KFold, y_test_KFold),
            'train_mae': mean_absolute_error(y_train_KFold, y_train_KFold_predict),
            'test_mae': mean_absolute_error(y_test_KFold, y_test_KFold_predict),
            'train_rmse': train_mse ** 0.5,
            'test_rmse': test_mse ** 0.5,
        }
        for key, value in fold_scores.items():
            score_dict[key].append(value)

        # calculate fold error: the larger of the two values over the smaller,
        # pooled across all held-out folds
        # NOTE(review): the ratio is taken directly on the target values; the
        # target column is named 'logVDss', so if y is log-transformed this is a
        # ratio of logs rather than a fold error on the original scale — confirm
        FE_score.extend(
            y_pred / y_exp if y_pred > y_exp else y_exp / y_pred
            for y_pred, y_exp in zip(y_test_KFold_predict, y_test_KFold)
        )
    score_dict = pd.DataFrame(score_dict)

    FE_score = pd.DataFrame(FE_score, columns=["FE_score"])
    # astype returns a new Series, so the result has to be stored for the cast to apply
    FE_score["FE_score"] = FE_score["FE_score"].astype(float)

    # calculate 2-, 3- and 4-fold error
    n_fold_error = [fold_n(m, FE_score) for m in (2, 3, 4)]

    # the three values go into rows 0..2; pad the table when there are fewer
    # folds than values so the assignment cannot fail on a length mismatch
    if len(score_dict) < len(n_fold_error):
        score_dict = score_dict.reindex(range(len(n_fold_error)))
    score_dict.loc[0:2, 'fold_error'] = n_fold_error

    return score_dict

In [7]:
# places to store optimal models and scores
opt_models = dict()
model_name = 'RandomForest'
# baseline: the empty parameter grid makes train_model skip the grid search and
# fit the default RandomForestRegressor directly
opt_models[model_name], origin_score, origin_results = train_model(RandomForestRegressor(random_state=42), [])

----------------
RandomForestRegressor(random_state=42)
----------------
GridSearchCV train R2:   0.9681576573082361
GridSearchCV train MSE:   0.012317097451943088
GridSearchCV test R2:   0.7765437456068218
GridSearchCV test MSE:   0.08779555989241515
cross_val: mean= 0.08922285405412557 , std= 0.019854752827106813


In [8]:
# hyperparameter optimization for n_estimators:
# coarse search over 100, 110, ..., 490 (np.arange excludes the stop value 500)
opt_models1 = dict()
opt_models1[model_name], score1, grid_results1 = train_model(
    RandomForestRegressor(random_state=42),
    {'n_estimators': np.arange(100, 500, 10)})

Fitting 10 folds for each of 40 candidates, totalling 400 fits
Best parameters found by grid search are: {'n_estimators': 290}
----------------
RandomForestRegressor(n_estimators=290, random_state=42)
----------------
GridSearchCV train R2:   0.9694002410461191
GridSearchCV train MSE:   0.011836447358454117
GridSearchCV test R2:   0.7815347965383582
GridSearchCV test MSE:   0.0858345849706088
cross_val: mean= 0.08779526485126396 , std= 0.01810552423251346


In [9]:
# hyperparameter optimization for n_estimators:
# fine search over 270, 271, ..., 309 (step 1), the range around the coarse-search optimum
opt_models2 = dict()
opt_models2[model_name], score2, grid_results2 = train_model(RandomForestRegressor(random_state=42),
                                                             {'n_estimators': np.arange(270, 310, 1)})

Fitting 10 folds for each of 40 candidates, totalling 400 fits
Best parameters found by grid search are: {'n_estimators': 294}
----------------
RandomForestRegressor(n_estimators=294, random_state=42)
----------------
GridSearchCV train R2:   0.9693486520035243
GridSearchCV train MSE:   0.011856402776660741
GridSearchCV test R2:   0.7818210603311847
GridSearchCV test MSE:   0.08572211244198624
cross_val: mean= 0.08775696146054007 , std= 0.01805269790692412


In [10]:
# hyperparameter optimization for max_depth:
# search depths 1..19 with n_estimators fixed at 294
opt_models3 = dict()
opt_models3[model_name], score3, grid_results3 = train_model(RandomForestRegressor(n_estimators=294, random_state=42),
                                                             {'max_depth': np.arange(1, 20, 1)})

Fitting 10 folds for each of 19 candidates, totalling 190 fits
Best parameters found by grid search are: {'max_depth': 19}
----------------
RandomForestRegressor(max_depth=19, n_estimators=294, random_state=42)
----------------
GridSearchCV train R2:   0.9689606735500861
GridSearchCV train MSE:   0.01200647868239768
GridSearchCV test R2:   0.7793724299599267
GridSearchCV test MSE:   0.0866841749047173
cross_val: mean= 0.08781414551415896 , std= 0.01796841485553824


In [11]:
# hyperparameter optimization for max_depth:
# search depths 19..29 with n_estimators fixed at 294
# (presumably extended because the 1..19 search selected its upper bound — confirm)
opt_models4 = dict()
opt_models4[model_name], score4, grid_results4 = train_model(RandomForestRegressor(n_estimators=294, random_state=42),
                                                             {'max_depth': np.arange(19, 30, 1)})

Fitting 10 folds for each of 11 candidates, totalling 110 fits
Best parameters found by grid search are: {'max_depth': 23}
----------------
RandomForestRegressor(max_depth=23, n_estimators=294, random_state=42)
----------------
GridSearchCV train R2:   0.9693697957382192
GridSearchCV train MSE:   0.011848224061820048
GridSearchCV test R2:   0.7813276676553738
GridSearchCV test MSE:   0.08591596553568134
cross_val: mean= 0.08759423518456019 , std= 0.0178950260593276


In [12]:
# hyperparameter optimization for min_samples_leaf:
# search values 1..10 with n_estimators fixed at 294
opt_models5 = dict()
opt_models5[model_name], score5, grid_results5 = train_model(RandomForestRegressor(n_estimators=294, random_state=42),
                                                             {'min_samples_leaf': np.arange(1, 11, 1)})

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best parameters found by grid search are: {'min_samples_leaf': 1}
----------------
RandomForestRegressor(n_estimators=294, random_state=42)
----------------
GridSearchCV train R2:   0.9693486520035243
GridSearchCV train MSE:   0.011856402776660741
GridSearchCV test R2:   0.7818210603311847
GridSearchCV test MSE:   0.08572211244198624
cross_val: mean= 0.08775696146054007 , std= 0.01805269790692412


In [13]:
# hyperparameter optimization for min_samples_split:
# search values 2..21 with n_estimators fixed at 294
opt_models6 = dict()
opt_models6[model_name], score6, grid_results6 = train_model(RandomForestRegressor(n_estimators=294, random_state=42),
                                                             {'min_samples_split': np.arange(2, 22, 1)})

Fitting 10 folds for each of 20 candidates, totalling 200 fits
Best parameters found by grid search are: {'min_samples_split': 2}
----------------
RandomForestRegressor(n_estimators=294, random_state=42)
----------------
GridSearchCV train R2:   0.9693486520035243
GridSearchCV train MSE:   0.011856402776660741
GridSearchCV test R2:   0.7818210603311847
GridSearchCV test MSE:   0.08572211244198624
cross_val: mean= 0.08775696146054007 , std= 0.01805269790692412


In [14]:
# get the 10-fold cross validation score for the model
# (n_estimators=294 from the fine search; every other hyperparameter is left at its default)
# NOTE(review): the max_depth value selected by the search above is not applied here — confirm this is intended
score = cross_validation(folds=10, lgb_reg=RandomForestRegressor(n_estimators=294, random_state=42))

In [15]:
# add score information before optimization and
# after hyperparameter optimization to the table:
# row 0 holds the test-set R2 and row 1 the test-set MSE (.loc slices include both ends);
# the remaining rows of the two new columns stay NaN
score.loc[0:1, 'origin_score'] = [origin_score['R2'], origin_score['MSE']]
# final_score comes from score2, the result of the fine n_estimators search
score.loc[0:1, 'final_score'] = [score2['R2'], score2['MSE']]

In [16]:
# save file: write the score table to the current working directory without the row index
score.to_excel("RF_score.xlsx", index=False)

In [17]:
# 10-fold cross validation mean of test set with respect to R2
# (average of the per-fold held-out scores stored in the 'test_r2' column)
score['test_r2'].mean()

0.7995211582593506