In [1]:
import warnings
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score

# Silence every warning for the rest of the session. This keeps the grid-search
# output readable, but it also hides deprecation and numerical warnings.
warnings.simplefilter("ignore")
# Do not warn when many figures are open at the same time.
plt.rcParams['figure.max_open_warning'] = 0

In [2]:
# Load the feature matrix and the table holding the regression target.
# NOTE(review): both paths are absolute and machine-specific.
features = pd.read_excel(r"/opt/jupyter_data/model/feature/final_features.xlsx")
target = pd.read_excel(r"/opt/jupyter_data/model/feature/final_data.xlsx")

# Plain arrays: every column of `features` is an input, 'logVDss' is the target.
# Rows of the two files are presumably aligned one-to-one -- verify upstream.
X, y = features.values, target['logVDss'].values

# Hold out 20% of the rows as the test set; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [3]:
def train_model(model, param_grid):
    """Fit ``model`` on the training split, grid-searching when a grid is given.

    The function reads the notebook-level arrays ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``.

    Parameters
    ----------
    model : estimator
        Regressor exposing ``fit``, ``predict`` and ``score``.
    param_grid : dict or sequence
        Grid handed to ``GridSearchCV``. When it is empty (``len == 0``) no
        search is run: the model is fitted as-is and scored with
        ``cross_val_score``.

    Returns
    -------
    model : estimator
        The fitted model (the best estimator when a grid search was run).
    test_score : pandas.Series
        Test-set metrics under the keys ``'R2'`` and ``'MSE'``.
    grid_results : pandas.DataFrame or list
        ``cv_results_`` of the grid search as a DataFrame, or an empty list
        when no search was run.

    Notes
    -----
    The printed labels start with "GridSearchCV" in both branches.
    """
    if len(param_grid) == 0:
        # nothing to tune: fit once and estimate the CV error separately
        grid_results = []
        model = model.fit(X_train, y_train)
        fold_scores = cross_val_score(model,
                                      X_train,
                                      y_train,
                                      cv=10,
                                      scoring='neg_mean_squared_error')
        # the scorer returns negated MSE, so take the magnitude
        cv_mean = abs(np.mean(fold_scores))
        cv_std = np.std(fold_scores)
    else:
        # exhaustive 10-fold search over the supplied grid
        search = GridSearchCV(model,
                              param_grid,
                              cv=10,
                              scoring='neg_mean_squared_error',
                              verbose=1,
                              return_train_score=True)
        search.fit(X_train, y_train)
        # keep the refitted winner and look up its row in the results table
        model = search.best_estimator_
        winner = search.best_index_
        grid_results = pd.DataFrame(search.cv_results_)
        cv_mean = abs(grid_results.loc[winner, 'mean_test_score'])
        cv_std = grid_results.loc[winner, 'std_test_score']
        print("Best parameters found by grid search are:", search.best_params_)

    # MSE and R2 on both the training and the held-out test split
    MSE_train_score = mean_squared_error(y_train, model.predict(X_train))
    MSE_test_score = mean_squared_error(y_test, model.predict(X_test))
    R2_train_score = model.score(X_train, y_train)
    R2_test_score = model.score(X_test, y_test)
    test_score = pd.Series({'R2': R2_test_score, 'MSE': MSE_test_score})

    # report
    print('----------------')
    print(model)
    print('----------------')
    report = (
        ("GridSearchCV train R2:  ", R2_train_score),
        ("GridSearchCV train MSE:  ", MSE_train_score),
        ("GridSearchCV test R2:  ", R2_test_score),
        ("GridSearchCV test MSE:  ", MSE_test_score),
    )
    for label, value in report:
        print(label, value)
    print('cross_val: mean=', cv_mean, ', std=', cv_std)

    return model, test_score, grid_results

In [4]:
def fold_n(n, fold_error):
    """Return the fraction of fold errors whose absolute value is below ``n``.

    Parameters
    ----------
    n : int or float
        Threshold; an entry counts when ``abs(FE_score) < n`` (strict).
    fold_error : pandas.DataFrame
        Table with a numeric ``"FE_score"`` column, one row per prediction.

    Returns
    -------
    float
        Number of qualifying rows divided by the total number of rows.
        NaN entries never qualify but still count in the denominator.
        An empty table yields NaN.
    """
    # The original wrote `sum[...]`, which subscripts the builtin instead of
    # calling it and raises TypeError on every call.
    within = fold_error["FE_score"].abs() < n
    # The mean of a boolean mask equals (count of True) / (number of rows).
    return float(within.mean())

In [5]:
def cross_validation(folds, lgb_reg):
    """Run shuffled K-fold cross-validation of ``lgb_reg`` on the notebook-level ``X`` / ``y``.

    Parameters
    ----------
    folds : int
        Number of KFold splits (shuffled, ``random_state=42``).
    lgb_reg : estimator
        Regressor exposing ``fit(X=, y=)``, ``predict`` and ``score``. It is
        refitted in place on every fold.

    Returns
    -------
    pandas.DataFrame
        One row per fold with the columns ``train_mse``, ``test_mse``,
        ``train_r2``, ``test_r2``, ``train_mae``, ``test_mae``, ``train_rmse``
        and ``test_rmse``. An extra ``fold_error`` column holds, in rows 0, 1
        and 2, the values returned by ``fold_n`` for the thresholds 2, 3 and 4,
        computed over the test predictions pooled across all folds. The other
        rows of that column are NaN. When ``folds`` is below 3 the table is
        padded with all-NaN rows so the three values still fit.
    """
    FE_score = []
    # shuffled K-fold split with a fixed seed for reproducibility
    kf = KFold(n_splits=folds, shuffle=True, random_state=42)
    # per-fold training and test MSE, R2, MAE, RMSE
    score_dict = {'train_mse': [], 'test_mse': [], 'train_r2': [], 'test_r2': [], 'train_mae': [], 'test_mae': [],
                  'train_rmse': [], 'test_rmse': []}

    for train_index, test_index in kf.split(X):
        # split training and test set
        X_train_KFold, X_test_KFold = X[train_index], X[test_index]
        y_train_KFold, y_test_KFold = y[train_index], y[test_index]
        # train model
        lgb_reg.fit(X=X_train_KFold,
                    y=y_train_KFold)

        # make predictions
        y_train_KFold_predict = lgb_reg.predict(X_train_KFold)
        y_test_KFold_predict = lgb_reg.predict(X_test_KFold)

        # R2 comes from the estimator; the error metrics use the conventional
        # (y_true, y_pred) argument order (MSE and MAE are symmetric, so the
        # values are unchanged).
        train_r2 = lgb_reg.score(X_train_KFold, y_train_KFold)
        test_r2 = lgb_reg.score(X_test_KFold, y_test_KFold)
        train_mse = mean_squared_error(y_train_KFold, y_train_KFold_predict)
        test_mse = mean_squared_error(y_test_KFold, y_test_KFold_predict)
        train_mae = mean_absolute_error(y_train_KFold, y_train_KFold_predict)
        test_mae = mean_absolute_error(y_test_KFold, y_test_KFold_predict)
        train_rmse = train_mse ** 0.5
        test_rmse = test_mse ** 0.5

        # Fold error: larger value divided by the smaller one.
        # NOTE(review): `y` is read from the 'logVDss' column, so this is a
        # ratio of log-scale values. It can be negative or blow up near zero.
        # Confirm that the ratio is meant to be taken on the log scale.
        for y_pred, y_exp in zip(y_test_KFold_predict, y_test_KFold):
            FE_score.append(y_pred / y_exp if y_pred > y_exp else y_exp / y_pred)

        # collect this fold's metrics
        score_dict['train_mse'].append(train_mse)
        score_dict['test_mse'].append(test_mse)
        score_dict['train_r2'].append(train_r2)
        score_dict['test_r2'].append(test_r2)
        score_dict['train_mae'].append(train_mae)
        score_dict['test_mae'].append(test_mae)
        score_dict['train_rmse'].append(train_rmse)
        score_dict['test_rmse'].append(test_rmse)
    score_dict = pd.DataFrame(score_dict)

    FE_score = pd.DataFrame(FE_score, columns=["FE_score"])
    # The original discarded the result of astype(), so the cast was a no-op.
    FE_score["FE_score"] = FE_score["FE_score"].astype(float)

    # share of pooled predictions within 2-, 3- and 4-fold error
    n_fold_error = [fold_n(m, FE_score) for m in (2, 3, 4)]

    # The three values are stored in rows 0..2. Pad the table first, otherwise
    # folds=2 leaves only two rows and the assignment below would raise.
    n_rows = max(len(score_dict), len(n_fold_error))
    score_dict = score_dict.reindex(range(n_rows))
    score_dict.loc[0:len(n_fold_error) - 1, 'fold_error'] = n_fold_error

    return score_dict

In [6]:
# Baseline: LightGBM with default hyperparameters. The empty param_grid makes
# train_model skip the grid search and only fit and score the model.
model_name = 'LGB'
opt_models = {}
opt_models[model_name], origin_score, origin_results = train_model(
    model=lgb.LGBMRegressor(random_state=42),
    param_grid=[],
)

----------------
LGBMRegressor(random_state=42)
----------------
GridSearchCV train R2:   0.9866676755629826
GridSearchCV train MSE:   0.005157143773662749
GridSearchCV test R2:   0.7988898071722357
GridSearchCV test MSE:   0.07901583255001599
cross_val: mean= 0.07995905254306793 , std= 0.01976270698797487


In [7]:
# Coarse joint search: n_estimators from 100 up to (not including) 300 in steps
# of 10, crossed with learning_rate from 0.1 up to (not including) 1 in steps of 0.1.
opt_models1 = {}
opt_models1[model_name], score1, grid_results1 = train_model(
    model=lgb.LGBMRegressor(num_leaves=31, random_state=42),
    param_grid={
        'n_estimators': np.arange(100, 300, 10),
        'learning_rate': np.arange(0.1, 1, 0.1),
    },
)

Fitting 10 folds for each of 180 candidates, totalling 1800 fits
Best parameters found by grid search are: {'learning_rate': 0.1, 'n_estimators': 290}
----------------
LGBMRegressor(n_estimators=290, random_state=42)
----------------
GridSearchCV train R2:   0.9964266698963652
GridSearchCV train MSE:   0.0013822178707289837
GridSearchCV test R2:   0.8087506292343972
GridSearchCV test MSE:   0.07514153332174882
cross_val: mean= 0.07811558495643607 , std= 0.02067011326628066


In [8]:
# Refined joint search: n_estimators 280..299 in steps of 1, crossed with ten
# evenly spaced learning_rate values from 0.01 to 0.1.
opt_models2 = {}
opt_models2[model_name], score2, grid_results2 = train_model(
    model=lgb.LGBMRegressor(num_leaves=31, random_state=42),
    param_grid={
        'n_estimators': np.arange(280, 300, 1),
        'learning_rate': np.linspace(0.01, 0.1, 10),
    },
)

Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Best parameters found by grid search are: {'learning_rate': 0.07, 'n_estimators': 299}
----------------
LGBMRegressor(learning_rate=0.07, n_estimators=299, random_state=42)
----------------
GridSearchCV train R2:   0.9958155553942302
GridSearchCV train MSE:   0.001618606158800473
GridSearchCV test R2:   0.8137978351784887
GridSearchCV test MSE:   0.07315849519664831
cross_val: mean= 0.0765047345616303 , std= 0.018431926539873918


In [9]:
# Extend the n_estimators range past the previous upper bound: 299..304 in
# steps of 1, crossed with ten evenly spaced learning_rate values from 0.01 to 0.1.
opt_models3 = {}
opt_models3[model_name], score3, grid_results3 = train_model(
    model=lgb.LGBMRegressor(num_leaves=31, random_state=42),
    param_grid={
        'n_estimators': np.arange(299, 305, 1),
        'learning_rate': np.linspace(0.01, 0.1, 10),
    },
)

Fitting 10 folds for each of 60 candidates, totalling 600 fits
Best parameters found by grid search are: {'learning_rate': 0.07, 'n_estimators': 302}
----------------
LGBMRegressor(learning_rate=0.07, n_estimators=302, random_state=42)
----------------
GridSearchCV train R2:   0.9958521216399912
GridSearchCV train MSE:   0.001604461784535998
GridSearchCV test R2:   0.8137442821169399
GridSearchCV test MSE:   0.07317953609807841
cross_val: mean= 0.07649950677555047 , std= 0.01844616989973527


In [10]:
# Search max_depth over 1..19 with learning_rate=0.07 and n_estimators=299 held fixed.
opt_models4 = {}
opt_models4[model_name], score4, grid_results4 = train_model(
    model=lgb.LGBMRegressor(num_leaves=31, learning_rate=0.07, n_estimators=299, random_state=42),
    param_grid={'max_depth': np.arange(1, 20, 1)},
)

Fitting 10 folds for each of 19 candidates, totalling 190 fits
Best parameters found by grid search are: {'max_depth': 16}
----------------
LGBMRegressor(learning_rate=0.07, max_depth=16, n_estimators=299,
              random_state=42)
----------------
GridSearchCV train R2:   0.995877791578806
GridSearchCV train MSE:   0.0015945322658122351
GridSearchCV test R2:   0.8127985715071635
GridSearchCV test MSE:   0.07355110409337572
cross_val: mean= 0.07653128650679022 , std= 0.01875883074135193


In [11]:
# Search num_leaves over 2..49 with learning_rate=0.07 and n_estimators=299 held fixed.
opt_models5 = {}
opt_models5[model_name], score5, grid_results5 = train_model(
    model=lgb.LGBMRegressor(learning_rate=0.07, n_estimators=299, random_state=42),
    param_grid={'num_leaves': np.arange(2, 50, 1)},
)

Fitting 10 folds for each of 48 candidates, totalling 480 fits
Best parameters found by grid search are: {'num_leaves': 31}
----------------
LGBMRegressor(learning_rate=0.07, n_estimators=299, random_state=42)
----------------
GridSearchCV train R2:   0.9958155553942302
GridSearchCV train MSE:   0.001618606158800473
GridSearchCV test R2:   0.8137978351784887
GridSearchCV test MSE:   0.07315849519664831
cross_val: mean= 0.0765047345616303 , std= 0.018431926539873918


In [12]:
# Search reg_alpha over ten evenly spaced values in [0, 1] with
# learning_rate=0.07 and n_estimators=299 held fixed.
opt_models6 = {}
opt_models6[model_name], score6, grid_results6 = train_model(
    model=lgb.LGBMRegressor(learning_rate=0.07, n_estimators=299, random_state=42),
    param_grid={'reg_alpha': np.linspace(0, 1, 10)},
)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best parameters found by grid search are: {'reg_alpha': 0.0}
----------------
LGBMRegressor(learning_rate=0.07, n_estimators=299, random_state=42)
----------------
GridSearchCV train R2:   0.9958155553942302
GridSearchCV train MSE:   0.001618606158800473
GridSearchCV test R2:   0.8137978351784887
GridSearchCV test MSE:   0.07315849519664831
cross_val: mean= 0.0765047345616303 , std= 0.018431926539873918


In [13]:
# Per-fold 10-fold cross-validation metrics for a fresh regressor that uses the
# learning_rate and n_estimators values passed to the later searches.
score = cross_validation(
    folds=10,
    lgb_reg=lgb.LGBMRegressor(learning_rate=0.07, n_estimators=299, random_state=42),
)

In [14]:
# Append the held-out test metrics to the table: row 0 holds R2 and row 1 holds
# MSE, for the default model (origin_score) and for the last tuning step (score6).
score.loc[0:1, 'origin_score'] = origin_score[['R2', 'MSE']].tolist()
score.loc[0:1, 'final_score'] = score6[['R2', 'MSE']].tolist()

In [15]:
# Write the score table to an Excel file in the current working directory,
# without the row index.
score.to_excel("LightGBM_score.xlsx", index=False)

In [16]:
# Mean of the per-fold test-set R2 values from the 10-fold cross-validation
# (displayed as the cell output).
score['test_r2'].mean()

0.8366969495222264