In [1]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.svm import SVR
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV, cross_val_score, KFold

# NOTE(review): this silences every warning category for the rest of the
# session, so anything later cells report as a warning (for example during
# model fitting) will not be displayed — consider narrowing the filter.
warnings.filterwarnings("ignore")
# presumably 0 switches off matplotlib's "too many open figures" warning — TODO confirm
plt.rcParams.update({'figure.max_open_warning': 0})

In [2]:
# read data
# Single place to repoint the notebook at another data location.
DATA_DIR = "/opt/jupyter_data/model/feature"
features = pd.read_excel(DATA_DIR + "/final_features.xlsx")
target = pd.read_excel(DATA_DIR + "/final_data.xlsx")
# The two sheets are paired row-by-row when X and y are built, so a length
# mismatch must be caught here rather than surfacing later as a confusing error.
assert len(features) == len(target), "features and target must have the same number of rows"

In [3]:
# pull the feature matrix and the logVDss target out as NumPy arrays
X, y = features.values, target['logVDss'].values
# hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [4]:
def train_model(model, param_grid):
    """Fit ``model`` on the training split and report train/test performance.

    With a non-empty ``param_grid`` a 10-fold ``GridSearchCV`` scored by
    negative mean squared error selects the best estimator.  With an empty
    ``param_grid`` the model is fitted as given and a 10-fold
    ``cross_val_score`` supplies the cross-validation statistics instead.

    The function reads the module-level ``X_train``, ``y_train``, ``X_test``
    and ``y_test`` arrays and prints a short performance report.

    Returns
    -------
    tuple
        ``(model, test_score, grid_results)``: the fitted estimator, a Series
        holding the test-set ``'R2'`` and ``'MSE'``, and the ``cv_results_``
        table as a DataFrame (an empty list when no grid was searched).
    """
    if len(param_grid) == 0:
        # nothing to tune: fit the estimator as given, then estimate its CV error
        grid_results = []
        model = model.fit(X_train, y_train)
        fold_scores = cross_val_score(model,
                                      X_train,
                                      y_train,
                                      scoring='neg_mean_squared_error',
                                      cv=10)
        cv_mean = abs(np.mean(fold_scores))
        cv_std = np.std(fold_scores)
    else:
        # exhaustive search over the supplied grid
        search = GridSearchCV(estimator=model,
                              param_grid=param_grid,
                              scoring='neg_mean_squared_error',
                              cv=10,
                              verbose=1,
                              return_train_score=True)
        search.fit(X_train, y_train)
        # keep the refitted winner and remember which grid row it came from
        model = search.best_estimator_
        winner = search.best_index_
        grid_results = pd.DataFrame(search.cv_results_)
        # scores are negated MSE values, hence the abs() on the mean
        cv_mean = abs(grid_results.loc[winner, 'mean_test_score'])
        cv_std = grid_results.loc[winner, 'std_test_score']
        print("Best parameters found by grid search are:", search.best_params_)

    # evaluate the chosen model on both splits
    MSE_train_score = mean_squared_error(y_train, model.predict(X_train))
    MSE_test_score = mean_squared_error(y_test, model.predict(X_test))
    R2_train_score = model.score(X_train, y_train)
    R2_test_score = model.score(X_test, y_test)
    test_score = pd.Series({'R2': R2_test_score, 'MSE': MSE_test_score})

    # performance report
    separator = '----------------'
    print(separator)
    print(model)
    print(separator)
    report = (
        ("GridSearchCV train R2:  ", R2_train_score),
        ("GridSearchCV train MSE:  ", MSE_train_score),
        ("GridSearchCV test R2:  ", R2_test_score),
        ("GridSearchCV test MSE:  ", MSE_test_score),
    )
    for label, value in report:
        print(label, value)
    print('cross_val: mean=', cv_mean, ', std=', cv_std)

    return model, test_score, grid_results

In [5]:
def fold_n(n, fold_error):
    """Return the fraction of fold-error scores whose magnitude is below ``n``.

    Parameters
    ----------
    n : int or float
        Threshold; a score counts when ``abs(score) < n`` (strictly less).
    fold_error : pandas.DataFrame
        Table with a numeric ``"FE_score"`` column, one row per prediction.

    Returns
    -------
    float
        Share of rows within the threshold, or ``nan`` when the table has no
        rows (instead of dividing by zero).
    """
    # The builtin `sum` must be called, not subscripted; a boolean mask's mean
    # is exactly "number of hits / number of rows".
    within = fold_error["FE_score"].abs() < n
    if len(within) == 0:
        return float("nan")
    return float(within.mean())

In [6]:
def cross_validation(folds, lgb_reg):
    """Run shuffled K-fold cross-validation of a regressor on the full data set.

    Despite its name, ``lgb_reg`` may be any estimator exposing ``fit``,
    ``predict`` and ``score``.  The module-level arrays ``X`` and ``y`` are
    split with ``KFold(n_splits=folds, shuffle=True, random_state=42)`` and the
    same estimator instance is refitted on the training part of every split.

    Parameters
    ----------
    folds : int
        Number of splits.  The fold-error column is written to rows 0-2, so
        at least three splits are expected.
    lgb_reg : estimator
        Regressor to evaluate; it is refitted in place.

    Returns
    -------
    pandas.DataFrame
        One row per split with the columns ``train_mse``, ``test_mse``,
        ``train_r2``, ``test_r2``, ``train_mae``, ``test_mae``, ``train_rmse``
        and ``test_rmse`` (the ``*_r2`` values are whatever ``lgb_reg.score``
        returns), plus ``fold_error`` whose rows 0, 1 and 2 hold the share of
        held-out predictions within 2-, 3- and 4-fold error.
    """
    kf = KFold(n_splits=folds, shuffle=True, random_state=42)
    fold_rows = []     # one metrics record per split
    fold_errors = []   # fold-error ratio of every held-out prediction

    for train_index, test_index in kf.split(X):
        # split training and test set
        X_fit, X_held = X[train_index], X[test_index]
        y_fit, y_held = y[train_index], y[test_index]

        # train model
        lgb_reg.fit(X=X_fit, y=y_fit)

        # make predictions
        pred_fit = lgb_reg.predict(X_fit)
        pred_held = lgb_reg.predict(X_held)

        # error metrics, passed in the conventional (y_true, y_pred) order
        train_mse = mean_squared_error(y_fit, pred_fit)
        test_mse = mean_squared_error(y_held, pred_held)
        fold_rows.append({
            'train_mse': train_mse,
            'test_mse': test_mse,
            'train_r2': lgb_reg.score(X_fit, y_fit),
            'test_r2': lgb_reg.score(X_held, y_held),
            'train_mae': mean_absolute_error(y_fit, pred_fit),
            'test_mae': mean_absolute_error(y_held, pred_held),
            'train_rmse': train_mse ** 0.5,
            'test_rmse': test_mse ** 0.5,
        })

        # Fold error: larger value divided by the smaller one, per prediction.
        # np.where evaluates both quotients, so division warnings from the
        # branch that is not selected are silenced.
        # NOTE(review): the ratio is taken on y exactly as stored; if the
        # 'logVDss' target is log-transformed, as its name suggests, this is a
        # ratio of logs rather than a fold error on the original scale — confirm.
        with np.errstate(divide='ignore', invalid='ignore'):
            ratio = np.where(pred_held > y_held,
                             pred_held / y_held,
                             y_held / pred_held)
        fold_errors.extend(ratio.tolist())

    score_dict = pd.DataFrame(fold_rows)

    # Build the column as float up front; a bare .astype(float) call returns a
    # new object and has no effect unless its result is kept.
    FE_score = pd.DataFrame({"FE_score": np.asarray(fold_errors, dtype=float)})

    # calculate 2-, 3- and 4-fold error
    n_fold_error = [fold_n(m, FE_score) for m in (2, 3, 4)]

    # rows 0..2 carry the three fractions; remaining rows stay NaN
    score_dict.loc[0:2, 'fold_error'] = n_fold_error

    return score_dict

In [7]:
# baseline: fit an untuned SVR and keep the model and its scores for later comparison
model_name = 'SVR'
opt_models = {}
opt_models[model_name], origin_score, origin_results = train_model(model=SVR(), param_grid=[])

----------------
SVR()
----------------
GridSearchCV train R2:   0.8991572870290523
GridSearchCV train MSE:   0.03900747928646466
GridSearchCV test R2:   0.7152277420707409
GridSearchCV test MSE:   0.11188650724779094
cross_val: mean= 0.10738521149025235 , std= 0.017885336046569875


In [8]:
# hyperparameter optimization for kernel
# compares SVR's four built-in kernels; each name must match scikit-learn's
# spelling exactly ('sigmoid'), otherwise that candidate cannot be fitted
opt_models1 = dict()
opt_models1[model_name], score1, grid_results1 = train_model(SVR(), {'kernel': ['linear', 'rbf', 'poly', 'sigmoid']})

Fitting 10 folds for each of 4 candidates, totalling 40 fits
Best parameters found by grid search are: {'kernel': 'rbf'}
----------------
SVR()
----------------
GridSearchCV train R2:   0.8991572870290523
GridSearchCV train MSE:   0.03900747928646466
GridSearchCV test R2:   0.7152277420707409
GridSearchCV test MSE:   0.11188650724779094
cross_val: mean= 0.10738521149025235 , std= 0.017885336046569875


In [9]:
# hyperparameter optimization for degree
# NOTE(review): the estimator below uses SVR's default kernel, and scikit-learn
# documents `degree` as applying to the 'poly' kernel only.  The recorded output
# of this cell equals the untuned baseline scores, which is consistent with the
# search having no effect — confirm whether SVR(kernel='poly') was intended.
opt_models2 = dict()
opt_models2[model_name], score2, grid_results2 = train_model(SVR(), {'degree': np.arange(1, 10, 1)})

Fitting 10 folds for each of 9 candidates, totalling 90 fits
Best parameters found by grid search are: {'degree': 1}
----------------
SVR(degree=1)
----------------
GridSearchCV train R2:   0.8991572870290523
GridSearchCV train MSE:   0.03900747928646466
GridSearchCV test R2:   0.7152277420707409
GridSearchCV test MSE:   0.11188650724779094
cross_val: mean= 0.10738521149025235 , std= 0.017885336046569875


In [10]:
# tune gamma for the RBF kernel over 50 evenly spaced values in [0.001, 0.05]
opt_models3 = {}
opt_models3[model_name], score3, grid_results3 = train_model(
    model=SVR(kernel='rbf'),
    param_grid={'gamma': np.linspace(0.001, 0.05, num=50)},
)

Fitting 10 folds for each of 50 candidates, totalling 500 fits
Best parameters found by grid search are: {'gamma': 0.013000000000000001}
----------------
SVR(gamma=0.013000000000000001)
----------------
GridSearchCV train R2:   0.946555550628166
GridSearchCV train MSE:   0.02067311747601363
GridSearchCV test R2:   0.7368097470814301
GridSearchCV test MSE:   0.10340697634962955
cross_val: mean= 0.10091838564492481 , std= 0.017268624221596467


In [11]:
# tune C (integers 11..34) with gamma held at 0.013
opt_models4 = {}
opt_models4[model_name], score4, grid_results4 = train_model(
    model=SVR(gamma=0.013, kernel='rbf'),
    param_grid={'C': np.arange(11, 35)},
)

Fitting 10 folds for each of 24 candidates, totalling 240 fits
Best parameters found by grid search are: {'C': 16}
----------------
SVR(C=16, gamma=0.013)
----------------
GridSearchCV train R2:   0.9730075813253395
GridSearchCV train MSE:   0.010441073839879135
GridSearchCV test R2:   0.7586696877551795
GridSearchCV test MSE:   0.09481824503003129
cross_val: mean= 0.09539561599870147 , std= 0.018793953670501132


In [12]:
# joint search: C (integers 11..29) crossed with 20 gamma values in [0.001, 0.02]
opt_models5 = {}
opt_models5[model_name], score5, grid_results5 = train_model(
    model=SVR(kernel='rbf'),
    param_grid={
        'C': np.arange(11, 30),
        'gamma': np.linspace(0.001, 0.02, num=20),
    },
)

Fitting 10 folds for each of 380 candidates, totalling 3800 fits
Best parameters found by grid search are: {'C': 17, 'gamma': 0.012}
----------------
SVR(C=17, gamma=0.012)
----------------
GridSearchCV train R2:   0.9730952367148185
GridSearchCV train MSE:   0.01040716741581821
GridSearchCV test R2:   0.7586572604484132
GridSearchCV test MSE:   0.09482312769647744
cross_val: mean= 0.09523772148494879 , std= 0.019116916456969734


In [13]:
# coarse epsilon search: 10 values in [0.001, 0.5] with C=16 and gamma=0.013 fixed
opt_models6 = {}
opt_models6[model_name], score6, grid_results6 = train_model(
    model=SVR(gamma=0.013, C=16),
    param_grid={'epsilon': np.linspace(0.001, 0.5, num=10)},
)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best parameters found by grid search are: {'epsilon': 0.001}
----------------
SVR(C=16, epsilon=0.001, gamma=0.013)
----------------
GridSearchCV train R2:   0.993311820178232
GridSearchCV train MSE:   0.0025870886271864295
GridSearchCV test R2:   0.7677274903755286
GridSearchCV test MSE:   0.09125945069416418
cross_val: mean= 0.09225292003785175 , std= 0.01904133481982348


In [14]:
# finer epsilon search below the previous grid's minimum: 10 values in [0, 0.001]
opt_models7 = {}
opt_models7[model_name], score7, grid_results7 = train_model(
    model=SVR(gamma=0.013, C=16),
    param_grid={'epsilon': np.linspace(0, 0.001, num=10)},
)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best parameters found by grid search are: {'epsilon': 0.001}
----------------
SVR(C=16, epsilon=0.001, gamma=0.013)
----------------
GridSearchCV train R2:   0.993311820178232
GridSearchCV train MSE:   0.0025870886271864295
GridSearchCV test R2:   0.7677274903755286
GridSearchCV test MSE:   0.09125945069416418
cross_val: mean= 0.09225292003785175 , std= 0.01904133481982348


In [15]:
# 10-fold cross-validation of the tuned configuration on the full data set
score = cross_validation(
    folds=10,
    lgb_reg=SVR(C=16, epsilon=0.001, gamma=0.013),
)

In [16]:
# add score information before optimization and
# after hyperparameter optimization to the table
# (rows 0 and 1 of each column hold the test-set R2 and MSE, respectively)
score.loc[0:1, 'origin_score'] = [origin_score['R2'], origin_score['MSE']]
# score7 comes from SVR(C=16, gamma=0.013, epsilon=0.001) — the same
# configuration that was cross-validated to produce `score`; score5 belongs
# to a different (C=17, gamma=0.012) model.
score.loc[0:1, 'final_score'] = [score7['R2'], score7['MSE']]

In [17]:
# write the score table to an Excel workbook, leaving out the row index
score.to_excel(excel_writer="SVR_score.xlsx", index=False)

In [18]:
# average held-out R2 across the 10 cross-validation folds
score.loc[:, 'test_r2'].mean()

0.8063610882098289