In [1]:
# -*- coding: utf-8 -*-
# learning
# author: Cheng Zheng

import os
import numpy as np
import pandas as pd

# Machine learning algorithms
from sklearn.kernel_ridge import KernelRidge #Kernel ridge regression

# Cross-validation
'''
GridSearchCV = Exhaustive search over the given parameter grid. It finds the best-scoring
                parameter combination within the specified ranges (not outside them).
                1. search for the best parameters for the model;
                2. automatically fit a new model on the training dataset with the parameters that
                    yield the best cross-validation performance.
cross_val_score = to implement cross-validation in scikit-learn.
KFold: just to divide the dataset.
    - When an integer is passed to the cv parameter of cross_val_score():
        cv=int (same as cv=StratifiedKFold(n_splits=int)) is used if the estimator is a classifier 
        and y is either binary or multiclass; In all other cases, KFold is used.
'''
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold

# Locate the preprocessed workbook relative to the kernel's working directory.
# NOTE: this only works when the kernel is started from the notebook's own
# folder, because the data folder is resolved as a sibling of that directory.
path = os.getcwd()  # current working directory
path_up1Dir = os.path.dirname(path)  # parent of the working directory
# os.path.join assembles the path portably instead of concatenating '/' by hand.
dataset = pd.read_excel(
    os.path.join(path_up1Dir, 'x_TotalArea_y_MVPA', 'x_TotalArea_y_MVPA.xlsx')
)  # preprocessed dataset loaded into a DataFrame

# print(dataset)

In [2]:
# Pull the predictor ('X') and response ('Y') columns out of the sheet, and keep
# the first four columns aside in a frame of their own.
X, y = dataset['X'], dataset['Y']
other = pd.DataFrame(dataset.iloc[:, 0:4])
# print(X, y, other)

In [3]:
# Estimators that take a 2-D feature matrix need the single feature laid out as
# one column with one row per sample.
X_ = np.reshape(X.values, (-1, 1))

In [4]:
# Kernel ridge regression
'''
(same as svr)
# kernel = Kernel mapping used internally. This parameter is directly passed to pairwise_kernel. 
    If kernel is a string, it must be one of the metrics in pairwise.PAIRWISE_KERNEL_FUNCTIONS. 
    If kernel is “precomputed”, X is assumed to be a kernel matrix. 
    Alternatively, if kernel is a callable function, it is called on each pair of instances (rows) 
        and the resulting value recorded. 
    The callable should take two rows from X as input and return the corresponding kernel value as a single number. 
# gamma = Gamma parameter for the RBF, laplacian, polynomial, exponential chi2 and sigmoid kernels. 
    Interpretation of the default value is left to the kernel; see the documentation for sklearn.metrics.pairwise. 
    Ignored by other kernels.

(same as ridge)
# alpha = Regularization strength; must be a positive float. 
    Regularization improves the conditioning of the problem and reduces the variance of the estimates. 
    Larger values specify stronger regularization. 
    Alpha corresponds to 1 / (2C) in other linear models such as LogisticRegression or LinearSVC. 
    If an array is passed, penalties are assumed to be specific to the targets. 
    Hence they must correspond in number.
'''

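'''
# The KernelRidge parameters to tune are kernel, gamma and alpha. For GridSearchCV we have to define
    param_grid, i.e. which candidate values of each parameter are searched to find the best model.
# np.logspace creates a geometric sequence whose start and end points are given as powers of 10,
    e.g. logspace(-2,1,4) is the 4-element geometric sequence from 10^-2 to 10^1 (0.01, 0.1, 1, 10).
'''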
# Candidate hyper-parameters for KernelRidge. Each dict is searched on its own,
# so gamma is only varied together with the RBF kernel; the linear grid omits it.
# NOTE(review): the recorded run below selected alpha=100.0, the largest value in
# this grid, so the optimum may lie beyond the searched range - consider widening it.
param_grid_KernelRidge = [
    {
        'kernel': ['rbf'],
        'gamma': np.logspace(-2, 2, 5),   # 5 values, 1e-2 ... 1e2
        'alpha': np.logspace(-3, 2, 20),  # 20 values, 1e-3 ... 1e2
    },
    {
        'kernel': ['linear'],
        'alpha': np.logspace(-3, 2, 20),  # 20 values, 1e-3 ... 1e2
    },
]

# Exhaustive grid search with 10-fold cross-validation.
# scoring is spelled out so the meaning of every "score" printed below is
# unambiguous: it is R^2, which is also what the estimator's default score gives.
# R^2 = 1 is a perfect fit, 0 is as good as always predicting the mean of y, and
# negative values mean the model does worse than that constant prediction.
kr = GridSearchCV(
    estimator=KernelRidge(),            # algorithm: kernel ridge regression
    param_grid=param_grid_KernelRidge,  # list of dicts, each searched separately
    scoring='r2',                       # explicit metric, same as the default here
    cv=10,                              # 10 folds
)

# Run the search on the full dataset.
kr.fit(X_, y)

# Report the best parameters, the cross-validation scores, the best estimator and its index.
print("\n------------------ Kernel Ridge Regression Model")
print("Best parameter: {}".format(kr.best_params_))
print("Best cross-validation score: {:.2f}".format(kr.best_score_))
print("Average score in 10-Fold: \n", kr.cv_results_['mean_test_score'])
print("Std score in 10-Fold: \n", kr.cv_results_['std_test_score'])
print("Best estimator: {}".format(kr.best_estimator_))
print("The Index of Best estimator: {}".format(kr.best_index_))


------------------ Kernel Ridge Regression Model
Best parameter: {'alpha': 100.0, 'kernel': 'linear'}
Best cross-validation score: -0.16
Average score in 10-Fold: 
 [-0.19240391 -0.19635394 -0.21763105 -0.31611868 -0.62420703 -0.19204744
 -0.19395708 -0.21539053 -0.31050308 -0.55224789 -0.19184661 -0.1893635
 -0.21312702 -0.29991644 -0.51572486 -0.1918302  -0.18585448 -0.21095194
 -0.28834158 -0.49073462 -0.19195999 -0.1850942  -0.20893572 -0.27840291
 -0.46942338 -0.19219947 -0.1860981  -0.20709903 -0.2706934  -0.44924276
 -0.1925013  -0.18755016 -0.20544648 -0.2642576  -0.4288102  -0.19278801
 -0.18915885 -0.20393958 -0.25793691 -0.40720226 -0.19299456 -0.19100033
 -0.20248363 -0.2511713  -0.38453595 -0.19309864 -0.1927512  -0.20096786
 -0.24398556 -0.36203686 -0.19309934 -0.19391819 -0.19925489 -0.23665063
 -0.34078871 -0.1929869  -0.19431462 -0.19714036 -0.22938701 -0.32071406
 -0.19273284 -0.19408751 -0.19439767 -0.22224438 -0.30092929 -0.19229979
 -0.19348302 -0.19097613 -0.2151

In [7]:
# Re-check the selected configuration with a separate 10-fold cross-validation.
# alpha=100.0 and kernel='linear' are the best parameters reported in the grid
# search output above.
#
# How to read the numbers: the metric is R^2 (made explicit via scoring='r2';
# it is also what a regressor's default score returns). R^2 = 1 is a perfect fit,
# 0 equals always predicting the mean of y, and a negative value means the model
# is worse than that constant prediction. The negative values are therefore real
# results - do NOT simply drop the sign. Only scikit-learn's 'neg_*' error
# scorers (e.g. 'neg_mean_squared_error') are sign-flipped errors where removing
# the sign recovers the error itself.
# Background reading cited by the original note: https://zhuanlan.zhihu.com/p/369330147
score = cross_val_score(
    estimator=KernelRidge(alpha=100.0, kernel='linear'),
    X=X_,
    y=y,
    scoring='r2',  # explicit metric, same as the default for this estimator
    cv=10,         # 10 folds
)
print(score)

[-5.34259746e-03 -7.40699745e-01 -2.14089043e-01 -2.10426512e-01
 -3.03152261e-01 -4.40012416e-02 -2.42807078e-02 -1.28952663e-03
 -1.18850967e-04 -1.91348385e-02]


In [8]:
# Train the final model on the whole dataset with the selected hyper-parameters.
# Note: this rebinds kr, which held the GridSearchCV object, to a plain
# KernelRidge estimator.
kr = KernelRidge(kernel='linear', alpha=100.0).fit(X=X_, y=y)