In [4]:
import os
import numpy as np
import pandas as pd

# Machine learning algorithms
from sklearn.svm import SVR #Support vector machine

# Cross-validation
'''
GridSearchCV = exhaustive search over a parameter grid. Within the specified parameter ranges it is
               guaranteed to find the best-scoring combination:
                1. searches for the best parameters for the model;
                2. automatically fits a new model on the training dataset with the parameters that
                    yield the best cross-validation performance.
cross_val_score = runs cross-validation for an estimator in scikit-learn.
PS: When an integer is passed to the cv parameter of cross_val_score():
        cv=int (same as cv=StratifiedKFold(n_splits=int)) is used if the estimator is a classifier
        and y is either binary or multiclass; in all other cases, KFold is used.
        i.e. for a regressor, cv=10 is automatically converted to KFold(n_splits=10),
        not to StratifiedKFold(n_splits=10).
'''
from sklearn.model_selection import GridSearchCV, cross_val_score

# Locate the preprocessed workbook relative to where the notebook is running:
# it sits in a sibling folder one level above the current working directory.
path = os.getcwd()
path_up1Dir = os.path.dirname(path)

# Load the preprocessed dataset from the Excel workbook into a DataFrame.
dataset = pd.read_excel(f"{path_up1Dir}/x_TotalArea_y_MVPA/x_TotalArea_y_MVPA.xlsx")

# print(dataset)

In [2]:
# Pull the predictor ('X') and response ('Y') columns out of the loaded sheet.
X, y = (dataset[column] for column in ('X', 'Y'))

# Keep the leading columns (positions 0-3) in their own frame; the slice simply
# returns fewer columns if the sheet has fewer than four.
other = pd.DataFrame(dataset.iloc[:, 0:4])
# print(X, y, other)

In [3]:
# scikit-learn estimators expect a 2-D feature matrix, so turn the single
# feature column into an (n_samples, 1) array.
X_ = X.to_numpy().reshape(-1, 1)

In [None]:
# Support Vector Machine (regression)
#
# Hyperparameters relevant to this search:
#   kernel - kernel type used by the algorithm. Must be one of 'linear', 'poly', 'rbf',
#            'sigmoid', 'precomputed' or a callable; 'rbf' is used when none is given.
#   C      - regularization parameter. The strength of the regularization is inversely
#            proportional to C; it must be strictly positive. The penalty is a squared
#            l2 penalty.
#   gamma  - not part of the grid below, so it stays at the SVR default.

# Candidate values for the search: two kernels crossed with C = 1 .. 99.
param_grid_SVR = dict(kernel=("linear", "rbf"), C=range(1, 100))

# Exhaustive grid search around a default SVR, scoring every candidate with
# 10-fold cross-validation.
svr = GridSearchCV(SVR(), param_grid_SVR, cv=10)

# Run the search on the single-feature matrix and the target.
svr.fit(X_, y)

# Report the winning parameters, its cross-validation score, the per-candidate
# mean/std scores, the refitted estimator and its index in the results table.
fold_stats = svr.cv_results_
print("\n------------------ SVR Model")
print(f"Best parameter: {svr.best_params_}")
print(f"Best cross-validation score: {svr.best_score_:.2f}")
print("Average score in 10-Fold: \n", fold_stats['mean_test_score'])
print("Std score in 10-Fold: \n", fold_stats['std_test_score'])
print(f"Best estimator: {svr.best_estimator_}")
print(f"The Index of Best estimator: {svr.best_index_}")

In [None]:
# Scoring note (see https://zhuanlan.zhihu.com/p/369330147): machine learning defines some
# metrics differently from classical statistical modelling, and scikit-learn's conventions
# can look odd. For R^2, a score closer to 1 is better; for the other (error) metrics,
# closer to 0 is better - scikit-learn reports those negated, so just drop the minus sign.
#
# 10-Fold Cross-validation to check its accuracy again.
# The model under test must be the SVR tuned by the grid search, so rebuild it from the
# tuned hyperparameters. `svr` is the fitted GridSearchCV when the notebook is run top to
# bottom; if a later cell has already rebound `svr` to a plain SVR, read the same
# hyperparameters back from that estimator so this cell can be re-run safely.
tuned_svr_params = (
    dict(svr.best_params_)
    if hasattr(svr, "best_params_")
    else {name: svr.get_params()[name] for name in param_grid_SVR}
)

# One score per fold; with no `scoring` argument the estimator's own score method is used.
score = cross_val_score(estimator=SVR(**tuned_svr_params),
                        X=X_, y=y,
                        cv=10
                       )
print(score)

In [None]:
# Use the above optimal parameters to build a new model, aka training on the full dataset.
# The final model has to be an SVR configured with the hyperparameters selected by the
# grid search. Read them before `svr` is rebound: on a first run `svr` is the fitted
# GridSearchCV; on a re-run of this cell it is already the final SVR, so the same
# hyperparameters are taken from the estimator itself.
best_svr_params = (
    dict(svr.best_params_)
    if hasattr(svr, "best_params_")
    else {name: svr.get_params()[name] for name in param_grid_SVR}
)

# From here on `svr` is the fitted SVR regressor (no longer the search object).
svr = SVR(**best_svr_params).fit(X_, y)

In [None]:
# Draw the SupportVectorMachine