In [1]:
import os
import numpy as np
import pandas as pd

# to later draw the tree
import graphviz 
from sklearn import tree

# Machine learning algorithms
from sklearn.ensemble import RandomForestRegressor #Random Forest

# Cross-validation
'''
GridSearchCV = exhaustive search over a specified parameter grid; it is guaranteed to find the
               best-scoring parameter combination within that grid.
                1. searches for the best parameters for the model;
                2. automatically fits a new model on the training dataset w/ the parameters that
                    yield the best cross-validation performance.
cross_val_score = runs cross-validation in scikit-learn and returns the score of each fold.
PS: When an integer is passed to the cv parameter of cross_val_score():
        cv=int (same as cv=StratifiedKFold(n_splits=int)) is used if the estimator is a classifier
        and y is either binary or multiclass; in all other cases, KFold is used.
        i.e. for the regressor used here, cv=10 is converted to KFold(n_splits=10),
        not to StratifiedKFold(n_splits=10).
'''
from sklearn.model_selection import GridSearchCV, cross_val_score

# Locate the preprocessed workbook relative to the parent of the current
# working directory, then load it into a DataFrame.
path = os.getcwd()
path_up1Dir = os.path.dirname(path)
dataset_file = path_up1Dir +'/x_TotalArea_y_MVPA/x_TotalArea_y_MVPA.xlsx'
dataset = pd.read_excel(dataset_file)

In [2]:
# Predictor and target columns, selected by name.
X, y = dataset['X'], dataset['Y']

# Keep the first four columns of the sheet in a separate frame.
other = pd.DataFrame(dataset.iloc[:, 0:4])

In [3]:
# scikit-learn estimators expect a 2-D feature matrix, so turn the single
# predictor column into an array of shape (n_samples, 1).
X_ = np.reshape(X.values, (-1, 1))

In [4]:
# Random Forest
#
# Why RandomForestRegressor and not RandomForestClassifier?
#   - Classifiers predict one of a fixed set of labels,
#     e.g. email spam detection: spam (1) or not spam (0).
#   - Regressors predict a real-valued quantity that is not restricted to a
#     fixed set, e.g. the runs scored by a team in a cricket match.
#   - Here we predict MVPA_minutes.week from any of the four Total Area (cm²)
#     average scores, i.e. a quantity rather than a label, so a regressor is
#     the right tool.
#
# Main hyper-parameters to tune:
#   n_estimators: number of trees in the forest. More is generally better,
#                 but takes longer to compute.
#   max_features: how many features are considered at each split, i.e. how
#                 random each tree is; a float in (0, 1] is a fraction of the
#                 features. Lower values reduce variance more but increase bias.
#   max_depth:    maximum depth of each tree. Limits the complexity of each
#                 tree to prevent overfitting; often no deeper than five splits.
#   bootstrap:    True = train each tree on a bootstrap sample;
#                 False = train each tree on the whole dataset.

# Fixed seed so that repeated runs of the search give the same result.
RANDOM_STATE = 42

# A float max_features is converted to max(1, int(max_features * n_features)),
# so with a single feature every fraction selects that one feature. Searching
# ten equivalent values would only multiply the run time by ten, so the grid
# is collapsed in that case. np.linspace is used instead of np.arange because
# a float step can accumulate rounding error at the end point.
n_features = X_.shape[1]
max_features_grid = [1.0] if n_features == 1 else np.linspace(0.1, 1.0, 10)

# Set param_grid, aka the main parameters in RandomForestRegressor
param_grid_RandomForestRegressor = {
    'n_estimators': np.arange(10, 101, 10),  # 10, 20, ..., 100
    'max_features': max_features_grid,
    'max_depth': np.arange(1, 10),  # 1, 2, ..., 9
    'bootstrap': [True, False],
}

# GridSearchCV
rfg = GridSearchCV(
    estimator=RandomForestRegressor(random_state=RANDOM_STATE),  # algorithm - Random Forest Regressor
    param_grid=param_grid_RandomForestRegressor,  # parameters to search over
    cv=10,  # 10-Fold
    n_jobs=-1,  # evaluate candidates in parallel on all available cores
)

# Build the model, aka training the dataset
rfg.fit(X_, y)

# Output the best parameter, cross-validation score, estimator, and the index of best estimator.
print("\n------------------ RandomForestRegressor Model")
print("Best parameter: {}".format(rfg.best_params_))
print("Best cross-validation score: {:.2f}".format(rfg.best_score_))
print("Average score in 10-Fold: \n", rfg.cv_results_['mean_test_score'])
print("Std score in 10-Fold: \n", rfg.cv_results_['std_test_score'])
print("Best estimator: {}".format(rfg.best_estimator_))
print("The Index of Best estimator: {}".format(rfg.best_index_))

KeyboardInterrupt: 

In [6]:
# Note on reading the scores:
# For a regressor, cross_val_score defaults to the estimator's own score
# method, which is R² (closer to 1 is better). It is requested explicitly
# below so the metric is visible in the code. A negative R² means the model
# predicts worse than always predicting the mean of y, so the sign must NOT
# simply be dropped. Only sklearn's error-based scorers (the "neg_*" family,
# e.g. neg_mean_squared_error) are negated errors, where closer to 0 is better
# and removing the sign recovers the usual error value.
# Background reading: https://zhuanlan.zhihu.com/p/369330147

# 10-Fold Cross-validation to check its accuracy again
score = cross_val_score(
    estimator=RandomForestRegressor(
        max_depth=2,
        max_features=0.8,
        n_estimators=30,
        random_state=42,  # fixed seed so the fold scores are reproducible
    ),
    X=X_, y=y,
    scoring='r2',
    cv=10,
)
print(score)

[-0.02425016  0.00810475 -0.02313557 -0.04395725 -0.03029508 -0.01545422
 -0.05264693 -0.03870775 -0.0228591  -0.0155182 ]


In [7]:
# Use the above optimal parameters to build a new model, aka training on the
# whole dataset. The seed is fixed so the fitted forest (and therefore the
# tree drawn from it) is the same on every run.
rfg = RandomForestRegressor(
    max_depth=2,
    max_features=0.8,
    n_estimators=30,
    random_state=42,
).fit(X_, y)

In [8]:
# Visualise one member of the forest: its first fitted tree.
rfg_tree = rfg.estimators_[0]

# Serialise that regression tree to Graphviz DOT source.
dot_data = tree.export_graphviz(
    rfg_tree,
    filled=True,  # colour nodes; for regression this reflects the extremity of values
    rounded=True,  # draw node boxes with rounded corners
    special_characters=True,  # keep special characters instead of stripping them for PostScript
)

# Render the DOT source next to the dataset; render() returns the path of the
# file it wrote, which the notebook shows as the cell output.
graph = graphviz.Source(dot_data)
output_dir = path_up1Dir +'/x_TotalArea_y_MVPA'
graph.render(filename='x_TotalArea_y_MVPA_RandomForest', directory=output_dir)

'/Users/zclalala/Documents/GitHub/project-posture/x_TotalArea_y_MVPA/x_TotalArea_y_MVPA_RandomForest.pdf'