In [1]:
import os
import numpy as np
import pandas as pd

# to later draw the forest
import graphviz 
from sklearn import tree

# Machine learning algorithms
from sklearn.ensemble import RandomForestRegressor #Random Forest

# Cross-validation
'''
GridSearchCV = Perform an optimization of the parameters. 可以保证在指定的参数范围内找到精度最高的参数
                1. search for the best parameters for model; 
                2. automatically fit a new model on the training dataset w/ the parameters that 
                    can yield the best cross-validation performance.
cross_val_score = to implement cross-validation in scikit-learn.
KFold: just to divide the dataset.
    - When an integer is passed to the cv parameter of cross_val_score():
        cv=int (same as cv=StratifiedKFold(n_splits=int)) is used if the estimator is a classifier 
        and y is either binary or multiclass; In all other cases, KFold is used.
'''
from sklearn.model_selection import GridSearchCV, KFold

path = os.getcwd()#get current path
path_up1Dir = os.path.dirname(path)#go up one directory
dataset = pd.read_excel(path_up1Dir +'/x_TotalArea_y_MVPA/x_TotalArea_y_MVPA.xlsx')#to import the preprocessed dataset into a variable

print(dataset)

     Unnamed: 0  Subject  Vision Surface         X         Y
0             0        1    Open    Firm -0.922794  0.702471
1             1        1  Closed    Firm -0.929073  0.702471
2             2        1    Open    Foam -0.673078  0.702471
3             3        1  Closed    Foam -0.453918  0.702471
4             4        2    Open    Firm -0.941319 -0.368982
..          ...      ...     ...     ...       ...       ...
639         639      162    Open    Foam  0.332269 -0.287135
640         640      163  Closed    Firm -0.806407 -0.517795
641         641      163    Open    Foam -0.112514 -0.517795
642         642      163    Open    Firm -0.797555 -0.517795
643         643      163  Closed    Foam -0.049677 -0.517795

[644 rows x 6 columns]


In [2]:
X = dataset['X']
y = dataset['Y']
other = pd.DataFrame(dataset.iloc[:,:4])
print(X, y, other)

0     -0.922794
1     -0.929073
2     -0.673078
3     -0.453918
4     -0.941319
         ...   
639    0.332269
640   -0.806407
641   -0.112514
642   -0.797555
643   -0.049677
Name: X, Length: 644, dtype: float64 0      0.702471
1      0.702471
2      0.702471
3      0.702471
4     -0.368982
         ...   
639   -0.287135
640   -0.517795
641   -0.517795
642   -0.517795
643   -0.517795
Name: Y, Length: 644, dtype: float64      Unnamed: 0  Subject  Vision Surface
0             0        1    Open    Firm
1             1        1  Closed    Firm
2             2        1    Open    Foam
3             3        1  Closed    Foam
4             4        2    Open    Firm
..          ...      ...     ...     ...
639         639      162    Open    Foam
640         640      163  Closed    Firm
641         641      163    Open    Foam
642         642      163    Open    Firm
643         643      163  Closed    Foam

[644 rows x 4 columns]


In [4]:
X_ = X.values.reshape(-1, 1) # some algorithms need to reshape X if X has a single feature

In [5]:
'''
# n_splits = how many folds; 
# shuffle = True: to shuffle the data instead of stratifying the folds; 
                    to remove the ordering of the samples by label 
                    since each fold corresponds to one of the classes in the dataset which results to nothing can be learned.
# random_state = 0: to fix the random_state to get a reproducible shuffling;
                    otherwise, each run can yield a different result since each time different split would be used.
'''
kfold = KFold(n_splits=10, shuffle=True, random_state=0) 

In [15]:
# Random Forest
'''
Why RandomForestRegressor not RandomForestClassifier?
    - first, there is a huge difference between classifiers and regressors. 
        Classifiers predict a set of specified labels. 
            e.g. Email Spam Detection, where to classify whether an email is either spam (1) or not spam(0) . 
        Regressors predict some value, which could be almost anything. 
            (predict real valued outputs which vary and dont require outputs predicted to be in a fixed set)
            e.g. Predicting the runs scored by a team in a cricket match.
    - in our case, we want to use data to predict MVPA_minutes.week from any of the four Total Area (cm²) average scores.
    - so, we are predicting a quantity instead of a label, which means we should use regressor.
'''

'''
The main parameters to adjust when using these methods is n_estimators and max_features.
Another important parameter is max_depth.
# n_estimators = the number of trees in the forest. 
                The larger the better, but also the longer it takes to compute.
# max_features = the number of features that are selected, aka how random each tree is.
                The lower the greater the reduction of variance, but also the greater the increase in bias.
                range = (0,1]
# max_depth = 可以生成多少层叶子, to reduce the complexity of each tree to prevent overfitting, often not deeper than five splits
# bootstrap = True: 使用自助采样法; False: 使用整个数据集.
'''

# Set param_grid, aka the main parameters in RandomForestRegressor
param_grid_RandomForestRegressor = {
    'n_estimators':np.arange(10,100,10), # 从10到100，每隔10取一个
    'max_features':np.arange(0.1,1,0.1), # 从0.1到1，每隔0.1取一个
    'max_depth':np.arange(1,10),
    'bootstrap':[True,False]
}

# GridSearchCV
rfg = GridSearchCV(estimator=RandomForestRegressor(), # algorithm - Random Forest Regressor
                    param_grid=param_grid_RandomForestRegressor, # specify the parameters to search over using a dict or list of dictionaries
                    cv=kfold # 10-Fold & this is not a classifier
                    )

# Build the model
rfg.fit(X_, y)

# Output the best parameter, cross-validation score, estimator, and the index of best estimator.
print("\n------------------ RandomForestRegressor Model")
print("Best parameter: {}".format(rfg.best_params_))
print("Best cross-validation score: {:.2f}".format(rfg.best_score_))
print("Best estimator: {}".format(rfg.best_estimator_))
print("The Index of Best estimator: {}".format(rfg.best_index_))


------------------ RandomForestRegressor Model
Best parameter: {'bootstrap': True, 'max_depth': 2, 'max_features': 0.8, 'n_estimators': 30}
Best cross-validation score: -0.02
Best estimator: RandomForestRegressor(max_depth=2, max_features=0.8, n_estimators=30)
The Index of Best estimator: 146


In [16]:
# Use the above optimal parameters to build new model
rfg = RandomForestRegressor(max_depth=2, max_features=0.8, n_estimators=30).fit(X_, y)

In [17]:
# Draw the Random Forest
rfg_tree = rfg.estimators_[0] # Draw the first tree

## Export a decision tree w/ color in DOT format.
dot_data = tree.export_graphviz(decision_tree=rfg_tree, # decision tree classifier
                               filled=True, # True: paint nodes to indicate majority class for classification, extremity of values for regression, or purity of node for multi-output
                               rounded=True, # True: draw node boxes with rounded corners and use Helvetica fonts instead of Times-Roman
                               special_characters=True # True: do not ignore special characters for PostScript compatibility
                               )

## Output the graph
graph = graphviz.Source(dot_data)
graph.render(filename='x_TotalArea_y_MVPA_RandomForest', 
            directory=path_up1Dir +'/x_TotalArea_y_MVPA')

'/Users/zclalala/Documents/GitHub/project-posture/x_TotalArea_y_MVPA/x_TotalArea_y_MVPA_RandomForest.pdf'