In [None]:
from pathlib import Path

import numpy as np
from numpy import arange
from matplotlib import pyplot
import pandas as pd
from pandas import  set_option
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Load the table from data.csv; the first CSV column is used as the row index.
dataset = pd.read_csv(filepath_or_buffer="data.csv", index_col=0)
# Show the first five rows as the cell output.
dataset.head(n=5)

In [None]:
# Separate the feature columns from the 'Score' target, then hold out 30% of
# the rows as a test split (fixed seed so the split is repeatable).
X = dataset.drop(columns='Score')
Y = dataset['Score']
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=7,
)

In [None]:
# Candidate values for the randomized hyper-parameter search.
from pprint import pprint

# Number of trees: 60 evenly spaced values, 50, 100, ..., 3000.
n_estimators_range = [int(x) for x in np.linspace(start=50, stop=3000, num=60)]

# Features considered at each split. 1.0 means "use every feature", which is
# what the legacy 'auto' alias resolved to for RandomForestRegressor. The
# 'auto' string was deprecated in scikit-learn 1.1 and is rejected by 1.3+,
# so the explicit fraction keeps the same search space on old and new versions.
max_features_range = [1.0, 'sqrt']

# Maximum tree depth: 10, 20, ..., 500, plus None for unlimited depth.
max_depth_range = [int(x) for x in np.linspace(10, 500, num=50)]
max_depth_range.append(None)

min_samples_split_range = [2, 5, 10]
min_samples_leaf_range = [1, 2, 4, 8]

# Defined for convenience but currently not part of the search space below.
bootstrap_range = [True, False]

random_forest_hp_range = {
    'n_estimators': n_estimators_range,
    'max_features': max_features_range,
    'max_depth': max_depth_range,
    'min_samples_split': min_samples_split_range,
    'min_samples_leaf': min_samples_leaf_range,
    # 'bootstrap': bootstrap_range,
}
pprint(random_forest_hp_range)

In [None]:
# Coarse randomized search over random_forest_hp_range.
# The base forest gets its own seed: the search's random_state only controls
# which candidates are sampled, not the randomness inside each forest, so
# without it the reported best parameters can change from run to run.
random_forest_model_test_base = RandomForestRegressor(random_state=7)
random_forest_model_test_random = RandomizedSearchCV(
    estimator=random_forest_model_test_base,
    param_distributions=random_forest_hp_range,
    n_iter=200,       # number of sampled hyper-parameter combinations
    n_jobs=-1,        # use all available cores
    cv=3,
    verbose=1,
    random_state=5,
)
random_forest_model_test_random.fit(X_train, Y_train)

# Best combination found by the randomized search.
best_hp_now = random_forest_model_test_random.best_params_
pprint(best_hp_now)

In [None]:
# Fine grid search over a narrow set of hand-picked hyper-parameter values.
# Every entry is a list so all grid axes are written the same way.
random_forest_hp_range_2 = {
    'n_estimators': [60, 100, 200],
    'max_features': [10, 11],
    'max_depth': [300, 400, 450],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1, 2],
    # 'bootstrap': bootstrap_range,
}

# Seed the forest so candidates are compared under the same randomness and the
# selected parameters are repeatable across runs.
random_forest_model_test_2_base = RandomForestRegressor(random_state=7)
random_forest_model_test_2_random = GridSearchCV(
    estimator=random_forest_model_test_2_base,
    param_grid=random_forest_hp_range_2,
    cv=3,
    verbose=1,
    n_jobs=-1,
)
random_forest_model_test_2_random.fit(X_train, Y_train)

# Best combination found by the exhaustive grid search.
best_hp_now_2 = random_forest_model_test_2_random.best_params_
pprint(best_hp_now_2)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error

In [None]:
# Evaluation settings: fold count and the scoring names
# (negative MSE, R^2, negative MAE).
num_folds = 5
scoring1, scoring2, scoring3 = (
    'neg_mean_squared_error',
    'r2',
    'neg_mean_absolute_error',
)

In [None]:
# Cross-validated R^2 of the tuned random forest on the training split.
#
# The scaler is placed inside a Pipeline so that it is re-fitted on the
# training part of every fold; fitting it once on all of X_train beforehand
# would let each validation fold influence its own preprocessing.
# cross_val_score clones and fits the estimator itself, so no separate
# rf.fit(...) call is needed here (the evaluation cell below fits its own model).
rf = RandomForestRegressor(
    max_depth=300,
    max_features=11,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=60,
    random_state=7,   # fixed seed so the CV score is repeatable
)
cv_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', rf),
])
score = cross_val_score(cv_pipeline, X_train, Y_train, cv=num_folds, scoring=scoring2)
print(score.mean(), "+/-", score.std())

In [None]:
# Performance of the final model on the training set and on the test set.


def _print_metrics(split_name, y_true, y_pred):
    """Print MSE, MAE and R2 of `y_pred` against `y_true`, prefixed by `split_name`."""
    print('%s，MSE：%s' % (split_name, mean_squared_error(y_true, y_pred)))
    print('%s，MAE：%s' % (split_name, mean_absolute_error(y_true, y_pred)))
    print('%s，R2：%s' % (split_name, r2_score(y_true, y_pred)))


# Fit the scaler on the training split only, and scale that split once.
scaler = StandardScaler().fit(X_train)
rescaledX_train = scaler.transform(X_train)
rescaledX = rescaledX_train  # same array; name kept for backward compatibility

rf = RandomForestRegressor(
    max_depth=300,
    max_features=11,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=60,
    random_state=7,   # fixed seed so the reported metrics are repeatable
)
rf.fit(rescaledX_train, Y_train)

# Evaluate the training set
pre_train = rf.predict(rescaledX_train)
_print_metrics('Training_data', Y_train, pre_train)

# Evaluate the testing set (scaled with the statistics learned from training data)
rescaledX_test = scaler.transform(X_test)
pre_test = rf.predict(rescaledX_test)
_print_metrics('Testing_data', Y_test, pre_test)

In [None]:
# Save the actual and predicted scores of both splits as CSV files.
#
# OUTPUT_DIR resolves to the current user's Desktop instead of one hard-coded
# home directory, so the cell runs on any machine; change it here to write
# somewhere else.
OUTPUT_DIR = Path.home() / 'Desktop'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Training set: actual values
trainSt = pd.DataFrame(data=Y_train)
trainSt.to_csv(OUTPUT_DIR / 'RF_train.csv')
# Training set: predictions, indexed like Y_train so rows can be matched
# to the actual values by index rather than by position only.
pretrainSt = pd.DataFrame(data=pre_train, index=Y_train.index)
pretrainSt.to_csv(OUTPUT_DIR / 'RF_predictions_train.csv')

# Testing set: actual values
testSt = pd.DataFrame(data=Y_test)
testSt.to_csv(OUTPUT_DIR / 'RF_test.csv')
# Testing set: predictions, indexed like Y_test for the same reason.
pretestSt = pd.DataFrame(data=pre_test, index=Y_test.index)
pretestSt.to_csv(OUTPUT_DIR / 'RF_predictions_test.csv')