# Library

In [None]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import warnings 
# NOTE(review): with no category/module/message arguments this filter silences
# every warning for the rest of the session, which would also hide any
# deprecation notices raised by the libraries imported above. Consider
# narrowing it to the specific warning categories that are known to be noise.
warnings.filterwarnings('ignore')

# Data

In [None]:
# Load the Boston housing data as a dict-like object exposing the keys
# 'data', 'target' and 'feature_names'.
try:
    boston_data = datasets.load_boston()
except (ImportError, AttributeError):
    # load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, where
    # accessing it raises ImportError. Rebuild the same arrays from the
    # original StatLib file, following the recipe in scikit-learn's removal
    # notice: after a 22-line header, each record is spread over two physical
    # lines (11 values, then 2 more features followed by the target).
    # NOTE(review): scikit-learn dropped this dataset over ethical concerns and
    # points to fetch_california_housing as an alternative; consider migrating.
    raw_df = pd.read_csv("http://lib.stat.cmu.edu/datasets/boston",
                         sep=r"\s+", skiprows=22, header=None)
    boston_data = {
        'data': np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        'target': raw_df.values[1::2, 2],
        'feature_names': ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                          'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'],
    }

# Feature matrix with named columns, and the target as a one-column frame.
data = pd.DataFrame(boston_data['data'], columns=boston_data['feature_names'])
target = pd.DataFrame(boston_data['target'], columns=['Target'])
# Features and target side by side; both frames carry the default RangeIndex,
# so the column-wise concat lines rows up one to one.
df = pd.concat([data, target], axis=1)

In [None]:
# Preview the first five rows of the combined feature/target frame.
df.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


# Split data

In [None]:
# Separate the response column from the predictors.
y = df['Target']
X = df.drop(columns=['Target'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

# Random Forest + GridSearchCV

In [None]:
# Base estimator. The seed is pinned (same value as the train/test split) so
# that the bootstrap samples and feature sub-sampling of the forest -- and
# therefore the grid-search ranking and the reported scores -- are
# reproducible on a fresh kernel.
model = RandomForestRegressor(random_state=100)

# Hyper-parameter ranges to search (3 x 3 x 3 x 3 = 81 combinations).
rf_param = {'n_estimators': [50, 150, 200],
            'max_depth': [6, 8, 10],
            'min_samples_leaf': [1, 2, 4],
            'min_samples_split': [2, 4, 6]
            }

# Exhaustive search with 5-fold cross-validation on all available cores.
# The scorer is the *negative* RMSE, so scores closer to zero are better.
grid_rf = GridSearchCV(model, param_grid=rf_param, scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1)
grid_rf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'max_depth': [6, 8, 10], 'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 4, 6],
                         'n_estimators': [50, 150, 200]},
             scoring='neg_root_mean_squared_error')

In [None]:
# Tabulate the cross-validation results: one row per parameter combination.
result_df = pd.DataFrame(data=grid_rf.cv_results_)

In [None]:
# Show the parameter set, mean CV score and rank for the first five candidates.
result_df.loc[:, ['params', 'mean_test_score', 'rank_test_score']].head(n=5)

Unnamed: 0,params,mean_test_score,rank_test_score
0,"{'max_depth': 6, 'min_samples_leaf': 1, 'min_s...",-3.905743,52
1,"{'max_depth': 6, 'min_samples_leaf': 1, 'min_s...",-3.847476,36
2,"{'max_depth': 6, 'min_samples_leaf': 1, 'min_s...",-3.83656,33
3,"{'max_depth': 6, 'min_samples_leaf': 1, 'min_s...",-3.850183,38
4,"{'max_depth': 6, 'min_samples_leaf': 1, 'min_s...",-3.834834,32


In [None]:
# Report the winning parameter combination and its mean CV score
# (negative RMSE, as configured in the grid search).
print(f"best params: {grid_rf.best_params_}")
print(f"best score: {grid_rf.best_score_}")

best params: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
best score: -3.7214146768950385


# Prediction

In [None]:
# Predict on the held-out test set with the fitted grid search.
pred = grid_rf.predict(X_test)

# Root mean squared error on the test set.
rmse = np.sqrt(MSE(y_true=y_test, y_pred=pred))
print(f"RMSE : {rmse}")

RMSE : 3.2769292854801346
