### After-class reading
#### Boston Home Prices Prediction and Evaluation
https://www.ritchieng.com/machine-learning-project-boston-home-prices/

In [1]:
# Notebook-wide setup: plotting backend, shared imports, figure style and
# warning policy used by the cells below.
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Apply the 'bmh' matplotlib style sheet to every figure drawn afterwards.
plt.style.use('bmh')
import warnings
# Silence all warnings for the rest of the session. Note that this also hides
# deprecation notices emitted by the libraries used below.
warnings.simplefilter('ignore')

In [2]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Baseline: fit a small random forest and score it on a single hold-out split.
# NOTE(review): load_boston is deprecated in recent scikit-learn releases --
# confirm the installed version still provides it.
X, y = load_boston(return_X_y=True)
print(X.shape, y.shape)

# random_state pins the train/test partition (default test fraction is used).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Seed the forest too: bootstrap sampling and feature sub-sampling are random,
# so without random_state the metrics below differ on every run even though
# the split itself is fixed.
rf = RandomForestRegressor(n_estimators=10, random_state=0)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# RMSE is reported (square root of MSE) so the error is in the target's units.
print(f"R^2 score = {r2_score(y_test, y_pred):.3f}")
print(f"RMSE = {np.sqrt(mean_squared_error(y_test, y_pred)):.3f}")

(506, 13) (506,)
R^2 score = 0.771
RMSE = 4.329


In [3]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search for the random forest using 5-fold cross-validation.
# The search is fitted on the full dataset, not on the earlier train split.
X, y = load_boston(return_X_y=True)

# Single-step pipeline; the step name 'rf' is the prefix of the grid keys below.
# random_state makes every candidate forest deterministic, so best_score_ and
# best_params_ are reproducible across runs.
estimator = Pipeline([
    ('rf', RandomForestRegressor(random_state=0)),
])

# 3 * 4 * 4 = 48 parameter combinations, each fitted once per fold.
param_grid = {
    'rf__n_estimators': [5, 10, 20],
    'rf__max_depth': [5, 10, 20, 30],
    'rf__min_samples_split': [2, 5, 10, 20],
}

# The former `iid=False` argument is intentionally not passed: it was
# deprecated in scikit-learn 0.22 (where that behaviour became the default)
# and removed in 0.24, where passing it raises TypeError.
# NOTE(review): an integer cv on a regressor gives unshuffled folds; if the
# rows are ordered this may understate the score -- consider a shuffled KFold.
grid = GridSearchCV(estimator, param_grid,
                    scoring='r2',
                    n_jobs=-1,
                    cv=5)

grid.fit(X, y)

print(f"Best score: {grid.best_score_:.3f}")
print(f"Best parameters: {grid.best_params_}")
# Timings are averaged over all candidates; mean_score_time is the time spent
# scoring each candidate on its validation fold.
print(f"Avg. time to fit: {grid.cv_results_['mean_fit_time'].mean():.3f}")
print(f"Avg. time to predict: {grid.cv_results_['mean_score_time'].mean():.3f}")

Best score: 0.634
Best parameters: {'rf__max_depth': 10, 'rf__min_samples_split': 5, 'rf__n_estimators': 20}
Avg. time to fit: 0.023
Avg. time to predict: 0.001
