# 6. MODEL TUNING AND CHOICE

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Reload the train/test splits that were persisted with IPython's %store magic
# (presumably by an earlier notebook in this series — verify they exist before running).
# Keep this comment on its own line: text after a line magic is parsed as its arguments.
%store -r X_train X_test y_train y_test

### 6.1 HYPERPARAMETER TUNING

###### 6.1.1 RANDOM FOREST REGRESSION

In [3]:
# Hyperparameter search space for the random forest, as two independent sub-grids
# (GridSearchCV evaluates every combination inside each dict: 15 + 6 = 21 candidates).
param_grid = [
    # Sub-grid 1: default bootstrap sampling, varying forest size and features per split.
    {'n_estimators': [1, 10, 100], 'max_features': [2, 4, 6, 8, 10]},
    # Sub-grid 2: bootstrap disabled. `bootstrap=True` is already the estimator default,
    # so pinning it to True here only re-fitted candidates sub-grid 1 already covers
    # (e.g. n_estimators=10 with max_features 2 or 4) without exploring anything new.
    {'bootstrap': [False], 'n_estimators': [2, 10], 'max_features': [2, 3, 4]},
]

In [4]:
# Fix the seed so the forest's random sampling, and therefore the grid-search ranking
# and the scores reported below, are reproducible across kernel restarts.
rf_model = RandomForestRegressor(random_state=42)

In [5]:
# Exhaustive search over `param_grid` with 3-fold cross-validation on 3 parallel workers.
# The score is the negated MSE, so "higher is better" holds for model selection.
rf_grid = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=3,
    verbose=2,
)

In [6]:
# Run the search on the training split; the trailing `;` hides the estimator repr.
rf_grid.fit(X=X_train, y=y_train);

Fitting 3 folds for each of 21 candidates, totalling 63 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:   38.1s
[Parallel(n_jobs=3)]: Done  63 out of  63 | elapsed:  1.2min finished


In [7]:
# The search was scored with 'neg_mean_squared_error', so flip the sign to get the
# best cross-validated MSE and take the square root to express it as an RMSE.
best_cv_mse = -rf_grid.best_score_
np.sqrt(best_cv_mse)

51021.83802737542

### 6.2 ANALYZING MODEL ERRORS

In [8]:
# One row per training column, holding the best estimator's importance for that feature
# in a single column labelled 0; displayed from least to most important.
feature_importances = pd.DataFrame(
    rf_grid.best_estimator_.feature_importances_,
    index=X_train.columns,
)
feature_importances.sort_values(by=0)

Unnamed: 0,0
INLAND,0.00022
NEAR BAY,0.001897
NEAR OCEAN,0.007231
ISLAND,0.007764
households,0.016784
total_bedrooms,0.019535
total_rooms,0.019675
population,0.026309
rooms_per_house,0.040354
housing_median_age,0.049664


* The following features can be thrown out while losing barely any predictive power:
    * INLAND
    * NEAR BAY
    * ISLAND
    * NEAR OCEAN
    * households
    * total_bedrooms
    * population

### 6.3 MEASURING PERFORMANCE

In [9]:
# Evaluate the best estimator found by the grid search on the held-out test split.
final_model = rf_grid.best_estimator_
final_predictions = final_model.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order. MSE happens to be symmetric,
# but keeping the documented order avoids silent errors if the metric is ever swapped.
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

46949.45692323385

 * We have reached the minimum goal of RMSE < 50,000 USD (test RMSE ≈ 46,949 USD).
 * We were unable to reach the desired value of RMSE < 35,000 USD. More features and data may be needed.