In [1]:
# fine tuning the model
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

def loaddata(housing_path = os.path.join("../datasets","housing")):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)
housing = loaddata()

In [2]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [3]:
housing["income_cat"] = pd.cut(housing["median_income"],
                              bins = [0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
                              labels = [1, 2, 3, 4, 5])

In [4]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, train_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train = housing.loc[train_index]
    strat_test = housing.loc[test_index]

for set_ in (strat_train, strat_test):
    set_.drop("income_cat", axis=1, inplace=True)

In [5]:
# making copy of our training data 
# to incapsulate the testing data

housing = strat_train.drop("median_house_value", axis=1)
housing_labels = strat_train["median_house_value"].copy()

In [6]:
# imputing missing values
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='median', verbose=0)

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
            bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]


In [8]:
# feature scaling
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

housing_num_tr = num_pipeline.fit_transform(housing_num)

In [9]:
# column transformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs)
])

housing_prepared = full_pipeline.fit_transform(housing)

In [10]:
# fine tuning the model
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

param_grid = [
{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
{'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
scoring='neg_mean_squared_error',
return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'max_features': [2, 4, 6, 8],


In [11]:
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

In [12]:
grid_search.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features=6, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=30,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [13]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

66095.72189461709 {'max_features': 2, 'n_estimators': 3}
59105.17175233206 {'max_features': 2, 'n_estimators': 10}
56384.21903371621 {'max_features': 2, 'n_estimators': 30}
63687.371352889866 {'max_features': 4, 'n_estimators': 3}
56414.5059163544 {'max_features': 4, 'n_estimators': 10}
54202.1450031865 {'max_features': 4, 'n_estimators': 30}
63431.513658422184 {'max_features': 6, 'n_estimators': 3}
56510.990884490355 {'max_features': 6, 'n_estimators': 10}
53505.31293310913 {'max_features': 6, 'n_estimators': 30}
64100.805816022345 {'max_features': 8, 'n_estimators': 3}
56196.680273613376 {'max_features': 8, 'n_estimators': 10}
53937.98005230636 {'max_features': 8, 'n_estimators': 30}
65603.17081279102 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
57140.740385395686 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
64019.1051171729 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
55885.02304369635 {'bootstrap': False, 'max_features': 3, 'n_estimators':

In [14]:
# best models and their errors
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([0.07455656, 0.06662436, 0.03938345, 0.01774088, 0.01718588,
       0.01913732, 0.01726559, 0.33551821, 0.06147275, 0.10893937,
       0.07473666, 0.0081417 , 0.15127243, 0.0026057 , 0.00541916])

In [15]:
# run the model on test set