In [2]:
from utils import DataFrameSelector, CategoricalEncoder
import pandas as pd
import numpy as np

np.random.seed(42)

df = pd.read_csv("./housing.csv")

# 1. Features Engineering
Divide by 1.5 to limit the number of income categories

In [3]:
df["income_cat"] = np.ceil(df["median_income"] / 1.5)
df["income_cat"].where(df["income_cat"] < 5, 5.0, inplace=True) # Label those above 5 as 5

Create new features:
- rooms_per_household,
- population_per_household,
- bedrooms_per_room

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [5]:
df_cat = df['ocean_proximity']
df_num = df.drop('ocean_proximity', axis=1).drop('median_house_value', axis=1)

# 2. Cleaning the data

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer

num_attribs = list(df_num)
cat_attribs = ["ocean_proximity"]
y_attribs = ["median_house_value"]

num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', Imputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('cat_encoder', CategoricalEncoder(encoding="onehot-dense")),
    ])

y_pipeline =  Pipeline([
    ('selector', DataFrameSelector(y_attribs)),
    ('imputer', Imputer(strategy="median"))
]) # trick to not change the value. it should actually not have empty value 


from sklearn.pipeline import FeatureUnion
full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
        ("y_pipeline", y_pipeline),
    ])

In [7]:
df_prepared = full_pipeline.fit_transform(df)

In [8]:
# Transform df back to dataframe
columns = df.columns
new_num_columns = pd.Index(['rooms_per_household', 'population_per_household','bedrooms_per_room'])
new_cat_columns = pd.Index(['cat1', 'cat2', 'cat3', 'cat4', 'cat5'])
columns = columns.drop('median_house_value').drop('ocean_proximity').append(new_num_columns).append(new_cat_columns).append(pd.Index(['median_house_value']))
                         
df_prepared = pd.DataFrame(df_prepared, columns=columns)
df_prepared.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,income_cat,rooms_per_household,population_per_household,bedrooms_per_room,cat1,cat2,cat3,cat4,cat5,median_house_value
0,-1.327835,1.052548,0.982143,-0.804819,-0.972476,-0.974429,-0.977033,2.344766,1.890128,0.628559,-0.049597,-1.029988,0.0,0.0,0.0,1.0,0.0,452600.0
1,-1.322844,1.043185,-0.607019,2.04589,1.357143,0.861439,1.669961,2.332238,1.890128,0.327041,-0.092512,-0.888897,0.0,0.0,0.0,1.0,0.0,358500.0
2,-1.332827,1.038503,1.856182,-0.535746,-0.827024,-0.820777,-0.843637,1.782699,1.890128,1.15562,-0.025843,-1.291686,0.0,0.0,0.0,1.0,0.0,352100.0
3,-1.337818,1.038503,1.856182,-0.624215,-0.719723,-0.766028,-0.733781,0.932968,0.941894,0.156966,-0.050329,-0.449613,0.0,0.0,0.0,1.0,0.0,341300.0
4,-1.337818,1.038503,1.856182,-0.462404,-0.612423,-0.759847,-0.629157,-0.012881,-0.00634,0.344711,-0.085616,-0.639087,0.0,0.0,0.0,1.0,0.0,342200.0


# 3. Split data into test & train sets

In [9]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df_prepared, test_size=0.2, random_state=42)

y_train = train_set['median_house_value'].copy()
X_train = train_set.drop('median_house_value', axis=1)

y_test = test_set['median_house_value'].copy()
X_test = test_set.drop('median_house_value', axis=1)

# 4. Creating model

In [51]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

# 5. Fine tuning model

In [54]:
from sklearn.model_selection import GridSearchCV
params_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2,3,6,8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2,3,4]},
]

grid_search = GridSearchCV(forest_reg, params_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Display the results of the GridSearch
cvres = grid_search.cv_results_
for mean_score, std_score, params in zip(cvres['mean_test_score'], cvres['std_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), np.sqrt(std_score), params)

(63112.899747570802, 11371.451300675057, {'max_features': 2, 'n_estimators': 3})
(55816.275560515853, 9934.7916643230001, {'max_features': 2, 'n_estimators': 10})
(52856.474970041723, 8698.2122478551009, {'max_features': 2, 'n_estimators': 30})
(61745.880679843074, 9307.7706895185183, {'max_features': 3, 'n_estimators': 3})
(54341.383728096909, 9217.7861967494755, {'max_features': 3, 'n_estimators': 10})
(52079.555764536228, 7734.0236170931503, {'max_features': 3, 'n_estimators': 30})
(60541.965821743579, 11297.887434091164, {'max_features': 6, 'n_estimators': 3})
(52966.223340123201, 7927.3963767548075, {'max_features': 6, 'n_estimators': 10})
(51056.497175355253, 4087.7113197090707, {'max_features': 6, 'n_estimators': 30})
(59857.531347716453, 11649.640309935896, {'max_features': 8, 'n_estimators': 3})
(52227.789787665992, 11522.216743466393, {'max_features': 8, 'n_estimators': 10})
(50651.127884446243, 6636.9891609611641, {'max_features': 8, 'n_estimators': 30})
(61997.681881928293,

In [56]:
forest_reg = grid_search.best_estimator_