In [82]:
import os
import numpy as np
HOUSING_PATH = os.path.join("datasets", "housing")
import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

dataset = load_housing_data(housing_path=HOUSING_PATH)

Data Preparation

In [83]:
#Lib to categorial atributes
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()

#mission values
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")

#Custom Transformer
from sklearn.base import BaseEstimator, TransformerMixin
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [84]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

y = dataset['median_house_value'].values
housing = dataset.drop("median_house_value", axis=1)
housing_num = housing.drop("ocean_proximity", axis=1)

num_pipeline = Pipeline([
 ('imputer', SimpleImputer(strategy="median")), #missin values
 ('attribs_adder', CombinedAttributesAdder()), #creating new attributes
 ('std_scaler', StandardScaler()), # scaling
 ])


num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
full_pipeline = ColumnTransformer([
 ("num", num_pipeline, num_attribs), #num_pipeline
 ("cat", OneHotEncoder(), cat_attribs), #one hot encoder
 ])
housing_prepared = full_pipeline.fit_transform(housing)


Split dataset into Training Set and Test Set


In [85]:
from sklearn.model_selection import train_test_split

y = dataset['median_house_value'].values
X = housing_prepared.copy()

test_examples = np.random.choice(np.arange(len(dataset)), 5, False)
X_test_examples = np.take(X, test_examples, axis=0)
y_test_examples = np.take(y, test_examples, axis=0)

X = np.delete(X, test_examples, axis=0)
y = np.delete(y, test_examples, axis=0)

print(dataset.iloc[test_examples])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
11683    -117.99     33.86                20.0       3540.0           906.0   
9518     -123.19     39.15                16.0       2577.0           495.0   
14106    -117.10     32.75                23.0       1858.0           551.0   
9479     -123.35     39.40                27.0       1321.0           338.0   
11417    -117.96     33.70                23.0       4417.0           740.0   

       population  households  median_income  median_house_value  \
11683      2898.0       876.0         3.0252            178000.0   
9518       1232.0       488.0         2.6012            125600.0   
14106      1506.0       492.0         1.7446             85200.0   
9479        779.0       327.0         1.8500             71800.0   
11417      1865.0       693.0         5.3428            279300.0   

      ocean_proximity  
11683       <1H OCEAN  
9518        <1H OCEAN  
14106      NEAR OCEAN  
9479        <1H OCEA

LINEAR REGRESSION

In [86]:
#Training on the training set
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

#predicting the test results
lin_reg_y_pred = lin_reg.predict(X_test)

#Evaluating
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

print("EVALUATING LINEAR REGRESSION")
print("r2:", r2_score(y_test, lin_reg_y_pred))
print("mean_squared_error (MSE):", mean_squared_error(y_test, lin_reg_y_pred))
print("root_mean_squared_error (MSE):", np.sqrt(mean_squared_error(y_test, lin_reg_y_pred)))

EVALUATING LINEAR REGRESSION
r2: 0.6293852716541182
mean_squared_error (MSE): 4890815285.364055
root_mean_squared_error (MSE): 69934.36412354127


In [91]:
lin_reg.predict(X_test_examples)

array([171863.55150346, 160951.23355368, 147810.97715818, 147761.19868742,
       278168.61240026])

In [87]:
y_test_examples

array([178000., 125600.,  85200.,  71800., 279300.])

DECISION TREE

In [92]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train, y_train)
tree_reg_y_pred = tree_reg.predict(X_test)

print("EVALUATING DECISION TREES")
print("r2:", r2_score(y_test, tree_reg_y_pred))
print("mean_squared_error (MSE):", mean_squared_error(y_test, tree_reg_y_pred))
print("root_mean_squared_error (MSE):", np.sqrt(mean_squared_error(y_test, tree_reg_y_pred)))


EVALUATING DECISION TREES
r2: 0.6209816034514068
mean_squared_error (MSE): 5001714247.967047
root_mean_squared_error (MSE): 70722.79864348588


RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(X_train, y_train)
random_reg_y_pred = forest_reg.predict(X_test)

print("EVALUATING RANDOM FOREST")
print("r2:", r2_score(y_test, random_reg_y_pred))
print("mean_squared_error (MSE):", mean_squared_error(y_test, random_reg_y_pred))
print("root_mean_squared_error (MSE):", np.sqrt(mean_squared_error(y_test, random_reg_y_pred)))

RANDOM FOREST - HYPERPARAMETER TUNNING

In [None]:
#GRID SEARCH
from sklearn.model_selection import GridSearchCV

param_grid = [
 {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
 {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
 ]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X_train, y_train)

print("grid_search.best_params_ : ", grid_search.best_params_)
print("grid_search.best_estimator_ : ", grid_search.best_estimator_)

forest_grid_search_model = grid_search.best_estimator_
forest_grid_search_predictions = forest_grid_search_model.predict(X_test)

print("EVALUATING RANDOM FOREST - GRID SEARCH")
print("r2:", r2_score(y_test, forest_grid_search_predictions))
print("mean_squared_error (MSE):", mean_squared_error(y_test, forest_grid_search_predictions))
print("root_mean_squared_error (MSE):", np.sqrt(mean_squared_error(y_test, forest_grid_search_predictions)))