In [1]:
import os
import numpy as np
import pandas as pd

# Default folder (relative to the working directory) that holds housing.csv.
HOUSING_PATH = os.path.join("datasets", "housing")


def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` into a DataFrame.

    Parameters
    ----------
    housing_path : str
        Directory containing ``housing.csv``. Defaults to ``HOUSING_PATH``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file, as produced by ``pd.read_csv``.
    """
    csv_file = os.path.join(housing_path, "housing.csv")
    housing_frame = pd.read_csv(csv_file)
    return housing_frame

# Load the raw housing table once; later cells derive everything from `dataset`.
dataset = load_housing_data(HOUSING_PATH)

Data Preparation

In [2]:
# One-hot encoder for categorical attributes.
# NOTE(review): the pipeline cell below constructs its own OneHotEncoder(),
# so this instance looks unused in the cells visible here -- confirm before removing.
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()

# Imputer that fills missing values using the median strategy.
# NOTE(review): the numeric pipeline below also builds its own SimpleImputer,
# so this instance looks unused in the cells visible here -- confirm before removing.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")

#Custom Transformer
from sklearn.base import BaseEstimator, TransformerMixin
# Column positions read by CombinedAttributesAdder.transform.
# NOTE(review): presumably these match the column order of the numeric housing
# frame that feeds the pipeline -- verify if the column layout ever changes.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D array.

    Always appended: rooms per household and population per household.
    Appended only when ``add_bedrooms_per_room`` is true: bedrooms per room.
    The source columns are selected with the module-level ``*_ix`` indices.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` must support ``X[:, i]`` indexing (e.g. a 2-D NumPy array).
        """
        households = X[:, households_ix]
        total_rooms = X[:, rooms_ix]
        extra_columns = [
            total_rooms / households,           # rooms per household
            X[:, population_ix] / households,   # population per household
        ]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / total_rooms)  # bedrooms per room
        # np.c_ stacks the original matrix and each 1-D ratio as columns.
        return np.c_[tuple([X, *extra_columns])]

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Separate the prediction target from the feature columns.
y = dataset["median_house_value"].values
housing = dataset.drop(columns=["median_house_value"])
# Every feature except the categorical column handled by the one-hot encoder.
housing_num = housing.drop(columns=["ocean_proximity"])

# Numeric branch: fill missing values, add the derived attributes, then scale.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),  # missing values
        ("attribs_adder", CombinedAttributesAdder()),   # creating new attributes
        ("std_scaler", StandardScaler()),               # scaling
    ]
)

num_attribs = housing_num.columns.tolist()
cat_attribs = ["ocean_proximity"]

# Route the numeric columns through num_pipeline and the categorical column
# through a one-hot encoder; the two outputs are combined by ColumnTransformer.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ]
)

# NOTE(review): the pipeline is fitted on every row of `housing`, and the
# train/test split only happens in a later cell. The imputer medians and the
# scaler statistics therefore include rows that end up in the test set
# (data leakage). Consider splitting first and fitting on the training rows only.
housing_prepared = full_pipeline.fit_transform(housing)


Split dataset into Training Set and Test Set


In [5]:
from sklearn.model_selection import train_test_split

# Target vector: the column the model has to predict.
y = dataset["median_house_value"].values
# Work on a copy so the prepared matrix itself is left untouched.
X = housing_prepared.copy()
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Fit a linear regression model on the training split.
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression().fit(X_train, y_train)  # fit() returns the estimator

# Predict the target for the held-out test rows.
y_pred = lin_reg.predict(X_test)

#Evaluating

In [23]:
#Evaluating
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Score the test-set predictions. The MSE is computed once and reused for the
# RMSE instead of being recomputed.
mse = mean_squared_error(y_test, y_pred)
print("r2:", r2_score(y_test, y_pred))
print("mean_squared_error (MSE):", mse)
# The square root of the MSE is the RMSE; the original label wrongly said "(MSE)".
print("root_mean_squared_error (RMSE):", np.sqrt(mse))

r2: 0.5970176824350761
mean_squared_error (MSE): 5280716470.094017
root_mean_squared_error (MSE): 72668.53837868226
