# Exercise Chapter 5: Predicting housing prices with SVM

Important: this exercise uses an SVM REGRESSOR, NOT a classifier.

In [2]:
# Boilerplate code / load libraries
import sklearn
# Require scikit-learn >= 0.20. The version is compared as a tuple of integers
# because raw strings are ordered lexicographically (e.g. "0.9" >= "0.20" is
# True even though 0.9 is the older release).
def _leading_int(text):
    """Return the integer formed by the leading digits of ``text`` (0 if none).

    Tolerates suffixes such as "0rc1" or "24dev0" in a version component.
    """
    digits = ""
    for ch in text:
        if not ch.isdigit():
            break
        digits += ch
    return int(digits) if digits else 0

assert tuple(_leading_int(part) for part in sklearn.__version__.split(".")[:2]) >= (0, 20)

# Common imports
import numpy as np
import pandas as pd
import os

# Seed NumPy's global random generator so that code relying on it is stable across runs.
# (The train/test split and the hyperparameter search additionally pass random_state=42 explicitly.)
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes for every figure: size 12 for the tick labels on both
# axes, size 14 for the axis labels.
mpl.rc(('xtick', 'ytick'), labelsize=12)
mpl.rc('axes', labelsize=14)

In [4]:
# Default directory (relative to the working directory) that holds housing.csv.
HOUSING_PATH = os.path.join("datasets", "housing")


def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame.

    Parameters
    ----------
    housing_path : str
        Directory containing ``housing.csv``; defaults to ``HOUSING_PATH``.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

# Load the full dataset from the default location (datasets/housing/housing.csv).
housing = load_housing_data()

In [5]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; random_state=42 fixes the shuffle so the split is repeatable.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)


In [7]:
# Summarise the training split: column dtypes and non-null counts.
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 14196 to 15795
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   total_bedrooms      16512 non-null  float64
 5   population          16512 non-null  float64
 6   households          16512 non-null  float64
 7   median_income       16512 non-null  float64
 8   median_house_value  16512 non-null  float64
 9   ocean_proximity     16512 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.4+ MB


In [8]:
# Separate the target (median_house_value) from the predictors of the training set.
y_train = train_set["median_house_value"].copy()
X_train = train_set.drop(columns="median_house_value")

In [9]:
# Sanity check: the feature matrix should have one column fewer than train_set (label removed).
X_train.shape

(16512, 9)

In [15]:
# Imputing missing values
from sklearn.impute import SimpleImputer

In [16]:
from sklearn.base import BaseEstimator, TransformerMixin

# Positional indices of the columns that CombinedAttributesAdder.transform reads
# from its 2-D input array; they match the column order reported by train_set.info()
# (total_rooms=3, total_bedrooms=4, population=5, households=6).
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

# Code for getting the column indices
# col_names = "total_rooms", "total_bedrooms", "population", "households"
# rooms_ix, bedrooms_ix, population_ix, households_ix = [
#    housing.columns.get_loc(c) for c in col_names] # get the column indices

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends ratio features to a 2-D array.

    Columns are located through the module-level ``rooms_ix``, ``bedrooms_ix``,
    ``population_ix`` and ``households_ix`` positions.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        When true, a third ratio (bedrooms / rooms) is appended as well.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Learn nothing from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with rooms/household and population/household columns
        appended, followed by bedrooms/room when ``add_bedrooms_per_room`` is set."""
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        ratios = (rooms / households, X[:, population_ix] / households)
        if self.add_bedrooms_per_room:
            ratios += (X[:, bedrooms_ix] / rooms,)
        # Subscripting np.c_ with a tuple is the same as listing the arrays
        # separated by commas inside the brackets.
        return np.c_[(X,) + ratios]

In [17]:
# Define num_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Preprocessing applied to the numeric columns, in order:
#   1. fill missing values with the per-column median,
#   2. append the derived ratio features,
#   3. standardise every feature.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])


In [19]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Column groups: every numeric column goes through num_pipeline, while the
# single categorical column is one-hot encoded.
cat_attribs = ["ocean_proximity"]
num_attribs = X_train.select_dtypes(include=[np.number]).columns.tolist()

full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

# Fit the preprocessing on the training features and transform them in one step.
X_train_prep = full_pipeline.fit_transform(X_train)

In [20]:
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV

# RBF-kernel support vector regressor whose gamma and C are tuned below.
svm_reg = SVR(kernel="rbf")

# Candidate values: 4 gammas x 5 Cs = 20 combinations, of which the search
# samples n_iter=10.
svm_random = dict(
    gamma=[0.1, 1, 100, 1000],
    C=[0.001, 0.1, 1, 10, 1000],
)

# 5-fold randomized search over the candidates above, run on all CPU cores.
# No `scoring` argument is passed, so the estimator's own score method ranks
# the candidates.
rs_random = RandomizedSearchCV(
    estimator=svm_reg,
    param_distributions=svm_random,
    n_iter=10,
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)


In [21]:
# Long-running cell: 5 folds x 10 candidates = 50 fits; in the recorded run each
# fit took roughly 45-60 seconds.
rs_random.fit(X_train_prep, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END .................................C=0.001, gamma=0.1; total time=  54.1s
[CV] END .................................C=0.001, gamma=0.1; total time=  55.5s
[CV] END .................................C=0.001, gamma=0.1; total time=  58.6s
[CV] END .................................C=0.001, gamma=0.1; total time=  59.3s
[CV] END .................................C=0.001, gamma=0.1; total time=  46.4s
[CV] END ....................................C=1000, gamma=1; total time=  46.2s
[CV] END ....................................C=1000, gamma=1; total time=  50.1s
[CV] END ....................................C=1000, gamma=1; total time=  52.9s
[CV] END ....................................C=1000, gamma=1; total time=  46.1s
[CV] END ...................................C=10, gamma=1000; total time=  45.1s
[CV] END ....................................C=1000, gamma=1; total time=  50.4s
[CV] END ...................................C=10

RandomizedSearchCV(cv=5, estimator=SVR(), n_jobs=-1,
                   param_distributions={'C': [0.001, 0.1, 1, 10, 1000],
                                        'gamma': [0.1, 1, 100, 1000]},
                   random_state=42, verbose=2)

In [22]:
# Hyperparameter combination selected by the randomized search.
rs_random.best_params_

{'gamma': 0.1, 'C': 1000}

In [24]:
# Build a fresh RBF SVR using the hyperparameters selected by the random search.
# NOTE(review): with RandomizedSearchCV's default refit behaviour, rs_random.best_estimator_
# is presumably already trained on the full training set with these parameters — confirm
# whether this separate retraining is needed.
new_svr_model = SVR(kernel="rbf", **rs_random.best_params_)


In [25]:
# Train the tuned model on the complete prepared training set.
new_svr_model.fit(X_train_prep, y_train)

SVR(C=1000, gamma=0.1)

In [26]:
X_test = test_set.drop("median_house_value", axis=1) # drop labels for test set
y_test = test_set["median_house_value"].copy()

# Only transform (no fitting) so the test data goes through the preprocessing already fitted on X_train.
X_test_prepared = full_pipeline.transform(X_test)

In [28]:
from sklearn.metrics import mean_squared_error

# Score the tuned model on the held-out test set: predictions -> MSE -> RMSE.
final_predictions = new_svr_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_true=y_test, y_pred=final_predictions)
final_rmse = np.sqrt(final_mse)

In [29]:
# Root-mean-squared error on the held-out test set (same units as median_house_value).
final_rmse

70857.56291661152