In [None]:
import os
import numpy as np
import pandas as pd
from sklearn import svm, model_selection, \
    impute, base, pipeline, preprocessing, \
    compose
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
HOUSING_PATH = os.path.join("datasets", "housing")

In [3]:
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

In [4]:
housing = load_housing_data()

In [5]:
bins = [0, 1.5, 3, 4.5, 6, np.inf]
labels = range(len(bins) - 1)
housing["income_cat"] = pd.cut(housing["median_income"], bins=bins, labels=labels)

In [6]:
kwargs = {
    "n_splits": 1,
    "test_size": 0.2,
    "random_state": 42
}
splitter = model_selection.StratifiedShuffleSplit(**kwargs)
splits = splitter.split(housing, housing["income_cat"])

In [7]:
for train_ix, test_ix in splits:
    train_raw = housing.loc[train_ix]
    test = housing.loc[test_ix]

In [8]:
train_raw = train_raw.drop("income_cat", axis=1)
test_raw = test.drop("income_cat", axis=1)

In [9]:
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = housing["population"] / housing["households"]

In [10]:
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(base.BaseEstimator, base.TransformerMixin):
    
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        args = [X, rooms_per_household, population_per_household]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [12]:
tgt = "median_house_value"

train_raw_X = train_raw.drop(tgt, axis=1)
train_y = train_raw[tgt]

test_raw_X = test_raw.drop(tgt, axis=1)
test_y = test_raw[tgt]

In [13]:
num_pipeline = pipeline.Pipeline([
    ("imputer", impute.SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scalar", preprocessing.StandardScaler())
])

cat_ftrs = ["ocean_proximity"]
num_ftrs = [c for c in train_raw_X.columns if c not in cat_ftrs]

full_pipe = compose.ColumnTransformer([
    ("num", num_pipeline, num_ftrs),
    ("cat", preprocessing.OneHotEncoder(), cat_ftrs)
])

In [14]:
train_X = full_pipe.fit_transform(train_raw_X)

## 1.

In [15]:
svr = svm.SVR()

scoring = "neg_mean_squared_error"
scores = model_selection.cross_val_score(svr, train_X, train_y,
                                        scoring=scoring, cv=10)
rmse_scores = np.sqrt(-scores)

In [17]:
print(f"RMSE of SVR: {round(rmse_scores.mean())}")

RMSE of SVR: 118573.0


In [24]:
param_grid = [
    {"kernel": ["linear"], "C": [.1, 1, 10]},
    {"kernel": ["rbf"], "C": [.1, 1, 10]}
]
svr = svm.SVR()
scoring = "neg_mean_squared_error"
grid_search = model_selection.GridSearchCV(svr, param_grid,
                                          scoring=scoring,
                                          cv=5,
                                           return_train_score=True)
grid_search.fit(train_X, train_y)

GridSearchCV(cv=5, error_score=nan,
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid=[{'C': [0.1, 1, 10], 'kernel': ['linear']},
                         {'C': [0.1, 1, 10], 'kernel': ['rbf']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='neg_mean_squared_error', verbose=0)

In [25]:
grid_search.cv_results_

{'mean_fit_time': array([5.76298113, 5.82682843, 5.39119024, 9.26442766, 9.63506465,
        9.32847109]),
 'mean_score_time': array([0.6355165 , 0.6418973 , 0.61636577, 1.36435928, 1.3173573 ,
        1.32326155]),
 'mean_test_score': array([-1.39828181e+10, -1.26724517e+10, -7.16634195e+09, -1.41354200e+10,
        -1.40734826e+10, -1.34843917e+10]),
 'mean_train_score': array([-1.39795414e+10, -1.26673216e+10, -7.15976614e+09, -1.41323297e+10,
        -1.40702598e+10, -1.34807727e+10]),
 'param_C': masked_array(data=[0.1, 1, 10, 0.1, 1, 10],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['linear', 'linear', 'linear', 'rbf', 'rbf', 'rbf'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.1, 'kernel': 'linear'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 0.1, 'kernel':

In [28]:
grid_search.best_estimator_

SVR(C=10, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [43]:
scores = grid_search.cv_results_["mean_test_score"]
rmse_scores = np.sqrt(-scores)
print(f"RMSE: {round(rmse_scores.mean())}")

RMSE: 111520.0


## 2.