# Chapter 2 Exercises



## Exercise 1

### Setup

In [1]:
import numpy as np
import pandas as pd

### Download data

In [2]:
from pathlib import Path
import tarfile
import urllib.request

def load_housing_data():
    tarball_path = Path("datasets/housing.tgz")
    if not tarball_path.is_file():
        Path("datasets").mkdir(parents=True, exist_ok=True)
        url = "https://github.com/ageron/data/raw/main/housing.tgz"
        urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            housing_tarball.extractall(path="datasets", filter="data")
    return pd.read_csv(Path("datasets/housing/housing.csv"))

housing_full = load_housing_data()

In [3]:
housing_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [4]:
housing_full.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


### Split data - training and validation

In [5]:
housing_full['income_cat'] = pd.cut(
    housing_full['median_income'],
    bins=[0., 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=[1, 2, 3, 4, 5])

In [38]:
from sklearn.model_selection import train_test_split

strat_train_set, strat_test_set = train_test_split(
  housing_full,
  test_size=0.2,
  random_state=42,
  stratify=housing_full['income_cat'])

In [7]:
housing_full['income_cat'].value_counts() / len(housing_full)

income_cat
3    0.350581
2    0.318847
4    0.176308
5    0.114438
1    0.039826
Name: count, dtype: float64

In [8]:
strat_test_set['income_cat'].value_counts() / len(strat_test_set)

income_cat
3    0.350533
2    0.318798
4    0.176357
5    0.114341
1    0.039971
Name: count, dtype: float64

In [40]:
# Not using income_cat anymore - already stratified
# Remove so data is back to original state
for set_ in (strat_train_set, strat_test_set):
    # Code might not run in order, ensure column exists
    if ("income_cat" in set_.columns):
        set_.drop("income_cat", axis=1, inplace=True)

housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

### Define pipeline

Define transformation pipelines to prepare the data for machine learning.

In [19]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import rbf_kernel

class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """
    Find similar clusters using RBF kernel based on KMeans cluster centers.
    """
    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        self.kmeans_ = KMeans(self.n_clusters, random_state=self.random_state)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self  # always return self!

    def transform(self, X):
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]

In [68]:
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn import set_config

# Show the pipeline diagram
set_config(display='diagram')

def column_ratio(X):
    """
    Calculate the ratio between the first and second columns of a 2D array.

    Args:
        X: A 2D numpy array or matrix with at least 2 columns.

    Returns:
        numpy.ndarray: A 2D array containing the element-wise ratio of
                       column 0 divided by column 1.
    """
    return X[:, [0]] / X[:, [1]]

def ratio_name(function_transformer, feature_names_in):
    return ["ratio"] # feature names out

def ratio_pipeline():
    return make_pipeline(
        SimpleImputer(strategy="median"),
        FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        StandardScaler())

log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(np.log, feature_names_out="one-to-one"),
    StandardScaler())

cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))

cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)
default_num_pipeline = make_pipeline(SimpleImputer(strategy="median"),
                                     StandardScaler())

preprocessing = ColumnTransformer([
        ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
        ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
        ("people_per_house", ratio_pipeline(), ["population", "households"]),
        ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population",
                            "households", "median_income"]),
        ("geo", cluster_simil, ["latitude", "longitude"]),
        ("cat", cat_pipeline, make_column_selector(dtype_include="object")),
    ],
    remainder=default_num_pipeline)

preprocessing


0,1,2
,transformers,"[('bedrooms', ...), ('rooms_per_house', ...), ...]"
,remainder,Pipeline(step...ardScaler())])
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...t 0x11ae84680>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...t 0x11ae84a40>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...t 0x11ae84680>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...t 0x11ae84a40>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...t 0x11ae84680>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...t 0x11ae84a40>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<ufunc 'log'>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,'one-to-one'
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_clusters,10.0
,gamma,1.0
,random_state,42.0

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


### Transform data

In [41]:
housing_prepared = preprocessing.fit_transform(housing)
housing_prepared.shape

# Code might not run in order, so adding this assertion to ensure correctness
assert housing_prepared.shape == (16512, 24)

In [None]:
# Get a glimpse of the transformed data
pd.DataFrame(housing_prepared, columns=preprocessing.get_feature_names_out()).head(10)

Unnamed: 0,bedrooms__ratio,rooms_per_house__ratio,people_per_house__ratio,log__total_bedrooms,log__total_rooms,log__population,log__households,log__median_income,geo__Cluster 0 similarity,geo__Cluster 1 similarity,...,geo__Cluster 6 similarity,geo__Cluster 7 similarity,geo__Cluster 8 similarity,geo__Cluster 9 similarity,cat__ocean_proximity_<1H OCEAN,cat__ocean_proximity_INLAND,cat__ocean_proximity_ISLAND,cat__ocean_proximity_NEAR BAY,cat__ocean_proximity_NEAR OCEAN,remainder__housing_median_age
0,1.846624,-0.866027,-0.330204,1.324114,0.637892,0.456906,1.310369,-1.071522,0.4581829,1.241847e-14,...,0.0008489216,0.9770322,2.382191e-08,3.819126e-18,0.0,0.0,0.0,1.0,0.0,1.861119
1,-0.508121,0.02455,-0.253616,-0.252671,-0.063576,-0.711654,-0.14203,1.194712,6.511495e-10,0.9579596,...,5.614049e-27,1.260964e-13,0.1103491,0.354761,1.0,0.0,0.0,0.0,0.0,0.90763
2,-0.202155,-0.041193,-0.051041,-0.925266,-0.859927,-0.941997,-0.91303,-0.756981,0.3432506,4.261141e-15,...,0.005641131,0.7303265,2.508224e-08,2.669659e-18,0.0,1.0,0.0,0.0,0.0,0.351428
3,-0.149006,-0.034858,-0.141475,0.952773,0.943475,0.6707,0.925373,-0.912253,2.244844e-15,0.2704823,...,5.913326e-35,5.2012629999999996e-20,0.001712982,0.8874598,0.0,1.0,0.0,0.0,0.0,-0.919891
4,0.963208,-0.666554,-0.306148,1.437622,1.00359,0.719093,1.481464,0.034537,1.090228e-11,0.9422206,...,5.421817e-30,1.04803e-15,0.02568824,0.5279506,0.0,0.0,0.0,0.0,1.0,0.5898
5,-0.743942,0.113646,-0.184066,-0.475568,-0.159448,-0.655784,-0.295737,-0.078518,0.6423987,1.159386e-11,...,0.0001543061,0.3632806,8.154546e-06,2.324587e-14,0.0,1.0,0.0,0.0,0.0,1.861119
6,-0.489249,0.556477,0.050602,-1.058155,-0.85623,-1.076379,-1.255299,1.203677,8.006474e-11,0.9595991,...,1.139e-28,1.11142e-14,0.04431923,0.3744913,1.0,0.0,0.0,0.0,0.0,0.430885
7,-0.389217,-0.024225,-0.158161,1.029712,1.126642,0.806339,1.105551,0.942136,0.7868544,2.160931e-12,...,6.336028e-05,0.7521653,8.513323e-07,1.114788e-15,0.0,0.0,0.0,1.0,0.0,0.5898
8,0.675013,-0.456707,0.128117,0.905123,0.58074,1.167537,0.864504,-0.498515,2.807941e-09,0.8685613,...,7.934107e-26,7.053833e-13,0.1824289,0.2873697,1.0,0.0,0.0,0.0,0.0,-0.522604
9,1.816054,-0.94147,-0.044126,0.40679,-0.246253,0.468905,0.492199,-2.283376,7.083722e-11,0.9876226,...,1.915751e-28,9.000955e-15,0.06364003,0.5620424,1.0,0.0,0.0,0.0,0.0,-0.919891


In [42]:
preprocessing.get_feature_names_out()

array(['bedrooms__ratio', 'rooms_per_house__ratio',
       'people_per_house__ratio', 'log__total_bedrooms',
       'log__total_rooms', 'log__population', 'log__households',
       'log__median_income', 'geo__Cluster 0 similarity',
       'geo__Cluster 1 similarity', 'geo__Cluster 2 similarity',
       'geo__Cluster 3 similarity', 'geo__Cluster 4 similarity',
       'geo__Cluster 5 similarity', 'geo__Cluster 6 similarity',
       'geo__Cluster 7 similarity', 'geo__Cluster 8 similarity',
       'geo__Cluster 9 similarity', 'cat__ocean_proximity_<1H OCEAN',
       'cat__ocean_proximity_INLAND', 'cat__ocean_proximity_ISLAND',
       'cat__ocean_proximity_NEAR BAY', 'cat__ocean_proximity_NEAR OCEAN',
       'remainder__housing_median_age'], dtype=object)

### Select model

In [61]:
from sklearn.svm import SVR

svr = SVR(kernel="rbf", C=0.1)
svm_regressor = make_pipeline(preprocessing, svr)
svm_regressor.fit(housing, housing_labels)

0,1,2
,steps,"[('columntransformer', ...), ('svr', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('bedrooms', ...), ('rooms_per_house', ...), ...]"
,remainder,Pipeline(step...ardScaler())])
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...t 0x1177e6e80>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...t 0x1177e7420>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...t 0x1177e6e80>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...t 0x1177e7420>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...t 0x1177e6e80>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...t 0x1177e7420>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<ufunc 'log'>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,'one-to-one'
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_clusters,10.0
,gamma,1.0
,random_state,42.0

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,C,0.1
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


### Predictions - first attempt

In [62]:
housing_predictions = svm_regressor.predict(housing).round(-2) # round to nearest hundred
housing_predictions[:5]


array([179200., 179300., 179100., 179100., 179200.])

In [63]:
housing_labels.iloc[:5]

13096    458300.0
14973    483800.0
3785     101700.0
14689     96100.0
20507    361800.0
Name: median_house_value, dtype: float64

Predictions are quite bad!

In [64]:
from sklearn.metrics import root_mean_squared_error
svm_rmse = root_mean_squared_error(housing_labels, housing_predictions)
svm_rmse

118414.95118392352

### Fine tune the model

WARNING: This cell may take a long time to run.

In [69]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

full_pipeline = Pipeline([
    ('preprocessing', preprocessing),
    ('svr', SVR(kernel="linear", C=1.0))
])

param_grid = [
    {
        'svr__kernel': ['linear'],
        'svr__C': [0.1, 1.0, 10.0]
    },
    {
        'svr__kernel': ['rbf'],
        'svr__C': [0.1, 1.0, 10.0],
        'svr__gamma': [0.01, 0.1, 1.0]
    }
]

grid_search = GridSearchCV(full_pipeline, param_grid, cv=3,
                           scoring='neg_root_mean_squared_error')

# May take a long time to run
# Training on a subset to speed up
grid_search.fit(housing[:5000], housing_labels[:5000])

0,1,2
,estimator,Pipeline(step...l='linear'))])
,param_grid,"[{'svr__C': [0.1, 1.0, ...], 'svr__kernel': ['linear']}, {'svr__C': [0.1, 1.0, ...], 'svr__gamma': [0.01, 0.1, ...], 'svr__kernel': ['rbf']}]"
,scoring,'neg_root_mean_squared_error'
,n_jobs,
,refit,True
,cv,3
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('bedrooms', ...), ('rooms_per_house', ...), ...]"
,remainder,Pipeline(step...ardScaler())])
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...t 0x11ae84680>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...t 0x11ae84a40>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...t 0x11ae84680>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...t 0x11ae84a40>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...t 0x11ae84680>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...t 0x11ae84a40>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<ufunc 'log'>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,'one-to-one'
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_clusters,10.0
,gamma,1.0
,random_state,42.0

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,C,10.0
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [70]:
grid_search.best_params_

{'svr__C': 10.0, 'svr__kernel': 'linear'}

In [71]:
cv_res = pd.DataFrame(grid_search.cv_results_)
cv_res.sort_values(by="mean_test_score", ascending=False, inplace=True)
cv_res.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svr__C,param_svr__kernel,param_svr__gamma,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
2,0.471843,0.004902,0.173473,0.0081,10.0,linear,,"{'svr__C': 10.0, 'svr__kernel': 'linear'}",-106135.817586,-107064.824543,-102000.759713,-105067.133947,2201.174185,1
1,0.48406,0.023179,0.17496,0.008118,1.0,linear,,"{'svr__C': 1.0, 'svr__kernel': 'linear'}",-118501.271743,-120584.513069,-114515.902573,-117867.229128,2517.73918,2
10,0.719539,0.035812,0.514072,0.015885,10.0,rbf,0.1,"{'svr__C': 10.0, 'svr__gamma': 0.1, 'svr__kern...",-119529.986427,-121724.044917,-115424.423964,-118892.818436,2610.975977,3
9,0.68877,0.008693,0.523484,0.039371,10.0,rbf,0.01,"{'svr__C': 10.0, 'svr__gamma': 0.01, 'svr__ker...",-120070.397317,-122306.63442,-115985.509091,-119454.180276,2617.116599,4
0,0.573387,0.107277,0.172468,0.003896,0.1,linear,,"{'svr__C': 0.1, 'svr__kernel': 'linear'}",-120211.660829,-122451.44287,-116096.798901,-119586.6342,2631.649509,5


In [72]:
housing_predictions = grid_search.predict(housing.iloc[:5])
housing_predictions[:5]

array([179037.33679453, 225831.0205717 , 149619.53837431, 155044.95479993,
       200373.05755343])

In [73]:
housing_labels[:5]

13096    458300.0
14973    483800.0
3785     101700.0
14689     96100.0
20507    361800.0
Name: median_house_value, dtype: float64

### Model answer

In [74]:
# Retry with model answer
param_grid = [
        {'svr__kernel': ['linear'], 'svr__C': [10., 30., 100., 300., 1000.,
                                               3000., 10000., 30000.0]},
        {'svr__kernel': ['rbf'], 'svr__C': [1.0, 3.0, 10., 30., 100., 300.,
                                            1000.0],
         'svr__gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]},
    ]

svr_pipeline = Pipeline([("preprocessing", preprocessing), ("svr", SVR())])
grid_search = GridSearchCV(svr_pipeline, param_grid, cv=3,
                           scoring='neg_root_mean_squared_error')
grid_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])

0,1,2
,estimator,"Pipeline(step...svr', SVR())])"
,param_grid,"[{'svr__C': [10.0, 30.0, ...], 'svr__kernel': ['linear']}, {'svr__C': [1.0, 3.0, ...], 'svr__gamma': [0.01, 0.03, ...], 'svr__kernel': ['rbf']}]"
,scoring,'neg_root_mean_squared_error'
,n_jobs,
,refit,True
,cv,3
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('bedrooms', ...), ('rooms_per_house', ...), ...]"
,remainder,Pipeline(step...ardScaler())])
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...t 0x11ae84680>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...t 0x11ae84a40>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...t 0x11ae84680>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...t 0x11ae84a40>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...t 0x11ae84680>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...t 0x11ae84a40>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<ufunc 'log'>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,'one-to-one'
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_clusters,10.0
,gamma,1.0
,random_state,42.0

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,C,10000.0
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [75]:
housing_predictions = grid_search.predict(housing.iloc[:5])
housing_predictions[:5]

array([254014.07049211, 332683.64627887, 109831.47471049, 101464.4792938 ,
       306739.39968746])

In [76]:
svr_grid_search_rmse = -grid_search.best_score_
svr_grid_search_rmse

np.float64(70059.92772805203)

In [77]:
grid_search.best_params_

{'svr__C': 10000.0, 'svr__kernel': 'linear'}

Best parameter value of C was 10,000 which was the maximum, we should try again with higher C values.