In [1]:
import os
import tarfile
import urllib

import numpy as np
import pandas as pd

In [2]:
DOWNLOAD_ROOT = 'https://raw.githubusercontent.com/ageron/handson-ml2/master/'
HOUSING_PATH = os.path.join('datasets', 'housing')
HOUSING_URL = DOWNLOAD_ROOT + 'datasets/housing/housing.tgz'

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, 'housing.tgz')
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()
    
fetch_housing_data()

In [3]:
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, 'housing.csv')
    return pd.read_csv(csv_path)

In [4]:
housing = load_housing_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
housing['income_cat'] = pd.cut(housing['median_income'],
                              bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                              labels=[1, 2, 3, 4, 5])

In [6]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in split.split(housing, housing['income_cat']):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [7]:
housing = strat_train_set.drop('median_house_value', axis=1)
housing_labels = strat_train_set['median_house_value'].copy()

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedroom_per_room=True):
        self.add_bedroom_per_room = add_bedroom_per_room
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedroom_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

housing_num = housing.drop('ocean_proximity', axis=1)

num_attribs = list(housing_num)
cat_attribs = ['ocean_proximity']

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)

In [11]:
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from scipy.stats import expon, reciprocal

param_random = {
    'kernel': ['linear', 'rbf'],
    'C': reciprocal(20, 200000),
    'gamma': expon(scale=1.0),
}

svm_reg = SVR()

random_search = RandomizedSearchCV(svm_reg, param_random,
                                  n_iter=50, cv=5, scoring='neg_mean_squared_error',
                                  verbose=3, random_state=42)

random_search.fit(housing_prepared, housing_labels)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] C=629.782329591372, gamma=3.010121430917521, kernel=linear ......


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  C=629.782329591372, gamma=3.010121430917521, kernel=linear, score=-4581680769.219, total=   4.5s
[CV] C=629.782329591372, gamma=3.010121430917521, kernel=linear ......


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.5s remaining:    0.0s


[CV]  C=629.782329591372, gamma=3.010121430917521, kernel=linear, score=-5353877784.160, total=   4.4s
[CV] C=629.782329591372, gamma=3.010121430917521, kernel=linear ......


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    9.0s remaining:    0.0s


[CV]  C=629.782329591372, gamma=3.010121430917521, kernel=linear, score=-5082055142.489, total=   4.6s
[CV] C=629.782329591372, gamma=3.010121430917521, kernel=linear ......
[CV]  C=629.782329591372, gamma=3.010121430917521, kernel=linear, score=-4642150688.223, total=   4.6s
[CV] C=629.782329591372, gamma=3.010121430917521, kernel=linear ......
[CV]  C=629.782329591372, gamma=3.010121430917521, kernel=linear, score=-5071771849.711, total=   4.3s
[CV] C=26290.206464300216, gamma=0.9084469696321253, kernel=rbf ......
[CV]  C=26290.206464300216, gamma=0.9084469696321253, kernel=rbf, score=-4193101125.538, total=   8.5s
[CV] C=26290.206464300216, gamma=0.9084469696321253, kernel=rbf ......
[CV]  C=26290.206464300216, gamma=0.9084469696321253, kernel=rbf, score=-4695781329.599, total=   8.0s
[CV] C=26290.206464300216, gamma=0.9084469696321253, kernel=rbf ......
[CV]  C=26290.206464300216, gamma=0.9084469696321253, kernel=rbf, score=-4636206819.696, total=   8.1s
[CV] C=26290.206464300216, 

[CV]  C=157055.10989448498, gamma=0.26497040005002437, kernel=rbf, score=-3181803200.746, total=  23.0s
[CV] C=27652.464358739708, gamma=0.2227358621286903, kernel=linear ...
[CV]  C=27652.464358739708, gamma=0.2227358621286903, kernel=linear, score=-4569030183.225, total=   9.5s
[CV] C=27652.464358739708, gamma=0.2227358621286903, kernel=linear ...
[CV]  C=27652.464358739708, gamma=0.2227358621286903, kernel=linear, score=-5472768728.288, total=  11.5s
[CV] C=27652.464358739708, gamma=0.2227358621286903, kernel=linear ...
[CV]  C=27652.464358739708, gamma=0.2227358621286903, kernel=linear, score=-5053619942.060, total=   9.7s
[CV] C=27652.464358739708, gamma=0.2227358621286903, kernel=linear ...
[CV]  C=27652.464358739708, gamma=0.2227358621286903, kernel=linear, score=-4639884017.501, total=   9.5s
[CV] C=27652.464358739708, gamma=0.2227358621286903, kernel=linear ...
[CV]  C=27652.464358739708, gamma=0.2227358621286903, kernel=linear, score=-5019057933.489, total=   8.6s
[CV] C=1713

[CV]  C=135.76775824842434, gamma=0.838636245624803, kernel=linear, score=-5308467287.676, total=   4.2s
[CV] C=135.76775824842434, gamma=0.838636245624803, kernel=linear ....
[CV]  C=135.76775824842434, gamma=0.838636245624803, kernel=linear, score=-5185654415.463, total=   4.2s
[CV] C=135.76775824842434, gamma=0.838636245624803, kernel=linear ....
[CV]  C=135.76775824842434, gamma=0.838636245624803, kernel=linear, score=-4727689049.557, total=   4.2s
[CV] C=135.76775824842434, gamma=0.838636245624803, kernel=linear ....
[CV]  C=135.76775824842434, gamma=0.838636245624803, kernel=linear, score=-5220672613.333, total=   4.1s
[CV] C=151136.20282548846, gamma=1.4922453771381408, kernel=rbf ......
[CV]  C=151136.20282548846, gamma=1.4922453771381408, kernel=rbf, score=-3972229798.317, total= 1.2min
[CV] C=151136.20282548846, gamma=1.4922453771381408, kernel=rbf ......
[CV]  C=151136.20282548846, gamma=1.4922453771381408, kernel=rbf, score=-4370699734.925, total= 1.3min
[CV] C=151136.20282

[CV]  C=24547.601975705915, gamma=0.22153944050588595, kernel=rbf, score=-3079747365.675, total=   6.8s
[CV] C=24547.601975705915, gamma=0.22153944050588595, kernel=rbf .....
[CV]  C=24547.601975705915, gamma=0.22153944050588595, kernel=rbf, score=-3513156378.744, total=   6.9s
[CV] C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf ......
[CV]  C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf, score=-12367980763.608, total=   6.6s
[CV] C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf ......
[CV]  C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf, score=-13072741709.100, total=   6.5s
[CV] C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf ......
[CV]  C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf, score=-13413901819.905, total=   6.6s
[CV] C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf ......
[CV]  C=22.76927941060928, gamma=0.22169760231351215, kernel=rbf, score=-13308105291.437, total=   6.5s
[CV] C=22.76927941060

[CV]  C=1888.9148509967113, gamma=0.739678838777267, kernel=linear, score=-4572909932.442, total=   4.6s
[CV] C=1888.9148509967113, gamma=0.739678838777267, kernel=linear ....
[CV]  C=1888.9148509967113, gamma=0.739678838777267, kernel=linear, score=-5419481240.038, total=   4.6s
[CV] C=1888.9148509967113, gamma=0.739678838777267, kernel=linear ....
[CV]  C=1888.9148509967113, gamma=0.739678838777267, kernel=linear, score=-5068263308.297, total=   4.6s
[CV] C=1888.9148509967113, gamma=0.739678838777267, kernel=linear ....
[CV]  C=1888.9148509967113, gamma=0.739678838777267, kernel=linear, score=-4638817105.971, total=   4.7s
[CV] C=1888.9148509967113, gamma=0.739678838777267, kernel=linear ....
[CV]  C=1888.9148509967113, gamma=0.739678838777267, kernel=linear, score=-5041831922.270, total=   4.6s
[CV] C=55.53838911232773, gamma=0.578634378499143, kernel=linear .....
[CV]  C=55.53838911232773, gamma=0.578634378499143, kernel=linear, score=-4906778532.636, total=   4.2s
[CV] C=55.538389

[CV]  C=926.9787684096649, gamma=2.147979593060577, kernel=rbf, score=-13176469127.866, total=   6.4s
[CV] C=926.9787684096649, gamma=2.147979593060577, kernel=rbf ........
[CV]  C=926.9787684096649, gamma=2.147979593060577, kernel=rbf, score=-13070558165.185, total=   6.4s
[CV] C=926.9787684096649, gamma=2.147979593060577, kernel=rbf ........
[CV]  C=926.9787684096649, gamma=2.147979593060577, kernel=rbf, score=-13011527735.717, total=   6.5s
[CV] C=33946.157064934, gamma=2.2642426492862313, kernel=linear ......
[CV]  C=33946.157064934, gamma=2.2642426492862313, kernel=linear, score=-4569386012.376, total=  13.9s
[CV] C=33946.157064934, gamma=2.2642426492862313, kernel=linear ......
[CV]  C=33946.157064934, gamma=2.2642426492862313, kernel=linear, score=-5474686344.138, total=  10.6s
[CV] C=33946.157064934, gamma=2.2642426492862313, kernel=linear ......
[CV]  C=33946.157064934, gamma=2.2642426492862313, kernel=linear, score=-5052485087.449, total=  10.7s
[CV] C=33946.157064934, gamma=

[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed: 42.7min finished


RandomizedSearchCV(cv=5, estimator=SVR(), n_iter=50,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fd34b071e20>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fd34c8a84f0>,
                                        'kernel': ['linear', 'rbf']},
                   random_state=42, scoring='neg_mean_squared_error',
                   verbose=3)

In [12]:
def indices_of_top_k(arr, k):
    return np.sort(np.argpartition(np.array(arr), -k)[-k:])

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k
        
    def fit(self, X, y=None):
        self.feature_indices_ = indices_of_top_k(self.feature_importances, self.k)
        return self
    
    def transform(self, X):
        return X[:, self.feature_indices]