# Prepare the Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
file_path = 'datasets/housing/housing.csv'

In [2]:
housing = pd.read_csv(file_path)
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
np.random.seed(42)
rnd_idx = np.random.permutation(len(housing))
test_idx = rnd_idx[:int(0.2*len(housing))]
train_idx = rnd_idx[int(0.2*len(housing)):]
housing_train, housing_test = housing.loc[train_idx], housing.loc[test_idx]

In [4]:
print("train_size:", len(housing_train), "test_size", len(housing_test))

train_size: 16512 test_size 4128


In [5]:
from sklearn.preprocessing import LabelBinarizer, Imputer, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribs):
        self.attribs = attribs
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribs].values

In [6]:
nums_attribs = ['longitude', 'latitude', 'housing_median_age',
               'total_rooms', 'total_bedrooms', 'population',
               'households', 'median_income']
cats_attribs = ['ocean_proximity']
num_pipe = Pipeline([
    ('selectors', DataFrameSelector(nums_attribs)),
    ('imputer', Imputer(strategy='median')),
    ('standard', StandardScaler())
])
cat_pipe = Pipeline([
    ('selectors', DataFrameSelector(cats_attribs)),
    ('labelBinarizer', LabelBinarizer()),
])
union_pipe = FeatureUnion(transformer_list=[
    ('num_pipe', num_pipe),
    ('cat_pipe', cat_pipe),
])

In [7]:
X_train, y_train= union_pipe.fit_transform(housing_train), housing_train['median_house_value'].values
X_test, y_test = union_pipe.fit_transform(housing_test), housing_test['median_house_value'].values

# 1 

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
param_grid = [
    {'kernel':['linear'], 'C':[0.1, 1, 10]},
    {'kernel':['rbf'], 'C':[0.1, 1, 10], 'gamma':[0.1, 1, 10]}
]
svr_clf = SVR()
grid_search = GridSearchCV(svr_clf, param_grid, cv=3,
                          scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
grid_search.fit(X_train[2000:], y_train[2000:])

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] C=0.1, kernel=linear ............................................
[CV] C=0.1, kernel=linear ............................................
[CV] C=0.1, kernel=linear ............................................
[CV] C=1, kernel=linear ..............................................
[CV] ............................. C=0.1, kernel=linear, total=   7.6s
[CV] C=1, kernel=linear ..............................................
[CV] ............................. C=0.1, kernel=linear, total=   7.5s
[CV] C=1, kernel=linear ..............................................
[CV] ............................. C=0.1, kernel=linear, total=   7.7s
[CV] C=10, kernel=linear .............................................
[CV] ............................... C=1, kernel=linear, total=   7.7s
[CV] C=10, kernel=linear .............................................
[CV] ............................... C=1, kernel=linear, total=   7.3s
[CV] ...........

[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  2.0min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'kernel': ['linear'], 'C': [0.1, 1, 10]}, {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=2)

In [9]:
grid_search.best_estimator_

SVR(C=10, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [10]:
from sklearn.metrics import mean_squared_error
y_pred = grid_search.best_estimator_.predict(X_test)

In [11]:
np.sqrt(mean_squared_error(y_pred, y_test))

82516.457765924773

# 2 

In [20]:
from sklearn.model_selection import RandomizedSearchCV
rand_search = RandomizedSearchCV(svr_clf, param_grid, cv=3,
                          scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)

In [None]:
rand_search.fit(X_train, y_train)

# 3 

In [22]:
cor_matrix = housing.corr()
cor_matrix['median_house_value'].sort_values(ascending=False)

median_house_value    1.000000
median_income         0.688075
total_rooms           0.134153
housing_median_age    0.105623
households            0.065843
total_bedrooms        0.049686
population           -0.024650
longitude            -0.045967
latitude             -0.144160
Name: median_house_value, dtype: float64

In [29]:
class TopKImportance(BaseEstimator, TransformerMixin):
    def __init__(self, importances, k):
        self.importances = importances
        self.k = k
    def fit(self, X, y=None):
        self.importance_indexes = self.importances[:self.k]
        return self
    def transform(self, X):
        return X[:, self.importance_indexes]

In [31]:
importances = [7, 3, 2, 6, 4, 5, 0, 1]
prepare_topkfeatues_pipe = Pipeline([
    ('union_pipeline', union_pipe),
    ('topkfeatures', TopKImportance(importances, 3))
])
X_train,y_train = prepare_topkfeatues_pipe.fit_transform(housing_train), housing_train['median_house_value'].values

In [32]:
X_train.shape

(16512, 3)

# 4

In [39]:
fit_predict_pipe = Pipeline([
    ('union', union_pipe),
    ('svm_reg', SVR(**grid_search.best_params_)),
])

In [40]:
X_train, y_train= union_pipe.transform(housing_train), housing_train['median_house_value'].values

In [None]:
fit_predict_pipe.fit(X_train, y_train)

# 5