In [34]:
import pandas as pd
import numpy as np

with open('../datasets/housing/housing.csv') as f:
    data  = pd.read_csv(f)

In [35]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

### Estratificación según ingreso medio

In [36]:
def stratified_data_split(data):
    """ hace un split que refleja la realidad dando iguales proporciones en la muestra sobre un atributo """
    data['income_category'] = np.ceil(data['median_income']/1.5)
    data['income_category'].where(data['income_category'] < 5, 5.0, inplace=True)
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_index, test_index = next(split.split(data, data['income_category']))
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]
    strat_test_set.drop(['income_category'], axis=1, inplace=True)
    strat_train_set.drop(['income_category'], axis=1, inplace=True)
    return strat_train_set, strat_test_set

### reemplazando valores nulos

In [37]:
# relleno los valores nulos con la mediana, pero también hay otras posibildades

from sklearn.preprocessing import Imputer

imputer = Imputer(strategy="median")

# tiro todas las columnas no numericas porque no puedo calcular la mediana en ellas
housing_num = housing.drop('ocean_proximity',axis = 1)
imputer.fit(housing_num)

Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)

In [38]:
#imputer.statistics_

X = imputer.transform(housing_num)
# X es un array numpy, lo vuelvo a poner en un data frame
housing_tr = pd.DataFrame(X, columns=housing_num.columns)


### encodeando categorias a atributos numericos

In [39]:
#To combine the two steps in one we have a LabelBinarizer
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer(sparse_output=True)
housing_cat = housing['ocean_proximity']
housing_cat_encoded_1hot = encoder.fit_transform(housing_cat)


### derivando atributos con mejor correlación

In [40]:
# puedo hacer un Transformer que me agregue atributos, y lo puedo poner luego en un pipeline
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

# Base Estimator me da un get_params y set_params
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
            self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix]/X[:, household_ix]
        population_per_household = X[:, population_ix]/X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix]/X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
attr_adder = CombinedAttributesAdder(False)
housing_extra_attr = attr_adder.transform(housing.values)

### pipeline

In [41]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """ handles pandas dataframes and returns columns as numpy arrays. There is also a sklearn-pandas egg, and it
    may be that something is added to sklearn in the future as ColumnTransfrmer"""
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [42]:
# combinando todo en pipelines combinadas
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler


num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([('selector', DataFrameSelector(num_attribs)),
                         ('imputer', Imputer(strategy="median")),
                        ('attribs_adder', CombinedAttributesAdder()),
                        ('std_scaleer', StandardScaler())])
cat_pipeline = Pipeline([('selector', DataFrameSelector(cat_attribs)),
                         ('label_binarizer', LabelBinarizer())])

full_pipeline = FeatureUnion(transformer_list = [('num_pipeline', num_pipeline),
                                                ('cat_pipeline', cat_pipeline)])



### Preparando el conjunto de datos

In [43]:
from sklearn.linear_model import LinearRegression
linear_reg = LinearRegression()
linear_reg.fit(housing_prepared, housing_labels)


# empizo de nuevo haciendo un split
strat_train_set, strat_test_set = stratified_data_split(data)
housing_labels = strat_train_set['median_house_value'].copy()
housing = strat_train_set.drop('median_house_value',axis=1)

housing_prepared = full_pipeline.fit_transform(housing)

Una vez que probamos muchos modelos podemos hacer una shortlist con los que mejor funcionan y empezar a tunear los parametros. Para tunear estos parametros hay varios metodos en sklearn

### Sklearn - RandomizedSearchCV:
  Le podemos decir que parametros queremos probar con que valores y prueba todas las combinaciones haciend cross validatoin



In [44]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=8),
    }

forest_reg = RandomForestRegressor(random_state=42)


forest_reg = RandomForestRegressor()
grid_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)


grid_search.fit(housing_prepared, housing_labels)


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f00cef699e8>, 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f00cef69828>},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score=True, scoring='neg_mean_squared_error',
          verbose=0)

In [45]:
grid_search.best_params_

{'max_features': 7, 'n_estimators': 122}

In [46]:
grid_search.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=7, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=122, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [47]:
cv = grid_search.cv_results_
for mean_score, params in zip(cv["mean_test_score"], cv["params"]):
    print(np.sqrt(-mean_score), params)

49269.7823995 {'n_estimators': 180, 'max_features': 7}
51380.9329938 {'n_estimators': 15, 'max_features': 5}
50567.547672 {'n_estimators': 72, 'max_features': 3}
50828.9222 {'n_estimators': 21, 'max_features': 5}
49205.8400836 {'n_estimators': 122, 'max_features': 7}
50450.3206934 {'n_estimators': 75, 'max_features': 3}
50487.2298794 {'n_estimators': 88, 'max_features': 3}
49492.7931153 {'n_estimators': 100, 'max_features': 5}
50320.355334 {'n_estimators': 150, 'max_features': 3}
64960.9778768 {'n_estimators': 2, 'max_features': 5}


### Inspeccionar la importancia de los features

In [48]:
extra_attr = ["rooms_per_hhold", "pop_per_hold","bedrooms_per_room"]
cat_one_hot_attr = list(encoder.classes_)
attrs = num_attribs + extra_attr + cat_one_hot_attr

feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances= sorted(zip(feature_importances, attrs), reverse=True)

[(0.35738452872586113, 'median_income'),
 (0.15020733899777669, 'INLAND'),
 (0.11101973018843508, 'pop_per_hold'),
 (0.072438106771196442, 'longitude'),
 (0.070637751814209296, 'bedrooms_per_room'),
 (0.066334953324289403, 'latitude'),
 (0.047923884871605683, 'rooms_per_hhold'),
 (0.042394990316477921, 'housing_median_age'),
 (0.016489524332660405, 'total_rooms'),
 (0.015928719496315455, 'total_bedrooms'),
 (0.015715319099229488, 'population'),
 (0.014886219474101148, 'households'),
 (0.011142833729596461, '<1H OCEAN'),
 (0.0048294694706388797, 'NEAR OCEAN'),
 (0.0026070496236398084, 'NEAR BAY'),
 (5.9579763966874252e-05, 'ISLAND')]

### Evaluar el modelo final en el test set

In [52]:
from sklearn.metrics import mean_squared_error

final_model = grid_search.best_estimator_


X_test = strat_test_set.drop("median_house_value", axis=1)
Y_test = strat_test_set["median_house_value"].copy()

X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print(final_rmse)


47028.197084


### SVM Regressor

In [57]:
from sklearn.svm import SVR
from scipy.stats import expon, reciprocal

param_distribs = {
        'kernel': ['linear', 'rbf'],
        'C': reciprocal(20, 200000),
        'gamma': expon(scale=1.0),
    }

forest_reg = SVR()
grid_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=50, cv=5, scoring='neg_mean_squared_error',verbose=2, n_jobs=4, random_state=42)


grid_search.fit(housing_prepared, housing_labels)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] gamma=3.01012143092, C=629.782329591, kernel=linear .............
[CV] gamma=3.01012143092, C=629.782329591, kernel=linear .............
[CV] gamma=3.01012143092, C=629.782329591, kernel=linear .............
[CV] gamma=3.01012143092, C=629.782329591, kernel=linear .............
[CV]  gamma=3.01012143092, C=629.782329591, kernel=linear, total=  19.5s
[CV] gamma=3.01012143092, C=629.782329591, kernel=linear .............
[CV]  gamma=3.01012143092, C=629.782329591, kernel=linear, total=  20.1s
[CV] gamma=0.908446969632, C=26290.2064643, kernel=rbf ...............
[CV]  gamma=3.01012143092, C=629.782329591, kernel=linear, total=  20.4s
[CV] gamma=0.908446969632, C=26290.2064643, kernel=rbf ...............
[CV]  gamma=3.01012143092, C=629.782329591, kernel=linear, total=  20.6s
[CV] gamma=0.908446969632, C=26290.2064643, kernel=rbf ...............
[CV]  gamma=3.01012143092, C=629.782329591, kernel=linear, total=  19.7s
[CV] 

[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  8.5min


[CV] . gamma=0.36275372946, C=108.304882388, kernel=rbf, total=  47.5s
[CV] gamma=0.0233325235983, C=21.3449536726, kernel=linear ...........
[CV]  gamma=0.0233325235983, C=21.3449536726, kernel=linear, total=  29.8s
[CV] gamma=0.0233325235983, C=21.3449536726, kernel=linear ...........
[CV] . gamma=0.36275372946, C=108.304882388, kernel=rbf, total=  46.4s
[CV] gamma=0.0233325235983, C=21.3449536726, kernel=linear ...........
[CV]  gamma=0.0233325235983, C=21.3449536726, kernel=linear, total=  27.4s
[CV] gamma=0.150234528727, C=5603.27031743, kernel=rbf ...............
[CV]  gamma=0.0233325235983, C=21.3449536726, kernel=linear, total=  30.3s
[CV] gamma=0.150234528727, C=5603.27031743, kernel=rbf ...............
[CV]  gamma=0.0233325235983, C=21.3449536726, kernel=linear, total=  30.8s
[CV] gamma=0.150234528727, C=5603.27031743, kernel=rbf ...............
[CV]  gamma=0.0233325235983, C=21.3449536726, kernel=linear, total=  30.3s
[CV] gamma=0.150234528727, C=5603.27031743, kernel=rbf ..

[CV] gamma=0.373546581658, C=8935.50563595, kernel=rbf ...............
[CV]  gamma=0.373546581658, C=8935.50563595, kernel=rbf, total=  43.8s
[CV] gamma=0.838636245625, C=135.767758248, kernel=linear ............
[CV]  gamma=0.373546581658, C=8935.50563595, kernel=rbf, total=  45.5s
[CV] gamma=0.838636245625, C=135.767758248, kernel=linear ............
[CV]  gamma=0.838636245625, C=135.767758248, kernel=linear, total=  31.6s
[CV] gamma=0.838636245625, C=135.767758248, kernel=linear ............
[CV]  gamma=0.373546581658, C=8935.50563595, kernel=rbf, total=  45.7s
[CV] gamma=0.838636245625, C=135.767758248, kernel=linear ............
[CV]  gamma=0.838636245625, C=135.767758248, kernel=linear, total=  35.0s
[CV] gamma=0.838636245625, C=135.767758248, kernel=linear ............
[CV]  gamma=0.373546581658, C=8935.50563595, kernel=rbf, total= 1.0min
[CV] gamma=1.49224537714, C=151136.202825, kernel=rbf ................
[CV]  gamma=0.838636245625, C=135.767758248, kernel=linear, total=  30.

[CV] gamma=1.47521452604, C=16483.8505298, kernel=linear .............
[CV]  gamma=0.221697602314, C=22.7692794106, kernel=rbf, total=  42.2s
[CV] gamma=1.47521452604, C=16483.8505298, kernel=linear .............
[CV]  gamma=1.47521452604, C=16483.8505298, kernel=linear, total=  37.7s
[CV] gamma=1.47521452604, C=16483.8505298, kernel=linear .............
[CV]  gamma=1.47521452604, C=16483.8505298, kernel=linear, total=  41.2s
[CV] gamma=1.47521452604, C=16483.8505298, kernel=linear .............
[CV]  gamma=0.221697602314, C=22.7692794106, kernel=rbf, total=  41.9s
[CV] gamma=1.05290408458, C=101445.668813, kernel=rbf ................
[CV]  gamma=1.47521452604, C=16483.8505298, kernel=linear, total=  48.0s
[CV] gamma=1.05290408458, C=101445.668813, kernel=rbf ................
[CV]  gamma=1.47521452604, C=16483.8505298, kernel=linear, total=  42.5s
[CV] gamma=1.05290408458, C=101445.668813, kernel=rbf ................


[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed: 55.7min


[CV]  gamma=1.47521452604, C=16483.8505298, kernel=linear, total=  49.9s
[CV] gamma=1.05290408458, C=101445.668813, kernel=rbf ................
[CV] . gamma=1.05290408458, C=101445.668813, kernel=rbf, total= 3.2min
[CV] gamma=1.05290408458, C=101445.668813, kernel=rbf ................
[CV] . gamma=1.05290408458, C=101445.668813, kernel=rbf, total= 2.9min
[CV] gamma=0.976301191712, C=56681.8085903, kernel=rbf ...............
[CV] . gamma=1.05290408458, C=101445.668813, kernel=rbf, total= 3.8min
[CV] gamma=0.976301191712, C=56681.8085903, kernel=rbf ...............
[CV] . gamma=1.05290408458, C=101445.668813, kernel=rbf, total= 4.0min
[CV] gamma=0.976301191712, C=56681.8085903, kernel=rbf ...............
[CV]  gamma=0.976301191712, C=56681.8085903, kernel=rbf, total= 1.2min
[CV] gamma=0.976301191712, C=56681.8085903, kernel=rbf ...............
[CV]  gamma=0.976301191712, C=56681.8085903, kernel=rbf, total= 1.3min
[CV] gamma=0.976301191712, C=56681.8085903, kernel=rbf ...............
[CV]

[CV] gamma=2.86213836765, C=129.800060414, kernel=linear .............
[CV]  gamma=0.528281974883, C=198.700478181, kernel=linear, total=  36.0s
[CV] gamma=2.86213836765, C=129.800060414, kernel=linear .............
[CV]  gamma=0.528281974883, C=198.700478181, kernel=linear, total=  35.7s
[CV] gamma=2.86213836765, C=129.800060414, kernel=linear .............
[CV]  gamma=0.528281974883, C=198.700478181, kernel=linear, total=  34.8s
[CV] gamma=2.86213836765, C=129.800060414, kernel=linear .............
[CV]  gamma=2.86213836765, C=129.800060414, kernel=linear, total=  34.6s
[CV] gamma=2.86213836765, C=129.800060414, kernel=linear .............
[CV]  gamma=2.86213836765, C=129.800060414, kernel=linear, total=  33.5s
[CV] gamma=0.1758083585, C=288.426929959, kernel=rbf .................
[CV]  gamma=2.86213836765, C=129.800060414, kernel=linear, total=  33.9s
[CV] gamma=0.1758083585, C=288.426929959, kernel=rbf .................
[CV]  gamma=2.86213836765, C=129.800060414, kernel=linear, tot

[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed: 87.4min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
          fit_params={}, iid=True, n_iter=50, n_jobs=4,
          param_distributions={'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f00d835b4e0>, 'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f00d8313f28>, 'kernel': ['linear', 'rbf']},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score=True, scoring='neg_mean_squared_error',
          verbose=2)

In [58]:
final_model = grid_search.best_estimator_



X_test = strat_test_set.drop("median_house_value", axis=1)
Y_test = strat_test_set["median_house_value"].copy()

X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print(final_rmse)


52490.037939


In [62]:
negative_mse = grid_search.best_score_
rmse = np.sqrt(-negative_mse)
print(rmse)
print(grid_search.best_params_)
print(grid_search.best_estimator_)

54767.990537
{'gamma': 0.26497040005002437, 'C': 157055.10989448498, 'kernel': 'rbf'}
SVR(C=157055.10989448498, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma=0.26497040005002437, kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)


In [59]:
from sklearn.externals import joblib

joblib.dump(final_model, 'final_model_data_cleaning.pkl') 
final_model = joblib.load('final_model_data_cleaning.pkl') 

['final_model_data_cleaning.pkl']

####Ej 3. Select only most important attributes

In [213]:
feature_importances = np.array([  7.33442355e-02,   6.29090705e-02,   4.11437985e-02,
         1.46726854e-02,   1.41064835e-02,   1.48742809e-02,
         1.42575993e-02,   3.66158981e-01,   5.64191792e-02,
         1.08792957e-01,   5.33510773e-02,   1.03114883e-02,
         1.64780994e-01,   6.02803867e-05,   1.96041560e-03,
         2.85647464e-03])
print(feature_importances)
def indices_of_top_k(arr, k):
    return np.sort(np.argpartition(np.array(arr), -k)[-k:])

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_importances=[], k=3):
            self.k = k
            self.feature_importances = feature_importances
    def fit(self, X, y=None):
        print("feature_importances:", self.feature_importances)
        self.feature_indices_ = indices_of_top_k(self.feature_importances, self.k)
        return self
    def transform(self, X):
        return X[:, self.feature_indices_]
    
class SupervisionFriendlyLabelBinarizer(LabelBinarizer):
    """LabelBinarizer's fit_transform() method only accepts one parameter y (because it was meant for labels, not predictors), so it does not work in a pipeline where the final estimator is a supervised estimator because in this case its fit() method takes two parameters X and y.
       This hack creates a supervision-friendly LabelBinarizer."""
    def fit_transform(self, X, y=None):
        return super(SupervisionFriendlyLabelBinarizer, self).fit_transform(X)
    

num_pipeline = Pipeline([('selector', DataFrameSelector(num_attribs)),
                         ('imputer', Imputer(strategy="median")),
                        ('attribs_adder', CombinedAttributesAdder()),
                        ('std_scaleer', StandardScaler()),])
cat_pipeline = Pipeline([('selector', DataFrameSelector(cat_attribs)),
                         ('label_binarizer', SupervisionFriendlyLabelBinarizer()),])

full_pipeline = FeatureUnion(transformer_list = [('num_pipeline', num_pipeline),
                                                ('cat_pipeline', cat_pipeline)])


[  7.33442355e-02   6.29090705e-02   4.11437985e-02   1.46726854e-02
   1.41064835e-02   1.48742809e-02   1.42575993e-02   3.66158981e-01
   5.64191792e-02   1.08792957e-01   5.33510773e-02   1.03114883e-02
   1.64780994e-01   6.02803867e-05   1.96041560e-03   2.85647464e-03]


### 4. Single pipeline including prediction

In [214]:
prepare_select_and_predict_pipeline = Pipeline([
    ('preparation', full_pipeline),
      ('feature_selection', TopFeatureSelector(feature_importances, k=3)),
    ('svm_reg', SVR(**grid_search.best_params_))
])
prepare_select_and_predict_pipeline.fit(housing, housing_labels)


feature_importances: [  7.33442355e-02   6.29090705e-02   4.11437985e-02   1.46726854e-02
   1.41064835e-02   1.48742809e-02   1.42575993e-02   3.66158981e-01
   5.64191792e-02   1.08792957e-01   5.33510773e-02   1.03114883e-02
   1.64780994e-01   6.02803867e-05   1.96041560e-03   2.85647464e-03]


Pipeline(steps=[('preparation', FeatureUnion(n_jobs=1,
       transformer_list=[('num_pipeline', Pipeline(steps=[('selector', DataFrameSelector(attribute_names=['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income'])), ('imputer', Imputer(... gamma=0.26497040005002437, kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False))])

In [215]:
### configuring a pipeline params
from sklearn.model_selection import GridSearchCV
param_grid = [
        {'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],
         'feature_selection__k': [3]}
]

grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=5,
                                scoring='neg_mean_squared_error', verbose=2, n_jobs=4)
grid_search_prep.fit(housing, housing_labels)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=mean 
[CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=mean 
[CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=mean 
[CV] feature_selection__k=3, preparation__num_pipeline__imputer__strategy=mean 
feature_importances: [  7.33442355e-02   6.29090705e-02   4.11437985e-02   1.46726854e-02
   1.41064835e-02   1.48742809e-02   1.42575993e-02   3.66158981e-01
   5.64191792e-02   1.08792957e-01   5.33510773e-02   1.03114883e-02
   1.64780994e-01   6.02803867e-05   1.96041560e-03   2.85647464e-03]feature_importances: [  7.33442355e-02   6.29090705e-02   4.11437985e-02   1.46726854e-02
   1.41064835e-02   1.48742809e-02   1.42575993e-02   3.66158981e-01
   5.64191792e-02   1.08792957e-01   5.33510773e-02   1.03114883e-02
   1.64780994e-01   6.02803867e-05   1.96041560e-03   2.85647464e-03]
feature_importances: [  7

[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:  2.8min finished


feature_importances: [  7.33442355e-02   6.29090705e-02   4.11437985e-02   1.46726854e-02
   1.41064835e-02   1.48742809e-02   1.42575993e-02   3.66158981e-01
   5.64191792e-02   1.08792957e-01   5.33510773e-02   1.03114883e-02
   1.64780994e-01   6.02803867e-05   1.96041560e-03   2.85647464e-03]


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('preparation', FeatureUnion(n_jobs=1,
       transformer_list=[('num_pipeline', Pipeline(steps=[('selector', DataFrameSelector(attribute_names=['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income'])), ('imputer', Imputer(... gamma=0.26497040005002437, kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params={}, iid=True, n_jobs=4,
       param_grid=[{'feature_selection__k': [3], 'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=2)