In [1]:
import tarfile
import argparse
import os
import numpy as np
import pandas as pd
import requests
import warnings
warnings.filterwarnings("ignore")

import sklearn
from sklearn.svm import SVR
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import RandomizedSearchCV, StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

random_state = 33

In [2]:
PATH = "../data/raw"
OUTPUT_PATH = "../data/processed"

In [3]:
DOWNLOAD_ROOT = "http://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join(os.getcwd(), PATH)
OUTPUT_PATH = os.path.join(os.getcwd(), OUTPUT_PATH)
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

In [4]:
def fetch_housing_data(housing_url: str, housing_path: str):
    """Download the housing data

    Parameters
    ----------

    housing_url : str
        Url of the dataset

    housing_path : str
        Path where the dataset is stored
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")

    r = requests.get(housing_url, timeout=300)
    with open(tgz_path, "wb") as f:
        f.write(r.content)

    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()
    
    
def load_housing_data(housing_path: str):
    """Download the housing data

    Parameters
    ----------

    housing_path : str
        Path where the dataset is stored

    Returns
    -------
    df: pd.DataFrame
        It is a pandas Dataframe with rows x columns
    """
    csv_path = os.path.join(housing_path, "housing.csv")

    return pd.read_csv(csv_path)

In [5]:
fetch_housing_data(housing_url=HOUSING_URL, housing_path= HOUSING_PATH)
housing = load_housing_data(HOUSING_PATH)

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=[1, 2, 3, 4, 5],
)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

strat_train_set.to_csv(
    os.path.join(OUTPUT_PATH, "strat_train_set.csv"), index=False
)
strat_test_set.to_csv(os.path.join(OUTPUT_PATH, "strat_test_set.csv"), index=False)

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

train_set.to_csv(os.path.join(OUTPUT_PATH, "train_set.csv"), index=False)
test_set.to_csv(os.path.join(OUTPUT_PATH, "test_set.csv"), index=False)


labels =  housing["median_house_value"]
housing = housing.drop("median_house_value", axis=1)

In [6]:
housing.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
income_cat              0
dtype: int64

In [7]:
# the pre-processed data is already stored inside data/processed

In [7]:
housing_prepared = pd.read_csv("../data/processed/housing_prepared.csv")
housing_labels = pd.read_csv("../data/processed/housing_labels.csv")

print(housing_prepared.shape, housing_labels.shape)

(16512, 15) (16512, 1)


In [8]:
housing_prepared

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,5.485836,0.205784,3.168555,1,0,0,0
1,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,6.927083,0.160714,2.623698,0,0,0,1
2,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.8750,5.393333,0.191595,2.223333,1,0,0,0
3,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,3.886128,0.276505,1.859213,0,0,0,1
4,-118.70,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,6.096552,0.182692,3.167241,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,-117.07,33.03,14.0,6665.0,1231.0,2026.0,1001.0,5.0900,6.658342,0.184696,2.023976,0,0,0,0
16508,-121.42,38.51,15.0,7901.0,1422.0,4769.0,1418.0,2.8139,5.571932,0.179977,3.363188,1,0,0,0
16509,-122.72,38.44,48.0,707.0,166.0,458.0,172.0,3.1797,4.110465,0.234795,2.662791,0,0,0,0
16510,-122.70,38.31,14.0,3155.0,580.0,1208.0,501.0,4.1964,6.297405,0.183835,2.411178,0,0,0,0


In [9]:
housing_labels = np.asarray(housing_labels).reshape(housing_labels.shape[0])

### Support Vector Regressor

In [10]:
estimator = sklearn.svm.SVR()

param_grid = [
        {'kernel': ['linear'], 'C': [1.0, 3.0, 10., 30]},
        {'kernel': ['rbf'], 'C': [1.0, 3.0, 10., 30],
         'gamma': [0.01, 0.03, 0.1, 0.3]},
    ]

In [11]:
search = sklearn.model_selection.GridSearchCV(estimator, param_grid, scoring='neg_mean_squared_error', n_jobs=None, refit=True, cv=2, 
                                              verbose=2, pre_dispatch='2*n_jobs', error_score=np.nan, return_train_score=True)
search.fit(housing_prepared, housing_labels)

Fitting 2 folds for each of 20 candidates, totalling 40 fits
[CV] END ...............................C=1.0, kernel=linear; total time=  11.1s
[CV] END ...............................C=1.0, kernel=linear; total time=  11.9s
[CV] END ...............................C=3.0, kernel=linear; total time=  26.5s
[CV] END ...............................C=3.0, kernel=linear; total time=  23.6s
[CV] END ..............................C=10.0, kernel=linear; total time= 1.0min
[CV] END ..............................C=10.0, kernel=linear; total time= 1.1min
[CV] END ................................C=30, kernel=linear; total time= 4.5min
[CV] END ................................C=30, kernel=linear; total time= 3.3min
[CV] END ......................C=1.0, gamma=0.01, kernel=rbf; total time=   6.0s
[CV] END ......................C=1.0, gamma=0.01, kernel=rbf; total time=   6.0s
[CV] END ......................C=1.0, gamma=0.03, kernel=rbf; total time=   5.8s
[CV] END ......................C=1.0, gamma=0.03

In [12]:
search.best_estimator_

### Randomized Search CV

In [13]:
random_search = sklearn.model_selection.RandomizedSearchCV(estimator, param_grid, n_iter=10, scoring='neg_mean_squared_error', n_jobs=None, 
                                           refit=True, cv=2, verbose=2, pre_dispatch='2*n_jobs', random_state=random_state, 
                                           error_score=np.nan, return_train_score=True)

random_search.fit(housing_prepared, housing_labels)

Fitting 2 folds for each of 10 candidates, totalling 20 fits
[CV] END .......................C=30, gamma=0.03, kernel=rbf; total time=   4.1s
[CV] END .......................C=30, gamma=0.03, kernel=rbf; total time=   4.2s
[CV] END ......................C=3.0, gamma=0.01, kernel=rbf; total time=   4.4s
[CV] END ......................C=3.0, gamma=0.01, kernel=rbf; total time=   4.4s
[CV] END ......................C=10.0, gamma=0.3, kernel=rbf; total time=   4.1s
[CV] END ......................C=10.0, gamma=0.3, kernel=rbf; total time=   4.1s
[CV] END ........................C=30, gamma=0.1, kernel=rbf; total time=   4.1s
[CV] END ........................C=30, gamma=0.1, kernel=rbf; total time=   3.9s
[CV] END ......................C=1.0, gamma=0.03, kernel=rbf; total time=   4.3s
[CV] END ......................C=1.0, gamma=0.03, kernel=rbf; total time=   4.3s
[CV] END ...............................C=1.0, kernel=linear; total time=   9.6s
[CV] END ...............................C=1.0, k

In [14]:
random_search.best_estimator_

In [15]:
# model = SVR(C=3.0, kernel='linear')
# model.fit(housing_prepared, housing_labels)

In [16]:
# housing_

In [17]:
housing__ = housing.copy()
housing__.drop("ocean_proximity", inplace=True, axis=1)
column_names = housing__.copy().columns

In [18]:
SI = SimpleImputer(strategy="mean")
housing__ = SI.fit_transform(housing__)

In [19]:
rfr = RandomForestRegressor(random_state= random_state)
rfr.fit(housing__, labels)
feature_importances = rfr.feature_importances_

In [20]:
def indices_of_top_k(arr, k):
    return np.sort(np.argpartition(np.array(arr), -k)[-k:])

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k
    def fit(self, X, y=None):
        self.feature_indices_ = indices_of_top_k(self.feature_importances, self.k)
        return self
    def transform(self, X):
        if type(X) == pd.DataFrame:
            return X.iloc[:, self.feature_indices_]     
        else:
            return X[:, self.feature_indices_]     

In [21]:
k = 6
top_k_feature_indices = indices_of_top_k(feature_importances, k)
top_k_feature_indices

array([0, 1, 2, 4, 5, 7])

In [22]:
sorted(zip(feature_importances, column_names.to_list()), reverse=True)

[(0.5125263649773646, 'median_income'),
 (0.16398902857643427, 'longitude'),
 (0.15513341213575313, 'latitude'),
 (0.06377647537349658, 'housing_median_age'),
 (0.03387534325970874, 'population'),
 (0.02527431079973757, 'total_bedrooms'),
 (0.02433755920827869, 'total_rooms'),
 (0.02016329367583948, 'households'),
 (0.0009242119933869711, 'income_cat')]

Try adding a transformer in the preparation pipeline to select only the most important attributes.

In [23]:
k = 6
preparation_and_feature_selection_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="mean")),
    ('feature_selection', TopFeatureSelector(feature_importances, k))
])

In [24]:
housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit(housing__)

In [25]:
housing_prepared_top_k_features.transform(housing__)

array([[-1.2223e+02,  3.7880e+01,  4.1000e+01,  1.2900e+02,  3.2200e+02,
         8.3252e+00],
       [-1.2222e+02,  3.7860e+01,  2.1000e+01,  1.1060e+03,  2.4010e+03,
         8.3014e+00],
       [-1.2224e+02,  3.7850e+01,  5.2000e+01,  1.9000e+02,  4.9600e+02,
         7.2574e+00],
       ...,
       [-1.2122e+02,  3.9430e+01,  1.7000e+01,  4.8500e+02,  1.0070e+03,
         1.7000e+00],
       [-1.2132e+02,  3.9430e+01,  1.8000e+01,  4.0900e+02,  7.4100e+02,
         1.8672e+00],
       [-1.2124e+02,  3.9370e+01,  1.6000e+01,  6.1600e+02,  1.3870e+03,
         2.3886e+00]])

In [26]:
len(housing.columns)

10

In [27]:
housing_prepared_top_k_features

2nd Pipeline

In [28]:
housing_ = housing.copy()

housing_["rooms_per_household"] = (
    housing_["total_rooms"] / housing_["households"]
)
housing_["bedrooms_per_room"] = (
    housing_["total_bedrooms"] / housing_["total_rooms"]
)
housing_["population_per_household"] = (
    housing_["population"] / housing_["households"]
)

housing_ = housing_.join(pd.get_dummies(housing[["ocean_proximity"]], drop_first=True))
housing_.drop("ocean_proximity", inplace=True, axis=1)



In [29]:
SI = SimpleImputer(strategy="mean")
housing_ = SI.fit_transform(housing_)

In [30]:
rfr = RandomForestRegressor(random_state= random_state)
rfr.fit(housing_, labels)
feature_importances = rfr.feature_importances_

In [31]:
feature_importances

array([6.09947927e-02, 5.74575556e-02, 4.32611864e-02, 1.25015583e-02,
       1.19138601e-02, 1.14673664e-02, 1.05600835e-02, 4.74095613e-01,
       5.79357146e-04, 2.62099817e-02, 2.48643198e-02, 1.20366526e-01,
       1.41930612e-01, 1.50743929e-04, 6.98718263e-04, 2.94772516e-03])

Try creating a single pipeline that does the full data preparation plus the final prediction.

Refactor your ML code to use sklearn pipeline. Create a custom transformer for the new features (i.e. rooms_per_household, bedrooms_per_room, population_per_household) generated in your code

In [32]:
class CustomTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, features, labels = None):
        self.features = features
        self.labels = labels
        
    def fit(self, X, y=None):
        return self  # nothing else to do
    
    def transform(self, features):
        features["rooms_per_household"] = (
            features["total_rooms"] / features["households"]
        )
        features["bedrooms_per_room"] = (
            features["total_bedrooms"] / features["total_rooms"]
        )
        features["population_per_household"] = (
            features["population"] / features["households"]
        )

        if "ocean_proximity" in features.columns:
            features = features.join(pd.get_dummies(features[["ocean_proximity"]], drop_first=True))
            features.drop("ocean_proximity", inplace=True, axis=1)

        return features

In [33]:
k = 6
feature_eng = Pipeline([
    ('column_tranformation', CustomTransformer(housing, labels)),
    ('imputer', SimpleImputer(strategy="median")),
    ('feature_selection', TopFeatureSelector(feature_importances, k)),
])
model_new = Pipeline([
        ("feature", feature_eng),
        ('svm_reg', SVR(kernel= 'linear', C= 3.0)),
])


In [34]:
model_new.fit(housing, labels)

In [35]:
model_new.predict(housing)

array([336303.81526647, 320327.60732188, 313084.04237098, ...,
        84687.23120002,  92019.83559623, 104294.45324015])

In [36]:
param_grid = [{
    'feature__imputer__strategy': ['mean', 'median', 'most_frequent'],
    'feature__feature_selection__k': list(range(1, len(feature_importances) + 1))
}]

grid_search_prep = GridSearchCV(model_new, param_grid, cv=5,
                                scoring='neg_mean_squared_error', verbose=2)
grid_search_prep.fit(housing, labels)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END feature__feature_selection__k=1, feature__imputer__strategy=mean; total time=   4.7s
[CV] END feature__feature_selection__k=1, feature__imputer__strategy=mean; total time=   4.5s
[CV] END feature__feature_selection__k=1, feature__imputer__strategy=mean; total time=   5.1s
[CV] END feature__feature_selection__k=1, feature__imputer__strategy=mean; total time=   5.0s
[CV] END feature__feature_selection__k=1, feature__imputer__strategy=mean; total time=   5.1s
[CV] END feature__feature_selection__k=1, feature__imputer__strategy=median; total time=   5.0s
[CV] END feature__feature_selection__k=1, feature__imputer__strategy=median; total time=   5.3s
[CV] END feature__feature_selection__k=1, feature__imputer__strategy=median; total time=   5.1s
[CV] END feature__feature_selection__k=1, feature__imputer__strategy=median; total time=   5.2s
[CV] END feature__feature_selection__k=1, feature__imputer__strategy=median; total t

In [37]:
dir(grid_search_prep)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_feature_names',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_select_best_index',
 '_validate_data',
 '_validate_params',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'inverse_transform',
 'multimetric_',
 'n_fe

In [38]:
grid_search_prep.best_params_

{'feature__feature_selection__k': 1, 'feature__imputer__strategy': 'mean'}