In [168]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from scipy import stats

housing_data = pd.read_csv('downloads/housing.csv')

train_set, test_set = train_test_split(housing_data, test_size = 0.2, random_state = 42)

housing_data['income_cat'] = pd.cut(housing_data['median_income'], 
                                     bins =[0., 1.5, 3.0, 4.5, 6., 
                                            np.inf],labels=[1,2,3,4,5])

split = StratifiedShuffleSplit(n_splits=1, test_size = 0.2, random_state=42)

for train_index, test_index in split.split(housing_data, housing_data['income_cat']):
    strat_train_set = housing_data.loc[train_index]
    strat_test_set = housing_data.loc[test_index]

for set_ in (strat_train_set, strat_test_set):
    set_.drop('income_cat', axis=1, inplace=True)

housing_data = strat_test_set.copy()

housing_data["rooms_per_household"] = housing_data["total_rooms"] / housing_data['households']
housing_data["bedrooms_per_room"] = housing_data["total_bedrooms"] / housing_data['total_rooms']
housing_data["population_per_household"] = housing_data["population"] / housing_data["households"]

housing_data = strat_train_set.drop('median_house_value', axis=1)
housing_labels = strat_train_set['median_house_value'].copy()

imputer = SimpleImputer(strategy='median')
housing_num =housing_data.drop("ocean_proximity", axis = 1)
imputer.fit(housing_num)

imputer.statistics_
housing_num.median().values

X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

housing_cat = housing_data[['ocean_proximity']]

cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot.toarray()
cat_encoder.categories_

rooms_ix, bedrooms_ix, population_ix, households_ix = 3,4,5,6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):  
        self.add_bedrooms_per_room = add_bedrooms_per_room  
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing_data.values)

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('attr_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

housing_num_tr = num_pipeline.fit_transform(housing_num)

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs)
])

housing_prepared = full_pipeline.fit_transform(housing_data)

cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])

extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
attributes = num_attribs + extra_attribs + cat_one_hot_attribs

housing_prepared_df = pd.DataFrame(housing_prepared, columns=attributes)

X = housing_prepared_df
y = housing_labels

In [169]:
#Excercise 1: Try a Support Vector Machine regressor with various hyper parameters such as kernel = 'linear' / 'rbf'. How does the SVR predictor preform?
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

svr = SVR()

param_grid = [
    {'kernel': ['linear', 'rbf'], 'C': [20., 200., 2000., 20000.0]},
    {'kernel': ['rbf'], 'C': [2., 20., 200., 2000.0,],
    'gamma' : [0.02, 0.2, 2.0]}
]

svm_grid_search = GridSearchCV(svr, param_grid, cv=5, scoring = 'neg_mean_squared_error')
svm_grid_search.fit(X,y)

print(f"Best Estimator: {svm_grid_search.best_estimator_}")
print(f"Best Paramaters: {svm_grid_search.best_params_}")

svr_rbf = SVR(kernel='rbf', C=20000.0, gamma=0.2)
svr_rbf.fit(X,y)

svr_rbf_predictions = svr_rbf.predict(X)
svr_rbf_mse = mean_squared_error(y, svr_rbf_predictions)
svr_rbf_rmse = np.sqrt(svr_rbf_mse)
print(f"Best Paramaters RMSE Score: {svr_rbf_rmse}")
print("This model performs worse than the Random Forest Regressor")

Best Estimator: SVR(C=20000.0)
Best Paramaters: {'C': 20000.0, 'kernel': 'rbf'}
Best Paramaters RMSE Score: 56124.55127818887
This model performs worse than the Random Forest Regressor


In [170]:
#Excercise 2: Try replacing GridSearchCV with RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, expon

param_dist = {
    'kernel': ['linear', 'rbf'],
    'C': reciprocal(20,20000),
    'gamma' : expon(scale=1.0)
    }

svr = SVR()

svr_rand_search = RandomizedSearchCV(svr, param_distributions=param_dist, n_iter=25, cv=5, scoring = 'neg_mean_squared_error', random_state=42)
svr_rand_search.fit(X,y)

print(f"Best Estimator: {svr_rand_search.best_estimator_}")
print(f"Best Paramaters: {svr_rand_search.best_params_}")
svr_rbf_rand = SVR(kernel='rbf', C=16684.0, gamma=0.27)
svr_rbf_rand.fit(X,y)

svr_rbf_rand_predictions = svr_rbf_rand.predict(X)
svr_rbf_rand_mse = mean_squared_error(y, svr_rbf_rand_predictions)
svr_rbf_rand_rmse = np.sqrt(svr_rbf_rand_mse)
print(f"Best Paramaters RMSE Score: {svr_rbf_rand_rmse}")

Best Estimator: SVR(C=16683.860588281554, gamma=0.26497040005002437)
Best Paramaters: {'C': 16683.860588281554, 'gamma': 0.26497040005002437, 'kernel': 'rbf'}
Best Paramaters RMSE Score: 56254.321416577426


In [171]:
#Excercise 3: Try adding a transformer in the preparation pipeline to select only the most important attributes
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin

housing_prepared = full_pipeline.fit_transform(housing_data)

rfr = RandomForestRegressor(n_estimators=10, random_state=42)
rfr.fit(housing_prepared_df, housing_labels)
rfr_feature_importances = rfr.feature_importances_

def indices_of_top_k(arr, k):
    return np.sort(np.argpartition(np.array(arr), -k)[-k:])

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k
    
    def fit(self, X, y=None):
        self.feature_indices_ = indices_of_top_k(self.feature_importances, self.k)
        return self
    
    def transform(self, X):
        return X[:, self.feature_indices_]

k = 5
top_k_feature_indices = indices_of_top_k(rfr_feature_importances, k)

print(np.array(attributes)[top_k_feature_indices])

prep_feature_selection_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(rfr_feature_importances, k))
])

housing_prepared_top_k_features = prep_feature_selection_pipeline.fit_transform(housing_data)
housing_prepared[0:3, top_k_feature_indices]

['longitude' 'latitude' 'median_income' 'pop_per_hhold' 'INLAND']


array([[-0.94135046,  1.34743822, -0.8936472 ,  0.00622264,  1.        ],
       [ 1.17178212, -1.19243966,  1.292168  , -0.04081077,  0.        ],
       [ 0.26758118, -0.1259716 , -0.52543365, -0.07537122,  1.        ]])

In [172]:
#Excercise 4: Try creating a single pipeline that does the full data preperation plus the final prediction
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, expon

param_dist = {
    'kernel': ['linear', 'rbf'],
    'C': reciprocal(20,20000),
    'gamma' : expon(scale=1.0)
    }

svr_rand_search = RandomizedSearchCV(svr, param_distributions=param_dist, n_iter=25, cv=5, scoring = 'neg_mean_squared_error', random_state=42)
svr_rand_search.fit(X,y)

prep_select_predict_pipeline = Pipeline([
    ('preperation', full_pipeline),
    ('feature_selection', TopFeatureSelector(rfr_feature_importances, k)),
    ('rfr', SVR(**svr_rand_search.best_params_))
])

prep_select_predict_pipeline.fit(housing_data, housing_labels)

some_data = housing_data.iloc[:4]
some_labels = housing_labels.iloc[:4]

print("Predictions:", prep_select_predict_pipeline.predict(some_data))
print("Actual Prices:", list(some_labels))

Predictions: [ 86692.13281441 286840.7959524   99826.52170369 149805.01711305]
Actual Prices: [72100.0, 279600.0, 82700.0, 112500.0]
