# Exercises

In [23]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request
import numpy as np

from sklearn.model_selection import train_test_split


from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array, check_is_fitted
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.pipeline import Pipeline

from sklearn import set_config

set_config(display='diagram')

In [5]:
def load_housing_data():
    """Return the housing dataset as a pandas DataFrame, fetching it if needed.

    The data is read from ``datasets/housing/housing.csv``. If that file is
    missing, the archive ``datasets/housing.tgz`` is downloaded (unless it is
    already on disk) and extracted into ``datasets`` before reading.

    Returns:
        pandas.DataFrame: the parsed contents of ``housing.csv``.
    """
    tarball_path = Path("datasets/housing.tgz")
    csv_path = Path("datasets/housing/housing.csv")
    # Gate on the CSV rather than on the tarball: this avoids a pointless
    # download when the CSV is already present, and recovers when the tarball
    # was downloaded but never (or only partially) extracted.
    if not csv_path.is_file():
        if not tarball_path.is_file():
            Path("datasets").mkdir(parents=True, exist_ok=True)
            url = "https://github.com/ageron/data/raw/main/housing.tgz"
            urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            housing_tarball.extractall(path="datasets")
    return pd.read_csv(csv_path)

# Load the full housing dataset (downloads and extracts it on first use).
housing = load_housing_data()

In [7]:
# Bucket median income into five ordinal categories; used only to stratify
# the train/test split below.
housing["income_cat"] = pd.cut(
    housing["median_income"],
    labels=[1, 2, 3, 4, 5],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
)

# Plain random 80/20 split.
train_set, test_set = train_test_split(housing, random_state=42, test_size=0.2)

# Stratified 80/20 split that preserves the income-category proportions.
strat_train_set, strat_test_set = train_test_split(
    housing,
    stratify=housing["income_cat"],
    test_size=0.2,
    random_state=42,
)

# The helper column has served its purpose; remove it from both splits.
for set_ in (strat_train_set, strat_test_set):
    set_.drop(columns="income_cat", inplace=True)

# Separate predictors from the target for the training set.
housing = strat_train_set.drop(columns="median_house_value")
housing_labels = strat_train_set["median_house_value"].copy()

In [9]:
class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Turn samples into RBF similarities to k-means cluster centers.

    ``fit`` runs k-means on the input; ``transform`` returns one column per
    cluster holding the RBF-kernel similarity between each sample and that
    cluster's center.

    Parameters:
        n_clusters: number of k-means clusters (and of output columns).
        gamma: ``gamma`` passed to ``rbf_kernel``.
        random_state: seed forwarded to ``KMeans``.
    """

    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit k-means on X; ``y`` is ignored, ``sample_weight`` is forwarded."""
        self.kmeans_ = KMeans(
            n_clusters=self.n_clusters,
            n_init=10,
            random_state=self.random_state,
        )
        # Our own `sample_weight` argument is handed straight to KMeans.fit,
        # whose keyword has the same name.
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self

    def transform(self, X):
        """Return the RBF similarity of every row of X to each cluster center."""
        centers = self.kmeans_.cluster_centers_
        return rbf_kernel(X, centers, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Return one name per cluster; ``names`` is accepted but unused."""
        return [f"Cluster {idx} similarity" for idx in range(self.n_clusters)]


# Column groups routed to each sub-pipeline.
cat_attribs = ["ocean_proximity"]
num_attribs = [
    "longitude", "latitude", "housing_median_age", "total_rooms",
    "total_bedrooms", "population", "households", "median_income",
]

# Numeric columns: fill gaps with the column median, then standardize.
num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode (categories unseen during fit are ignored at transform time).
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Baseline preprocessing: numeric and categorical pipelines side by side.
preprocessing = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])


def column_ratio(X):
    """Return column 0 divided element-wise by column 1, as a single column."""
    numerator = X[:, [0]]    # list index keeps the result two-dimensional
    denominator = X[:, [1]]
    return numerator / denominator

def ratio_name(function_transformer, feature_names_in):
    """Name the single output feature of the ratio transformer.

    Both arguments are accepted but unused; the output is always one column
    called ``"ratio"``.
    """
    names_out = ["ratio"]
    return names_out

def ratio_pipeline():
    """Build a new impute -> column-ratio -> standardize pipeline."""
    imputer = SimpleImputer(strategy="median")
    ratio_step = FunctionTransformer(column_ratio, feature_names_out=ratio_name)
    scaler = StandardScaler()
    return make_pipeline(imputer, ratio_step, scaler)

# Log pipeline: median-impute, apply np.log (output feature names map
# one-to-one onto the input names), then standardize.
log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(np.log, feature_names_out="one-to-one"),
    StandardScaler())

# Cluster-similarity features: 10 clusters, gamma of 1, fixed seed.
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)

# Fallback pipeline applied to every column not listed explicitly below.
default_num_pipeline = make_pipeline(SimpleImputer(strategy="median"), 
                                     StandardScaler())

# Full preprocessing. This rebinds `preprocessing`, replacing the simpler
# num/cat ColumnTransformer assigned earlier. Each ratio_pipeline() call
# builds its own pipeline instance, so the three ratio features do not share
# fitted state. The transformer order here determines the output column order.
preprocessing = ColumnTransformer([
        ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
        ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
        ("people_per_house", ratio_pipeline(), ["population", "households"]),
        ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population",
                               "households", "median_income"]),
        ("geo", cluster_simil, ["latitude", "longitude"]),
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    remainder=default_num_pipeline)  # one column remaining: housing_median_age

In [11]:
# Fit the preprocessing ColumnTransformer on the training features and
# transform them in one step, then report the resulting shape and the
# generated output feature names.
housing_prepared = preprocessing.fit_transform(housing)
print(f"DataFrame Processed Shape: {housing_prepared.shape}\n")
print(f"DataFrame Processed Columns:\n {preprocessing.get_feature_names_out()}")

DataFrame Processed Shape: (16512, 24)

DataFrame Processed Columns:
 ['bedrooms__ratio' 'rooms_per_house__ratio' 'people_per_house__ratio'
 'log__total_bedrooms' 'log__total_rooms' 'log__population'
 'log__households' 'log__median_income' 'geo__Cluster 0 similarity'
 'geo__Cluster 1 similarity' 'geo__Cluster 2 similarity'
 'geo__Cluster 3 similarity' 'geo__Cluster 4 similarity'
 'geo__Cluster 5 similarity' 'geo__Cluster 6 similarity'
 'geo__Cluster 7 similarity' 'geo__Cluster 8 similarity'
 'geo__Cluster 9 similarity' 'cat__ocean_proximity_<1H OCEAN'
 'cat__ocean_proximity_INLAND' 'cat__ocean_proximity_ISLAND'
 'cat__ocean_proximity_NEAR BAY' 'cat__ocean_proximity_NEAR OCEAN'
 'remainder__housing_median_age']


The following exercises are based on this chapter’s housing dataset:

### 1. Try a support vector machine regressor (sklearn.svm.SVR) with various hyperparameters, such as kernel="linear" (with various values for the C hyperparameter) or kernel="rbf" (with various values for the C and gamma hyperparameters). Note that support vector machines don’t scale well to large datasets, so you should probably train your model on just the first 5,000 instances of the training set and use only 3-fold cross-validation, or else it will take hours. Don’t worry about what the hyperparameters mean for now; we’ll discuss them in Chapter 5. How does the best SVR predictor perform?

In [35]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

# Exercise 1 (own attempt): an SVR placed behind the shared preprocessing step.
svr_pipeline = Pipeline([("preprocessing", preprocessing), ("SVR", SVR())])

# Two sub-grids: the linear kernel tunes C only; the RBF kernel tunes both
# C and gamma.
svr_grid = [
    {
        'SVR__kernel': ['linear'],
        # regularization parameter
        'SVR__C': [1.0, 7.0, 10.0, 70.0, 100.0, 700.0, 1000.0, 7000.0],
    },
    {
        'SVR__kernel': ['rbf'],
        'SVR__C': [1.0, 7.0, 10.0, 70.0, 100.0, 700.0, 1000.0, 7000.0],
        # kernel coefficient
        'SVR__gamma': [1, 3, 5, 7, 11],
    },
]

# 3-fold cross-validation, scored by negated RMSE (higher is better).
grid_search_svr = GridSearchCV(
    svr_pipeline,
    svr_grid,
    cv=3,
    scoring='neg_root_mean_squared_error',
)

# Train on the first 5,000 training rows only, as the exercise suggests.
grid_search_svr.fit(housing.iloc[:5000], housing_labels.iloc[:5000])

In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

# Second grid for the same exercise, with wider C ranges and smaller gammas.
# Keys use the lowercase "svr" step name of the pipeline defined just below.
param_grid = [
    {
        'svr__kernel': ['linear'],
        'svr__C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0],
    },
    {
        'svr__kernel': ['rbf'],
        'svr__C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
        'svr__gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0],
    },
]

# Note: this rebinds `svr_pipeline`, now with the SVR step named "svr".
svr_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("svr", SVR()),
])

# 3-fold cross-validation on the first 5,000 training rows, scored by
# negated RMSE.
grid_search = GridSearchCV(
    svr_pipeline,
    param_grid,
    cv=3,
    scoring='neg_root_mean_squared_error',
)
grid_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])

In [55]:
# Best cross-validated RMSE found by `grid_search`; the scorer is negated
# RMSE, so the sign is flipped back.
# NOTE(review): `svr_grid_search_rmse` is also assigned from `grid_search_svr`
# in another cell, so its value depends on which cell ran last.
svr_grid_search_rmse = -grid_search.best_score_
svr_grid_search_rmse

69951.46878594102

In [51]:
# Tabulate every candidate tried by `grid_search_svr`, best score first.
cv_res_svr = pd.DataFrame(grid_search_svr.cv_results_).sort_values(
    by="mean_test_score", ascending=False)

# Keep only the hyperparameters and the per-fold / mean scores.
cv_res_svr = cv_res_svr[[
    "param_SVR__kernel", "param_SVR__C", "param_SVR__gamma",
    "split0_test_score", "split1_test_score", "split2_test_score",
    "mean_test_score",
]]

# Shorter headers; the scores are negated RMSEs, so flip the sign and show
# them as whole numbers.
score_cols = ["split0", "split1", "split2", "mean_test_rmse"]
cv_res_svr.columns = ["kernel", "C", "gamma", *score_cols]
cv_res_svr[score_cols] = -cv_res_svr[score_cols].round().astype(np.int64)

cv_res_svr.head(10)

Unnamed: 0,kernel,C,gamma,split0,split1,split2,mean_test_rmse
7,linear,7000.0,,72744,70797,66905,70149
6,linear,1000.0,,73971,73578,69609,72386
5,linear,700.0,,74395,74273,70315,72994
4,linear,100.0,,80619,80829,76875,79441
3,linear,70.0,,83130,83312,79368,81937
43,rbf,7000.0,1.0,99338,101702,93990,98343
2,linear,10.0,,106087,107034,101910,105010
1,linear,7.0,,109596,110729,105408,108577
38,rbf,1000.0,1.0,115263,117616,110781,114553
33,rbf,700.0,1.0,116726,118968,112304,115999


In [53]:
# Best cross-validated RMSE found by `grid_search_svr`; the scorer is negated
# RMSE, so the sign is flipped back.
# NOTE(review): `svr_grid_search_rmse` is also assigned from `grid_search`
# in another cell, so its value depends on which cell ran last.
svr_grid_search_rmse = -grid_search_svr.best_score_
svr_grid_search_rmse

70148.91030624749

### 2. Try replacing the GridSearchCV with a RandomizedSearchCV.

In [81]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, reciprocal, expon, loguniform


# Exercise 2: same SVR pipeline, but hyperparameters are sampled at random
# rather than enumerated from a grid.
svr_pipeline = Pipeline([("preprocessing", preprocessing), ("SVR", SVR())])

# Kernel is picked from a list; C is drawn log-uniformly between 30 and
# 3,000,000; gamma is drawn from an exponential distribution with scale 2.0.
param_distribs_rs = {
    'SVR__kernel': ['linear', 'rbf'],
    'SVR__C': loguniform(30, 3000000),
    'SVR__gamma': expon(scale=2.0),
}

# 10 sampled candidates, 3-fold cross-validation, negated-RMSE scoring,
# fixed seed for reproducible sampling.
rnd_search = RandomizedSearchCV(
    svr_pipeline,
    param_distributions=param_distribs_rs,
    n_iter=10,
    cv=3,
    scoring='neg_root_mean_squared_error',
    random_state=42,
)

# Train on the first 5,000 training rows only.
rnd_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])

In [83]:
# Tabulate every candidate sampled by `rnd_search`, best score first.
cv_res_svr_rnd = pd.DataFrame(rnd_search.cv_results_).sort_values(
    by="mean_test_score", ascending=False)

# Keep only the hyperparameters and the per-fold / mean scores.
cv_res_svr_rnd = cv_res_svr_rnd[[
    "param_SVR__kernel",
    "param_SVR__C",
    "param_SVR__gamma",
    "split0_test_score",
    "split1_test_score",
    "split2_test_score",
    "mean_test_score",
]]

# Shorter headers; the scores are negated RMSEs, so flip the sign and show
# them as whole numbers.
score_cols = ["split0", "split1", "split2", "mean_test_rmse"]
cv_res_svr_rnd.columns = ["kernel", "C", "gamma", *score_cols]
cv_res_svr_rnd[score_cols] = -cv_res_svr_rnd[score_cols].round().astype(np.int64)

cv_res_svr_rnd.head(10)

Unnamed: 0,kernel,C,gamma,split0,split1,split2,mean_test_rmse
8,rbf,34386.32,0.300469,59977,59218,56235,58477
5,rbf,1478716.0,0.001558,64910,62491,59754,62385
9,rbf,2217680.0,0.529941,66951,64801,64981,65577
0,linear,2237.803,6.020243,72973,72309,68538,71273
3,linear,1398.506,0.308324,73412,72974,69136,71841
1,rbf,237452.2,1.816894,79255,79386,73734,77458
7,linear,32.54264,0.046665,91037,91357,86863,89752
2,rbf,180.7565,0.119678,107745,109535,103273,106851
6,rbf,247.8243,0.725507,117806,120076,113523,117135
4,rbf,38.02277,7.007115,120443,122671,116283,119799


In [85]:
# Best cross-validated RMSE found by the randomized search; the scorer is
# negated RMSE, so the sign is flipped back.
svr_rnd_search_rmse = -rnd_search.best_score_
svr_rnd_search_rmse

58476.76438412195

In [87]:
# Hyperparameter combination of the best candidate from the randomized search.
rnd_search.best_params_

{'SVR__C': 34386.32221027512,
 'SVR__gamma': 0.30046905745467734,
 'SVR__kernel': 'rbf'}

In [137]:
# Best sampled value of the SVR `C` hyperparameter on its own.
rnd_search.best_params_['SVR__C']

34386.32221027512

### 3. Try adding a SelectFromModel transformer in the preparation pipeline to select only the most important attributes.

In [172]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor

# Exercise 3: insert a feature-selection step between preprocessing and the
# SVR. The SVR reuses the best hyperparameters found by the randomized search.
svr_pipeline_sfm = Pipeline([
    ("preprocessing", preprocessing),
    # Feature selection driven by a random forest, with a threshold of 0.005.
    (
        "transform",
        SelectFromModel(
            RandomForestRegressor(random_state=42),
            threshold=0.005,
        ),
    ),
    (
        "SVR",
        SVR(
            kernel=rnd_search.best_params_['SVR__kernel'],
            C=rnd_search.best_params_['SVR__C'],
            gamma=rnd_search.best_params_['SVR__gamma'],
        ),
    ),
])

In [174]:
from sklearn.model_selection import cross_val_score

# 3-fold cross-validated RMSE of the feature-selecting pipeline on the first
# 5,000 training rows. The scorer is negated RMSE, so the sign is flipped
# back. `.iloc` makes the positional row slicing explicit, matching the
# search cells in this notebook.
rnd_score_sfm = -cross_val_score(
    svr_pipeline_sfm,
    housing.iloc[:5000],
    housing_labels.iloc[:5000],
    scoring="neg_root_mean_squared_error",
    cv=3,
)

In [175]:
# Display the per-fold RMSE values from the 3-fold cross-validation.
rnd_score_sfm

array([59998.14792408, 58990.53713241, 56393.84435922])

In [176]:
# Summary statistics (count, mean, std, min, quartiles, max) of the per-fold RMSEs.
pd.Series(rnd_score_sfm).describe()

count        3.000000
mean     58460.843139
std       1859.619005
min      56393.844359
25%      57692.190746
50%      58990.537132
75%      59494.342528
max      59998.147924
dtype: float64

### 4. Try creating a custom transformer that trains a k-nearest neighbors regressor (sklearn.neighbors.KNeighborsRegressor) in its fit() method, and outputs the model’s predictions in its transform() method. 
### Then add this feature to the preprocessing pipeline, using latitude and longitude as the inputs to this transformer. This will add a feature in the model that corresponds to the housing median price of the nearest districts.

In [None]:
# KNeighborsRegressor is a class inside the sklearn.neighbors package, not a
# module, so it must be imported from the package and aliased.
from sklearn.neighbors import KNeighborsRegressor as KNN


class Cluster_Similarity(BaseEstimator, TransformerMixin):
    """Turn samples into RBF similarities to k-means cluster centers.

    Behaves the same as the ``ClusterSimilarity`` transformer defined earlier
    in this notebook; kept here as the starting template for the exercise.

    Parameters:
        n_clusters: number of k-means clusters (and of output columns).
        gamma: ``gamma`` passed to ``rbf_kernel``.
        random_state: seed forwarded to ``KMeans``.
    """

    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit k-means on X; ``y`` is ignored, ``sample_weight`` is forwarded."""
        self.kmeans_ = KMeans(self.n_clusters, n_init=10,
                              random_state=self.random_state)
        # Our own `sample_weight` argument is handed straight to KMeans.fit,
        # whose keyword has the same name.
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self  # always return self!

    def transform(self, X):
        """Return the RBF similarity of every row of X to each cluster center."""
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Return one name per cluster; ``names`` is accepted but unused."""
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]

In [None]:
# KNeighborsRegressor is a class inside the sklearn.neighbors package, not a
# module, so it must be imported from the package and aliased.
from sklearn.neighbors import KNeighborsRegressor as KNN


class Knn_Reg(BaseEstimator, TransformerMixin):
    """Transformer whose output is a k-nearest-neighbors regression prediction.

    ``fit`` trains a ``KNeighborsRegressor`` on the given inputs and targets;
    ``transform`` returns that model's predictions as feature column(s).

    Parameters:
        n_neighbors: number of neighbors used by the regressor.
        weights: neighbor weighting scheme forwarded to the regressor.
    """

    def __init__(self, n_neighbors=5, weights='uniform'):
        self.n_neighbors = n_neighbors
        self.weights = weights

    def fit(self, X, Y):
        """Train the k-NN regressor on X with targets Y and return self."""
        # `weights` is keyword-only in KNeighborsRegressor, so both
        # hyperparameters are passed by name.
        self.knn_ = KNN(n_neighbors=self.n_neighbors, weights=self.weights)
        self.knn_.fit(X, Y)
        # Remember how many target columns were seen so feature names can be
        # generated to match the transform output.
        self.n_outputs_ = 1 if np.ndim(Y) == 1 else np.shape(Y)[1]
        return self  # always return self!

    def transform(self, X):
        """Return the regressor's predictions for X as a 2-D array."""
        predictions = self.knn_.predict(X)
        # Only a 1-D prediction vector needs reshaping; reshaping a 2-D
        # (multi-target) result with (-1, 1) would change the number of rows.
        if predictions.ndim == 1:
            predictions = predictions.reshape(-1, 1)
        return predictions

    def get_feature_names_out(self, names=None):
        """Return one name per predicted target; ``names`` is unused."""
        return [f"knn_prediction_{i}" for i in range(self.n_outputs_)]
    
    
    

In [None]:
# Scratch note (appears to be the start of the KNeighborsRegressor signature): n_neighbors=5, *, weights='uniform'

### 5. Automatically explore some preparation options using GridSearchCV.

### 6. Try to implement the StandardScalerClone class again from scratch, then add support for the inverse_transform() method: executing scaler.inverse_transform(scaler.fit_transform(X)) should return an array very close to X. Then add support for feature names: set feature_names_in_ in the fit() method if the input is a DataFrame. This attribute should be a NumPy array of column names. Lastly, implement the get_feature_names_out() method: it should have one optional input_features=None argument. If passed, the method should check that its length matches n_features_in_, and it should match feature_names_in_ if it is defined; then input_features should be returned. If input_features is None, then the method should either return feature_names_in_ if it is defined or np.array(["x0", "x1", ...]) with length n_features_in_ otherwise.