# 1
- Try a support vector machine regressor (sklearn.svm.SVR) with various hyperparameters, such as kernel="linear" (with various values for the
C hyperparameter) or kernel="rbf" (with various values for the C and gammahyperparameters). 
- Note that support vector machines don’t scale well to large datasets, so you should probably train your model on just the first 5,000 instances
of the training set and use only 3-fold cross-validation, or else it will take hours.
- Don’t worry about what the hyperparameters mean for now; we’ll discuss them in Chapter 5. How does the best SVR predictor perform?

In [5]:
# 좀 더 책 스타일로 복습

import pandas
import numpy

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler

def stratified_sampling_income_category(idataframe):
    idataframe = idataframe.copy()
    idataframe['income_cat'] = pandas.cut(idataframe['median_income'], bins=[0., 1.5, 3.0, 4.5, 6., pandas.np.inf], labels=[1, 2, 3, 4, 5])
    s_train, s_test = train_test_split(idataframe, test_size=0.2, random_state=42, stratify=idataframe['income_cat'])
    s_train.drop('income_cat', axis=1, inplace=True)
    s_test.drop('income_cat', axis=1, inplace=True)
    return s_train, s_test

# load data as panda DataFrame
idataframe = pandas.read_csv('datasets/housing/housing.csv')

# Make stratified sampling of train and test set
train_set, test_set = stratified_sampling_income_category(idataframe)

# separate label and predictors
train_set_predictor = train_set.drop('median_house_value', axis=1)
train_set_labels = train_set['median_house_value'].copy()

# create a pipeline for preprocessing
num_pipeline = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())
cat_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))

def column_ratio(X):
    return X[:,[0]] / X[:,[1]]

def ratio_name(function_transformer, feature_names_in):
    return ['ratio']

def ratio_pipeline():
    return make_pipeline( SimpleImputer(strategy='median'), FunctionTransformer(column_ratio, feature_names_out=ratio_name), StandardScaler() )

class ClusterSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        self.kmeans_ = KMeans(self.n_clusters, n_init='auto', random_state=self.random_state)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self  # always return self!

    def transform(self, X):
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)
    
    def get_feature_names_out(self, names=None):
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]

log_pipeline = make_pipeline( SimpleImputer(strategy='median'), FunctionTransformer(numpy.log, feature_names_out='one-to-one'), StandardScaler() )
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1.0)
default_num_pipeline = make_pipeline( SimpleImputer(strategy='median'), StandardScaler() )

preprocessing = ColumnTransformer([
    ('bedrooms', ratio_pipeline(), ['total_bedrooms', 'total_rooms']),
    ('rooms_per_house', ratio_pipeline(), ['total_rooms', 'households']),
    ('people_per_house', ratio_pipeline(), ['population', 'households']),
    ('log', log_pipeline, ['total_bedrooms', 'total_rooms', 'population','households','median_income']),
    ('geo', cluster_simil, ['latitude','longitude']),
    ('cat', cat_pipeline, make_column_selector(dtype_include=object)),
], remainder=default_num_pipeline) # one column remaining : housing_median_age

predictor_prepared = preprocessing.fit_transform(train_set_predictor)

print(predictor_prepared.shape)
print(preprocessing.get_feature_names_out())

(16512, 24)
['bedrooms__ratio' 'rooms_per_house__ratio' 'people_per_house__ratio'
 'log__total_bedrooms' 'log__total_rooms' 'log__population'
 'log__households' 'log__median_income' 'geo__Cluster 0 similarity'
 'geo__Cluster 1 similarity' 'geo__Cluster 2 similarity'
 'geo__Cluster 3 similarity' 'geo__Cluster 4 similarity'
 'geo__Cluster 5 similarity' 'geo__Cluster 6 similarity'
 'geo__Cluster 7 similarity' 'geo__Cluster 8 similarity'
 'geo__Cluster 9 similarity' 'cat__ocean_proximity_<1H OCEAN'
 'cat__ocean_proximity_INLAND' 'cat__ocean_proximity_ISLAND'
 'cat__ocean_proximity_NEAR BAY' 'cat__ocean_proximity_NEAR OCEAN'
 'remainder__housing_median_age']


  idataframe['income_cat'] = pandas.cut(idataframe['median_income'], bins=[0., 1.5, 3.0, 4.5, 6., pandas.np.inf], labels=[1, 2, 3, 4, 5])


In [3]:
# Support Vector Machine을 써 본다
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

svr_lin = make_pipeline(preprocessing, SVR(kernel='linear'))
svr_rbf = make_pipeline(preprocessing, SVR(kernel='rbf'))
# svr_reg = make_pipeline(preprocessing, SVR(kernel='rbf', gamma=0.1, C=1.0))
svr_lin.fit(train_set_predictor, train_set_labels)
svr_rbf.fit(train_set_predictor, train_set_labels)

svr_lin_prediction = svr_lin.predict(train_set_predictor)
svr_rbf_prediction = svr_rbf.predict(train_set_predictor)

svr_lin_rmse = mean_squared_error(train_set_labels, svr_lin_prediction, squared=False)
svr_rbf_rmse = mean_squared_error(train_set_labels, svr_rbf_prediction, squared=False)

# 둘 다 별 차이 없...
print(f"SVR Linear RMSE : {svr_lin_rmse}")
print(f"SVR RBF RMSE : {svr_rbf_rmse}")

SVR Linear RMSE : 111428.99585338461
SVR RBF RMSE : 118051.51502911921


In [11]:
# Grid Search 로 parameter를 찾아보자

from sklearn.model_selection import GridSearchCV

full_pipeline = make_pipeline(preprocessing, SVR())

param_grid = [
    {'svr__kernel': ['linear'], 'svr__C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.]},
    {'svr__kernel': ['rbf'], 'svr__C': [1.0, 3.0, 10., 30., 100., 300., 1000.], 'svr__gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]},
]

grid_search = GridSearchCV(full_pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid_search.fit(train_set_predictor, train_set_labels)

svr_grid_search_rmse = -grid_search.best_score_

print(grid_search.best_params_)
print(grid_search.best_estimator_)
print(svr_grid_search_rmse)  

# i9-12900H 에서 대략 6분 정도 걸렸고 그 결과 약 70000 정도의 RMSE를 얻었다. 
# random forest가 약 40000 정도를 얻은걸 생각하면 별 도움은 안되는 결과인 듯 하다
# 그나저나 이건 CPU 파워가 짱이군...


{'svr__C': 1000.0, 'svr__gamma': 0.1, 'svr__kernel': 'rbf'}
Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',
                                                              SimpleImputer(strategy='median')),
                                                             ('standardscaler',
                                                              StandardScaler())]),
                                   transformers=[('bedrooms',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('functiontransformer',
                                                                   FunctionTransformer(feature_names_out=<function ratio_name at 0x000...
                                                   'total_rooms', 'population',
       

### 2. Try replacing the GridSearchCV with a RandomizedSearchCV.

In [7]:
# Randomized Search 로 parameter를 찾아보자
import scipy
from sklearn.model_selection import RandomizedSearchCV

full_pipeline = make_pipeline(preprocessing, SVR(kernel='linear'))

param_distribs = {
    'svr__C': scipy.stats.randint(low=10, high=30000),
}

rnd_search = RandomizedSearchCV(
    full_pipeline, param_distributions=param_distribs, n_iter=100, cv=3, 
    scoring='neg_root_mean_squared_error', n_jobs=-1)

rnd_search.fit(train_set_predictor, train_set_labels)
svr_rnd_search_rmse = -rnd_search.best_score_

print(svr_rnd_search_rmse)
print(rnd_search.best_params_)
print(rnd_search.best_estimator_)

71478.61642890856
{'svr__C': 1410}
Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',
                                                              SimpleImputer(strategy='median')),
                                                             ('standardscaler',
                                                              StandardScaler())]),
                                   transformers=[('bedrooms',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('functiontransformer',
                                                                   FunctionTransformer(feature_names_out=<function ratio_name at 0x000...
                                                   'households',
                                               

In [11]:
import numpy

# kernel을 rbf로 바꿔서 다시 한번

full_pipeline = make_pipeline(preprocessing, SVR(kernel='rbf'))

param_distribs = {
    'svr__C': scipy.stats.randint(low=10, high=30000),
    'svr__gamma' : scipy.stats.uniform(loc=0.01, scale=3.0),
}

rnd_search = RandomizedSearchCV(
    full_pipeline, param_distributions=param_distribs, n_iter=100, cv=3, 
    scoring='neg_root_mean_squared_error', n_jobs=-1)

rnd_search.fit(train_set_predictor, train_set_labels)
svr_rnd_search_rmse = -rnd_search.best_score_

print(svr_rnd_search_rmse)
print(rnd_search.best_params_)
print(rnd_search.best_estimator_)

# 이걸로 해보니 RMSE가 57995 정도 나온다
# RBF가 Linear에 비해 좀 더 좋은 결과가 나오긴 한다
# 그래봐야 random forest에 비해서는 안좋았으니 의미는 없다만...

57995.34706633072
{'svr__C': 27866, 'svr__gamma': 0.4922022145550622}
Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',
                                                              SimpleImputer(strategy='median')),
                                                             ('standardscaler',
                                                              StandardScaler())]),
                                   transformers=[('bedrooms',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('functiontransformer',
                                                                   FunctionTransformer(feature_names_out=<function ratio_name at 0x000...
                                                   'households',
            

### 3. Try adding a SelectFromModel transformer in the preparation pipeline to select only the most important attributes.

In [13]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

selector_pipeline = make_pipeline(
    preprocessing,
    SelectFromModel(RandomForestRegressor(), threshold=0.005),  # min feature importance
    SVR(C=27866, gamma=0.4922022145550622, kernel='rbf'))

selector_rmses = -cross_val_score(selector_pipeline, train_set_predictor, train_set_labels, cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1)

pandas.Series(selector_rmses).describe()

# 근데 돌리고 보니 저 SelectFromModel은 preprocessing stage의 일부로 들어가야 하는거 아닌가...?
# 하지만ㄴ 귀찮으니 그냥 이런게 있다는 걸 기억하고 넘어가는 정도로....

count        3.000000
mean     57878.091756
std       1229.630564
min      57102.196804
25%      57169.219583
50%      57236.242363
75%      58266.039232
max      59295.836102
dtype: float64

### 4. Try creating a custom transformer that trains a k-nearest neighbors regressor (sklearn.neighbors.KNeighborsRegressor) in its fit() method, and outputs the model’s predictions in its transform() method. Then add this feature to the preprocessing pipeline, using latitude and longitude as the inputs to this transformer. This will add a feature in the model that corresponds to the housing median price of the nearest districts.

In [14]:

# KNeighborRegressor는 fit하고 predict는 있는데 tranfrom은 없다
# 그래서 굳이 이렇게 FeatureFromRegressor를 쓰는 것이다
# 아 그래서 transform에서 predictor를 부른거구나
# 해답을 보면 아무 regressor나 쓸 수 있게 만든다고 이렇게 했다는데 그게 처음엔 잘 이해가 안됐었는데 이제는 좀 이해가 됨
# 

from sklearn.utils.validation import check_is_fitted
from sklearn.neighbors import KNeighborsRegressor
from sklearn.base import MetaEstimatorMixin, clone

class FeatureFromRegressor(MetaEstimatorMixin, BaseEstimator, TransformerMixin):
    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y=None):
        estimator_ = clone(self.estimator)
        estimator_.fit(X, y)
        self.estimator_ = estimator_
        self.n_features_in_ = self.estimator_.n_features_in_
        if hasattr(self.estimator, "feature_names_in_"):
            self.feature_names_in_ = self.estimator.feature_names_in_
        return self  # always return self!
    
    def transform(self, X):
        check_is_fitted(self)
        predictions = self.estimator_.predict(X)
        if predictions.ndim == 1:
            predictions = predictions.reshape(-1, 1)
        return predictions

    def get_feature_names_out(self, names=None):
        check_is_fitted(self)
        n_outputs = getattr(self.estimator_, "n_outputs_", 1)
        estimator_class_name = self.estimator_.__class__.__name__
        estimator_short_name = estimator_class_name.lower().replace("_", "")
        return [f"{estimator_short_name}_prediction_{i}"
                for i in range(n_outputs)]

In [15]:
from sklearn.utils.estimator_checks import check_estimator

check_estimator(FeatureFromRegressor(KNeighborsRegressor()))
knn_reg = KNeighborsRegressor(n_neighbors=3, weights="distance")
knn_transformer = FeatureFromRegressor(knn_reg)
geo_features = train_set_predictor[["latitude", "longitude"]]
knn_transformer.fit_transform(geo_features, train_set_labels)

array([[486100.66666667],
       [435250.        ],
       [105100.        ],
       ...,
       [148800.        ],
       [500001.        ],
       [234333.33333333]])

In [16]:
knn_transformer.get_feature_names_out()

['kneighborsregressor_prediction_0']

In [19]:
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

transformers = [(name, clone(transformer), columns)
                for name, transformer, columns in preprocessing.transformers]
geo_index = [name for name, _, _ in transformers].index("geo")
transformers[geo_index] = ("geo", knn_transformer, ["latitude", "longitude"])

# KneighborsRegressor는 fit하고 predict밖에 업서여...
# transformers[geo_index] = ("geo", KNeighborsRegressor(n_neighbors=3, weights="distance"), ["latitude", "longitude"])

new_geo_preprocessing = ColumnTransformer(transformers)
new_geo_pipeline = Pipeline([
    ('preprocessing', new_geo_preprocessing),
    ('svr', SVR(C=27866, gamma=0.4922022145550622, kernel='rbf')),
])

new_pipe_rmses = -cross_val_score(new_geo_pipeline, train_set_predictor, train_set_labels, scoring="neg_root_mean_squared_error", cv=3)
pandas.Series(new_pipe_rmses).describe()

# 생각보다 RMSE는 안나옴. 100,000이나 나오네
# 근데 당연하다면 당연하다. 이럴거면 parameter도 다시 맞춰야지

count         3.000000
mean     104215.465769
std         515.524175
min      103664.301464
25%      103980.298351
50%      104296.295238
75%      104491.047921
max      104685.800604
dtype: float64

### 5. Automatically explore some preparation options using ~~GridSearchCV~~RandomSearchCV.
- RandomSearchCV가 GridSearchCV보다 하기 좋아서 이걸로 함
- 해답도 이걸로 되어 있음

In [22]:
from scipy.stats import expon, loguniform

param_distribs = {
    "preprocessing__geo__estimator__n_neighbors": range(1, 30),
    "preprocessing__geo__estimator__weights": ["distance", "uniform"],
    "svr__C": loguniform(20, 200_000),
    "svr__gamma": expon(scale=1.0),
}

new_geo_rnd_search = RandomizedSearchCV(new_geo_pipeline,
                                        param_distributions=param_distribs,
                                        n_iter=50,
                                        cv=3,
                                        scoring='neg_root_mean_squared_error',
                                        n_jobs=-1)
new_geo_rnd_search.fit(train_set_predictor, train_set_labels)
new_geo_rnd_search_rmse = -new_geo_rnd_search.best_score_

print(new_geo_rnd_search_rmse)
print(new_geo_rnd_search.best_params_)
print(new_geo_rnd_search.best_estimator_)

88847.05880192488
{'preprocessing__geo__estimator__n_neighbors': 1, 'preprocessing__geo__estimator__weights': 'distance', 'svr__C': 84510.15507675285, 'svr__gamma': 0.816499237057516}
Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('bedrooms',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('functiontransformer',
                                                                   FunctionTransformer(feature_names_out=<function ratio_name at 0x0000021B08E87040>,
                                                                                       func=<function column_ratio at 0x0000021B052428B0>)),
                                                                  ('standardscaler',
                                                                   S

### 6. Exercise: Try to implement the StandardScalerClone class again from scratch, then add support for the inverse_transform() method: executing scaler.inverse_transform(scaler.fit_transform(X)) should return an array very close to X. Then add support for feature names: set feature_names_in_ in the fit() method if the input is a DataFrame. This attribute should be a NumPy array of column names. Lastly, implement the get_feature_names_out() method: it should have one optional input_features=None argument. If passed, the method should check that its length matches n_features_in_, and it should match feature_names_in_ if it is defined, then input_features should be returned. If input_features is None, then the method should return feature_names_in_ if it is defined or np.array(["x0", "x1", ...]) with length n_features_in_ otherwise.

In [29]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array, check_is_fitted

class StandardScalerClone(BaseEstimator, TransformerMixin):
    def __init__(self, with_mean=True):  # no *args or **kwargs!
        self.with_mean = with_mean

    def fit(self, X, y=None):  # y is required even though we don't use it
        X_orig = X
        X = check_array(X)  # checks that X is an array with finite float values
        self.mean_ = X.mean(axis=0)
        self.scale_ = X.std(axis=0)
        self.n_features_in_ = X.shape[1]  # every estimator stores this in fit()
        if hasattr(X_orig, "columns"):
            self.feature_names_in_ = numpy.array(X_orig.columns, dtype=object)
        return self  # always return self!

    def transform(self, X):
        check_is_fitted(self)  # looks for learned attributes (with trailing _)
        X = check_array(X)
        if self.n_features_in_ != X.shape[1]:
            raise ValueError("Unexpected number of features")
        if self.with_mean:
            X = X - self.mean_
        return X / self.scale_
    
    def inverse_transform(self, X):
        check_is_fitted(self)
        X = check_array(X)
        if self.n_features_in_ != X.shape[1]:
            raise ValueError("Unexpected number of features")
        X = X * self.scale_
        return X + self.mean_ if self.with_mean else X
    
    def get_feature_names_out(self, input_features=None):
        if input_features is None:
            return getattr(self, "feature_names_in_",
                           [f"x{i}" for i in range(self.n_features_in_)])
        else:
            if len(input_features) != self.n_features_in_:
                raise ValueError("Invalid number of features")
            if hasattr(self, "feature_names_in_") and not numpy.all(
                self.feature_names_in_ == input_features
            ):
                raise ValueError("input_features ≠ feature_names_in_")
            return input_features

In [24]:
# test our custom estimator
from sklearn.utils.estimator_checks import check_estimator
 
check_estimator(StandardScalerClone())

In [25]:
# ensure it works

# https://numpy.org/doc/stable/reference/generated/numpy.allclose.html
# numpy.allclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False)[source]
# Returns True if two arrays are element-wise equal within a tolerance.
# absolute(a - b) <= (atol + rtol * absolute(b))

X = numpy.random.rand(1000, 3)

scaler = StandardScalerClone()
X_scaled = scaler.fit_transform(X)

assert numpy.allclose(X_scaled, (X - X.mean(axis=0)) / X.std(axis=0))

In [26]:
# when with_mean = False

scaler = StandardScalerClone(with_mean=False)
X_scaled_uncentered = scaler.fit_transform(X)

assert numpy.allclose(X_scaled_uncentered, X / X.std(axis=0))

In [27]:
# inverse work?

scaler = StandardScalerClone()
X_back = scaler.inverse_transform(scaler.fit_transform(X))

assert numpy.allclose(X, X_back)

In [28]:
# How about the feature names out?

# https://numpy.org/doc/stable/reference/generated/numpy.all.html
# numpy.all : Test whether all array elements along a given axis evaluate to True.


assert numpy.all(scaler.get_feature_names_out() == ["x0", "x1", "x2"])
assert numpy.all(scaler.get_feature_names_out(["a", "b", "c"]) == ["a", "b", "c"])

In [30]:
# And if we fit a DataFrame, are the feature in and out ok?

df = pandas.DataFrame({"a": numpy.random.rand(100), "b": numpy.random.rand(100)})
scaler = StandardScalerClone()
X_scaled = scaler.fit_transform(df)

assert numpy.all(scaler.feature_names_in_ == ["a", "b"])
assert numpy.all(scaler.get_feature_names_out() == ["a", "b"])

NameError: name 'np' is not defined