# Fine-Tune Your Model
- github colab : https://homl.info/colab3

In [15]:
# 지금까지 한 것의 총합 : Pipeline을 사용해서 처리

import pathlib
import tarfile
import urllib
import urllib.request

import matplotlib
import matplotlib.pyplot
import numpy
import pandas
import scipy
import scipy.stats
import sklearn
import sklearn.base
import sklearn.cluster
import sklearn.compose
import sklearn.ensemble
import sklearn.impute
import sklearn.linear_model
import sklearn.metrics
import sklearn.metrics.pairwise
import sklearn.model_selection
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.tree
import sklearn.utils.validation

def ch2_load_housing_data():
    """Return the housing dataset as a DataFrame, fetching it if necessary.

    The CSV is expected at ``datasets/housing/housing.csv`` relative to the
    current working directory. When it is missing, ``datasets/housing.tgz``
    is downloaded (only if that tarball is not already present) and
    extracted into ``datasets``.

    Returns:
        pandas.DataFrame: the parsed contents of ``housing.csv``.
    """
    datasets_dir = pathlib.Path("datasets")
    tarball_path = datasets_dir / "housing.tgz"
    csv_path = datasets_dir / "housing" / "housing.csv"

    # Key the work on the CSV rather than on the tarball: a tarball left
    # behind by an interrupted run must still be extracted, and an already
    # extracted CSV must not trigger a new download.
    if not csv_path.is_file():
        if not tarball_path.is_file():
            datasets_dir.mkdir(parents=True, exist_ok=True)
            url = "https://github.com/ageron/data/raw/main/housing.tgz"
            urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            housing_tarball.extractall(path="datasets")
    return pandas.read_csv(csv_path)

def matplotlib_to_imagefile(output_dir, filename, imgext="png", tight_layout=True, resolution=300):
    """Write the current pyplot figure to ``output_dir/<filename>.<imgext>``.

    Args:
        output_dir: directory object supporting the ``/`` operator
            (the file passes a ``pathlib.Path``).
        filename: file name without extension.
        imgext: image extension, also passed to ``savefig`` as ``format``.
        tight_layout: when true, ``pyplot.tight_layout()`` is called first.
        resolution: value passed to ``savefig`` as ``dpi``.
    """
    pyplot = matplotlib.pyplot
    destination = output_dir / f"{filename}.{imgext}"
    if tight_layout:
        pyplot.tight_layout()
    pyplot.savefig(destination, format=imgext, dpi=resolution)
    
def stratified_sampling_income_category(input_dataframe):
    """Split a housing frame 80/20, stratified on binned ``median_income``.

    A copy of the input is tagged with a temporary ``income_cat`` column
    (five income bins labelled 1-5), split with ``random_state=42`` so the
    result is reproducible, and the temporary column is removed from both
    parts before they are returned. The caller's frame is not modified.

    Args:
        input_dataframe: DataFrame containing a ``median_income`` column.

    Returns:
        tuple: ``(train, test)`` DataFrames without the ``income_cat`` column.
    """
    frame = input_dataframe.copy()

    income_bins = [0., 1.5, 3.0, 4.5, 6., numpy.inf]
    income_labels = [1, 2, 3, 4, 5]
    frame["income_cat"] = pandas.cut(
        frame["median_income"], bins=income_bins, labels=income_labels)

    s_train, s_test = sklearn.model_selection.train_test_split(
        frame,
        test_size=0.2,
        stratify=frame['income_cat'],
        random_state=42,
    )

    # The helper column was only needed for stratification.
    for subset in (s_train, s_test):
        subset.drop('income_cat', axis=1, inplace=True)

    return s_train, s_test
    
# Directory where generated figures are saved
output_dir = pathlib.Path() / "images" / "end_to_end_project"
output_dir.mkdir(parents=True, exist_ok=True)
print(f'output_dir : {output_dir}')

# Load the housing data and split it into train/test sets stratified by income category
input_dataframe = ch2_load_housing_data()
train, test = stratified_sampling_income_category(input_dataframe)

# Separate the label from the predictors in the training set
label = train['median_house_value'].copy()
predictor = train.drop('median_house_value', axis = 1)

# Prepare the preprocessing pipelines
# Numeric: fill missing values with the median, then standardize
num_pipeline = sklearn.pipeline.make_pipeline(
    sklearn.impute.SimpleImputer(strategy='median'), 
    sklearn.preprocessing.StandardScaler())

# Categorical: fill missing values with the most frequent value, then one-hot encode
cat_pipeline = sklearn.pipeline.make_pipeline(
    sklearn.impute.SimpleImputer(strategy='most_frequent'),
    sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore'))

def column_ratio(X):
    """Return column 0 divided element-wise by column 1, as a single column.

    Indexing with ``[0]`` / ``[1]`` (lists) keeps the column axis, so a
    2-D input with N rows yields an N x 1 result.
    """
    numerator = X[:, [0]]
    denominator = X[:, [1]]
    return numerator / denominator

#
# It was quite confusing at first why this returns a list of length one.
# As the output of the run below shows, ColumnTransformer automatically
# prefixes each output feature with the transformer's name, so the final
# feature name comes out as name__ratio.
#
def ratio_name(function_transformer, feature_names_in):
    """Return the output feature names for the ratio transformer.

    Both arguments are accepted only because this function is passed as a
    ``feature_names_out`` callable; neither is used. The ratio produces a
    single column, hence a single name.
    """
    output_names = ['ratio']
    return output_names

def ratio_pipeline():
    """Build a new pipeline: median imputation -> column ratio -> standardization.

    A fresh pipeline object is created on every call.
    """
    imputer = sklearn.impute.SimpleImputer(strategy='median')
    ratio_step = sklearn.preprocessing.FunctionTransformer(
        column_ratio, feature_names_out=ratio_name)
    # Disabled optional step:
    # sklearn.preprocessing.FunctionTransformer(numpy.log, feature_names_out='one-to-one'),
    scaler = sklearn.preprocessing.StandardScaler()
    return sklearn.pipeline.make_pipeline(imputer, ratio_step, scaler)

class ClusterSimilarity(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    """Transformer producing RBF similarities to KMeans cluster centers.

    ``fit`` runs KMeans on the input; ``transform`` returns, for every row,
    its RBF-kernel similarity to each fitted cluster center (one output
    column per cluster).

    Parameters:
        n_clusters: number of KMeans clusters (and of output columns).
        gamma: ``gamma`` passed to ``rbf_kernel``.
        random_state: ``random_state`` passed to KMeans.
    """

    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        # Store the constructor arguments unchanged under the same names.
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit KMeans on ``X`` (optionally weighted) and return ``self``.

        ``y`` is accepted for pipeline compatibility and is not used.
        """
        self.kmeans_ = sklearn.cluster.KMeans(self.n_clusters, n_init='auto', random_state=self.random_state)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self  # always return self!

    def transform(self, X):
        """Return the RBF similarity of each row of ``X`` to each cluster center.

        Raises:
            sklearn.exceptions.NotFittedError: if ``fit`` has not been called.
        """
        # Fail with a clear "not fitted" error instead of an AttributeError
        # on the missing ``kmeans_`` attribute.
        sklearn.utils.validation.check_is_fitted(self, "kmeans_")
        return sklearn.metrics.pairwise.rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Return one name per cluster: ``"Cluster <i> similarity"``.

        ``names`` is accepted but not used.
        """
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]

#
# numpy.log takes an AxB matrix and returns an AxB matrix,
# so feeding it the 5-column matrix below yields a 5-column matrix.
# To name each output column, either pass a function that returns an array
# of length 5 (in the style of ratio_name above, but returning 5 names),
# or simply use 'one-to-one' to keep the input names unchanged.
#
log_pipeline = sklearn.pipeline.make_pipeline(
    sklearn.impute.SimpleImputer(strategy='median'),
    sklearn.preprocessing.FunctionTransformer(numpy.log, feature_names_out='one-to-one'),
    sklearn.preprocessing.StandardScaler(),
)

# Similarity-to-cluster-center features for the geographic columns
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1.0, random_state=42)

# Fallback pipeline applied to every column not listed explicitly below
default_num_pipeline = sklearn.pipeline.make_pipeline(
    sklearn.impute.SimpleImputer(strategy='median'),
    sklearn.preprocessing.StandardScaler()
)

preprocessing = sklearn.compose.ColumnTransformer([
    # column_ratio takes an Nx2 matrix and returns an Nx1 matrix
    ('bedrooms', ratio_pipeline(), ['total_bedrooms', 'total_rooms']),
    ('rooms_per_house', ratio_pipeline(), ['total_rooms', 'households']),
    ('people_per_house', ratio_pipeline(), ['population', 'households']),
    # numpy.log takes an AxB matrix and returns an AxB matrix,
    # so the 5-column matrix passed here yields a 5-column matrix
    ('log', log_pipeline, ['total_bedrooms', 'total_rooms', 'population','households','median_income']),
    ('geo', cluster_simil, ['latitude','longitude']),
    ('cat', cat_pipeline, sklearn.compose.make_column_selector(dtype_include=object)),
],
    remainder=default_num_pipeline) # one column remaining : housing_median_age

# Fit the whole preprocessing step on the training predictors and transform them
predictor_prepared = preprocessing.fit_transform(predictor)

print(predictor_prepared.shape)

# Last expression of the cell: displays the generated feature names
preprocessing.get_feature_names_out()

output_dir : images\end_to_end_project
(16512, 24)


array(['bedrooms__ratio', 'rooms_per_house__ratio',
       'people_per_house__ratio', 'log__total_bedrooms',
       'log__total_rooms', 'log__population', 'log__households',
       'log__median_income', 'geo__Cluster 0 similarity',
       'geo__Cluster 1 similarity', 'geo__Cluster 2 similarity',
       'geo__Cluster 3 similarity', 'geo__Cluster 4 similarity',
       'geo__Cluster 5 similarity', 'geo__Cluster 6 similarity',
       'geo__Cluster 7 similarity', 'geo__Cluster 8 similarity',
       'geo__Cluster 9 similarity', 'cat__ocean_proximity_<1H OCEAN',
       'cat__ocean_proximity_INLAND', 'cat__ocean_proximity_ISLAND',
       'cat__ocean_proximity_NEAR BAY', 'cat__ocean_proximity_NEAR OCEAN',
       'remainder__housing_median_age'], dtype=object)

### Grid Search
- Hyperparameter 찾을때 그걸 일일이 손으로 돌리거나 짜서 돌리기 보다는 Scikit-Learn의 GridSearchCV를 이용하는게 편리하다

In [11]:
# Full pipeline: preprocessing followed by a random forest regressor
full_pipeline = sklearn.pipeline.Pipeline([
    ('preprocessing', preprocessing),
    ('random_forest', sklearn.ensemble.RandomForestRegressor(random_state=42))
])

# Think of the double underscore ('__') as a '.'
# i.e. preprocessing__geo__n_clusters means: try preprocessing.geo.n_clusters with 5, 8 and 10
# and random_forest__max_features means: try random_forest.max_features with 4, 6 and 8

# NOTE: the two grids below overlap at n_clusters=10 with max_features 6 and 8,
# so those combinations are evaluated twice.
param_grid = [
    {'preprocessing__geo__n_clusters' : [5,8,10],
     'random_forest__max_features' : [4,6,8]},
     {'preprocessing__geo__n_clusters' : [10,15],
      'random_forest__max_features' : [6,8,10]}
]
# n_jobs=-1 : use all CPUs
grid_search = sklearn.model_selection.GridSearchCV(full_pipeline, param_grid, cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid_search.fit(predictor, label)


In [12]:
# Show the best hyperparameter combination found and the corresponding pipeline
print(grid_search.best_params_)
print(grid_search.best_estimator_)

{'preprocessing__geo__n_clusters': 15, 'random_forest__max_features': 6}
Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',
                                                              SimpleImputer(strategy='median')),
                                                             ('standardscaler',
                                                              StandardScaler())]),
                                   transformers=[('bedrooms',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('functiontransformer',
                                                                   FunctionTransformer(feature_names_out=<function ratio_name at 0x000001E...
                                                  ClusterSimilarity(n_clus

In [13]:
# Collect the cross-validation results into a DataFrame, highest mean score first
cv_res = pandas.DataFrame(grid_search.cv_results_)
cv_res.sort_values(by='mean_test_score', ascending=False, inplace=True)

# extra code – these few lines of code just make the DataFrame look nicer
cv_res = cv_res[["param_preprocessing__geo__n_clusters",
                 "param_random_forest__max_features", "split0_test_score",
                 "split1_test_score", "split2_test_score", "mean_test_score"]]
score_cols = ["split0", "split1", "split2", "mean_test_rmse"]
cv_res.columns = ["n_clusters", "max_features"] + score_cols  # this only renames the columns
# Scores come from 'neg_root_mean_squared_error': round to integers, then flip the sign
cv_res[score_cols] = -cv_res[score_cols].round().astype(numpy.int64)

cv_res.head()

Unnamed: 0,n_clusters,max_features,split0,split1,split2,mean_test_rmse
12,15,6,43412,43898,44821,44043
13,15,8,43953,44136,44881,44323
14,15,10,44228,44535,45300,44688
7,10,6,44872,44917,46281,45357
9,10,6,44872,44917,46281,45357


### Randomized Search
- Grid Search는 Grid사이즈가 커지면 그만큼 계산 시간이 많이 들어간다
- 따라서 fit해야 하는 parameter가 많으면 무작정 grid사이즈를 키우는게 부담이 커진다
- Randomized search는 iteration수를 정해놓고 그 안에서 가능한 random하게 sampling하는 거니까 (모델이 잘 맞는다는 전제 하에서) 꽤 효율적으로 트레이닝 할 수 있다

In [16]:
# Integer distributions to sample each hyperparameter from
# (scipy.stats.randint draws from low inclusive to high exclusive)
param_distribs = {
    'preprocessing__geo__n_clusters' : scipy.stats.randint(low=3, high=50),
    'random_forest__max_features' : scipy.stats.randint(low=2, high=20),
}

# 10 sampled candidates, 3-fold cross-validation each.
# No random_state is passed here, so the sampled candidates differ between runs.
rnd_search = sklearn.model_selection.RandomizedSearchCV(
    full_pipeline, param_distributions=param_distribs, n_iter=10, cv=3, 
    scoring='neg_root_mean_squared_error', n_jobs=-1)

rnd_search.fit(predictor, label)

In [17]:
# Show the best sampled hyperparameter combination and the corresponding pipeline
print(rnd_search.best_params_)
print(rnd_search.best_estimator_)

{'preprocessing__geo__n_clusters': 32, 'random_forest__max_features': 5}
Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',
                                                              SimpleImputer(strategy='median')),
                                                             ('standardscaler',
                                                              StandardScaler())]),
                                   transformers=[('bedrooms',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('functiontransformer',
                                                                   FunctionTransformer(feature_names_out=<function ratio_name at 0x000001E...
                                                  ClusterSimilarity(n_clus

In [19]:
# Collect the randomized-search results into a DataFrame, highest mean score first
rnd_res = pandas.DataFrame(rnd_search.cv_results_)
rnd_res.sort_values(by='mean_test_score', ascending=False, inplace=True)

# extra code – these few lines of code just make the DataFrame look nicer
rnd_res = rnd_res[["param_preprocessing__geo__n_clusters",
                 "param_random_forest__max_features", "split0_test_score",
                 "split1_test_score", "split2_test_score", "mean_test_score"]]
score_cols = ["split0", "split1", "split2", "mean_test_rmse"]
rnd_res.columns = ["n_clusters", "max_features"] + score_cols  # this only renames the columns
# Scores come from 'neg_root_mean_squared_error': round to integers, then flip the sign
rnd_res[score_cols] = -rnd_res[score_cols].round().astype(numpy.int64)

rnd_res.head()

Unnamed: 0,n_clusters,max_features,split0,split1,split2,mean_test_rmse
6,32,5,41688,42541,43383,42537
4,29,9,41934,42892,43381,42736
5,47,18,42322,43061,43217,42867
2,20,10,43505,43772,44543,43940
7,30,19,43707,44190,44598,44165


In [22]:
# RandomForestRegressor exposes feature importances
# Features whose importance is very low may be better dropped altogether

final_model = rnd_search.best_estimator_
feature_importances = final_model['random_forest'].feature_importances_
print(feature_importances.round(2))

# I did not pin the random factor, so the numbers differ somewhat from the book, but they come out nearly the same
# Pair each importance with its feature name, most important first
sorted(zip(feature_importances, final_model['preprocessing'].get_feature_names_out()), reverse=True)


[0.07 0.05 0.04 0.01 0.01 0.01 0.01 0.17 0.02 0.02 0.03 0.01 0.   0.01
 0.02 0.02 0.01 0.02 0.01 0.01 0.01 0.02 0.02 0.01 0.02 0.03 0.03 0.02
 0.01 0.01 0.02 0.01 0.02 0.02 0.01 0.02 0.03 0.02 0.03 0.02 0.01 0.06
 0.   0.   0.   0.01]


[(0.16717391235145979, 'log__median_income'),
 (0.06571217380852316, 'bedrooms__ratio'),
 (0.06394826856643815, 'cat__ocean_proximity_INLAND'),
 (0.05360849784628502, 'rooms_per_house__ratio'),
 (0.04188884888509996, 'people_per_house__ratio'),
 (0.03271952684919902, 'geo__Cluster 2 similarity'),
 (0.03184391693091836, 'geo__Cluster 17 similarity'),
 (0.03063472921165011, 'geo__Cluster 30 similarity'),
 (0.027242738551176426, 'geo__Cluster 18 similarity'),
 (0.026756652456949158, 'geo__Cluster 28 similarity'),
 (0.022791792399109122, 'geo__Cluster 14 similarity'),
 (0.022436978884625328, 'geo__Cluster 9 similarity'),
 (0.02013251651149785, 'geo__Cluster 6 similarity'),
 (0.01945288560654437, 'geo__Cluster 1 similarity'),
 (0.019165238686489533, 'geo__Cluster 13 similarity'),
 (0.018252155120488767, 'geo__Cluster 29 similarity'),
 (0.017912999441521204, 'geo__Cluster 25 similarity'),
 (0.017611857301085623, 'geo__Cluster 19 similarity'),
 (0.016772625239303663, 'geo__Cluster 0 similarit

## Test Set에 돌려보기

In [None]:
# Split the test set into predictors and label, just as was done for the training set
X_test = test.drop('median_house_value', axis=1)
y_test = test['median_house_value'].copy()

final_predictions = final_model.predict(X_test)
# Take the square root of the MSE explicitly instead of passing `squared=False`:
# that keyword is deprecated in scikit-learn 1.4 and removed in 1.6, while this
# form works on every version.
final_rmse = numpy.sqrt(sklearn.metrics.mean_squared_error(y_test, final_predictions))

## 그외

### HalvingRandomSearchCV, HalvingGridSearchCV

- 처음에는 resource를 적게 써서 training해서 간을 보고 점점 좋은 parameter를 찾아나가면서 full resource를 사용한다는 개념

### Ensemble Method

- model에 따라서 서로 에러가 나는 부분이 서로 다를 수 있음
- 그러니 여러 model을 써서 fit한 다음 이걸 combine해서 더 좋은 결과를 얻자 라는 approach가 있는 것