In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import mean_squared_error 
from scipy.stats import randint

import warnings
warnings.filterwarnings('ignore')

# Preprocess Test
기본 Linear Regression 모델을 사용하여, 다양한 전처리 패키지의 적용 효과를 확인한다.

<br>

## Summary
전처리 프로세스를 적용하여 기본 선형회귀 모델의 테스트셋 MSE 향상
- **(전처리 미적용)0.419 -> (전처리 적용)0.392**

<br>

1. LOF(Local Outlier Factor)
    - 비지도 클러스터링 기반 아웃라이어 탐색
    - 가장 유의미한 효과를 확인하였음

2. PF (Polynomial Features)
    - 기존 11개 설명변수를 조합하여 67개의 새로운 설명변수 생성, 비선형 패턴 파악에 도움될 것으로 기대
    - 피쳐 셀렉션 및 차원축소 기법을 이어서 적용하면 추가적인 성능향상 기대 가능
3. RFE (Recursive Feature Elimination)
    - 주어진 모델로 데이터를 반복 학습하면서, 매 iteration마다 중요성 가장 낮은 설명변수를 제거
4. 차원축소 기법 (PCA, TSNE, LDA)
    - 대표적인 차원축소기법인 PCA만 테스트 진행

> **아웃라이어 제거 -> 변수 생성 -> 피쳐 셀렉션 -> 차원축소** 적용 결과:

> 전처리 미적용 테스트셋 MSE 0.419 -> 0.392 까지 기본 Linear Regression 모델의 성능 향상 확인.

> Feature Selection 갯수, 축소할 차원 수 등의 파라미터를 RandomizedSearchCV(StratifiedKFold=5)로 탐색,

> Best setting(pipeline parameter)은 아래와 같음.

```python
# Scaled 된 학습 데이터에 Local Outlier Factor 적용한 데이터로 학습

{'dimension_reduce': PCA(copy=True, iterated_power='auto', n_components=22, random_state=None,
     svd_solver='auto', tol=0.0, whiten=False),
 'dimension_reduce__n_components': 20,
 'feature_selection': RFE(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                normalize=False),
     n_features_to_select=43, step=1, verbose=0),
 'feature_selection__n_features_to_select': 52,
 'poly': PolynomialFeatures(degree=2, include_bias=True, interaction_only=False,
                    order='C'),
 'poly__degree': 2,
 'regressor': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 'scale': MinMaxScaler(copy=True, feature_range=(0, 1))}
```

<br>

## Conclusion
- 하이퍼파라미터 튜닝을 연산량 적은 모델에 대하여 아래와 같이 진행
    - (예시) Linear Regression (Basic), SVM Regressor (Kernel Trick), ...
- Local Outlier Factor로 아웃라이어 제거한 학습 데이터셋 사용
- `RandomizedSearchCV`를 이용한 파라미터 튜닝
    - `LOF -> PF -> RFE -> PCA` 전처리 파이프라인의 세부 파라미터 튜닝
    - 학습 모델 파라미터 튜닝 (학습방식이 다른 이하 4개 모델)

        



In [2]:
# load data
red = pd.read_csv('./data/winequality-red.csv', sep=';')
red = red.drop_duplicates(keep='last', ignore_index=True)

In [3]:
def split_data(df):
    X = df.drop('quality',axis=1)
    y = df['quality']
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        stratify=y,
                                                        shuffle=True,
                                                        random_state=2021)
    return X_train, X_test, y_train, y_test

In [4]:
X_train, X_test, y_train, y_test = split_data(red)

X_scaler, y_scaler = StandardScaler(), StandardScaler()
X_train_std = X_scaler.fit_transform(X_train)
X_test_std = X_scaler.transform(X_test)
y_train_std = y_scaler.fit_transform(np.array(y_train).reshape(-1,1))
y_test_std = y_scaler.transform(np.array(y_test).reshape(-1,1))

In [5]:
corr_red = red.corr()
top6_corr_vs_quality = np.abs(corr_red['quality']).sort_values(ascending=False)[1:7]
top6_features = top6_corr_vs_quality.index.tolist()
# top3_features = top6_features[:3]

top6_features

['alcohol',
 'volatile acidity',
 'sulphates',
 'citric acid',
 'density',
 'total sulfur dioxide']

In [6]:
red_top6_features = red[top6_features + ['quality']]

# Local Outlier Factor
- The number of neighbors considered (parameter n_neighbors) is typically set 
    - 1) greater than the minimum number of samples a cluster has to contain, so that other samples can be local outliers relative to this cluster
    - 2) smaller than the maximum number of close by samples that can potentially be local outliers. 
- In practice, such informations are generally not available, and taking n_neighbors=20 appears to work well in general.

In [7]:
lof = LocalOutlierFactor(n_neighbors=20)
y_pred = lof.fit_predict(X_train)
outlier_idx = np.where(y_pred==-1)[0]
print(len(outlier_idx))
print(outlier_idx)

25
[  37   42   43   50   90  114  127  258  274  359  386  404  420  424
  440  484  491  535  572  606  685  736  895  908 1071]


In [8]:
red.iloc[outlier_idx]['quality'].value_counts()

5    15
6     6
7     3
4     1
Name: quality, dtype: int64

In [9]:
# scale 적용시 아웃라이어 판정 결과가 달라짐.
lof = LocalOutlierFactor(n_neighbors=20)
y_pred_std = lof.fit_predict(X_train_std)
outlier_idx2 = np.where(y_pred_std==-1)[0]
print(len(outlier_idx2))
print(outlier_idx2)

47
[  28   42   50   90  116  165  180  185  238  273  274  281  379  384
  385  386  404  420  440  456  461  468  491  527  532  535  573  595
  606  620  621  653  708  725  732  736  738  744  753  807  825  839
  947  955  984  988 1067]


In [10]:
red.iloc[outlier_idx2]['quality'].value_counts()

5    20
6    18
7     8
4     1
Name: quality, dtype: int64

## Vanilla vs. Local Outlier Factor
Default Linear Regression Model
- Vanilla MSE = 0.427
- LOF MSE = 0.401
- Scaled LOF MSE = 0.378

In [11]:
def lr_train_mse(Xtr, Xte, ytr, yte, X_scaler=None, y_scaler=None):
    lr = LinearRegression()
    if X_scaler:
        xs = X_scaler
        Xtr = xs.fit_transform(Xtr)
        Xte = xs.transform(Xte)
    if y_scaler:
        ys = y_scaler
        ytr = ys.fit_transform(np.array(ytr).reshape(-1,1))
    
    # model train
    lr.fit(Xtr, ytr)
    
    # return MSE
    if not y_scaler:
        return np.mean(np.square(lr.predict(Xte) - yte))
    else:
        return np.mean(np.square(ys.inverse_transform(lr.predict(Xte).reshape(-1)) - yte))

In [12]:
X_train_lof, X_test_lof, y_train_lof, y_test_lof = split_data(red.drop(outlier_idx))
X_train_lof2, X_test_lof2, y_train_lof2, y_test_lof2 = split_data(red.drop(outlier_idx2))

In [13]:
lr_train_mse(X_train, X_test, y_train, y_test)

0.4190976461233711

In [14]:
lr_train_mse(X_train_lof, X_test_lof, y_train_lof, y_test_lof)

0.4249937427430061

In [15]:
# LOF 적용 후 스케일러 적용시 성능 변화 없음
lr_train_mse(X_train_lof, X_test_lof, y_train_lof, y_test_lof,
            StandardScaler(), StandardScaler())

0.4249937427430065

In [16]:
# 스케일러 적용 후 LOF 적용 시 소폭 성능 개선
lr_train_mse(X_train_lof2, X_test_lof2, y_train_lof2, y_test_lof2)

0.41579735690611885

# Polynomial Features

In [17]:
pf1 = PolynomialFeatures(degree=2)
X_train_pf_d2 = pf1.fit_transform(X_train)
X_test_pf_d2 = pf1.transform(X_test)

pf2 = PolynomialFeatures(degree=2)
X_train_std_pf_d2 = pf2.fit_transform(X_train_std)
X_test_std_pf_d2 = pf2.transform(X_test_std)

pf3 = PolynomialFeatures(degree=2)
X_train_lof_pf_d2 = pf3.fit_transform(X_train_lof)
X_test_lof_pf_d2 = pf3.transform(X_test_lof)

pf4 = PolynomialFeatures(degree=2)
X_train_lof2_pf_d2 = pf4.fit_transform(X_train_lof2)
X_test_lof2_pf_d2 = pf4.transform(X_test_lof2)

print(X_train.shape, X_train_pf_d2.shape, X_train_lof_pf_d2.shape, X_train_lof2_pf_d2.shape)

(1087, 11) (1087, 78) (1067, 78) (1049, 78)


In [18]:
# Vanilla vs. Polynomial Features (degree=2)
print(lr_train_mse(X_train, X_test, y_train, y_test))
print(lr_train_mse(X_train_std_pf_d2, X_test_std_pf_d2, y_train, y_test))
print(lr_train_mse(X_train_lof_pf_d2, X_test_lof_pf_d2, y_train_lof, y_test_lof))
print(lr_train_mse(X_train_lof2_pf_d2, X_test_lof2_pf_d2, y_train_lof2, y_test_lof2))

0.4190976461233711
0.41909986484819195
0.41000529727710927
1.389261663826348


# RFE (Recursive Feature Elimination)

In [19]:
lr = LinearRegression()
rfe = RFE(lr)
X_train_pf_d2_rfe = rfe.fit_transform(X_train_pf_d2, y_train)
X_test_pf_d2_rfe = rfe.transform(X_test_pf_d2)

lr_train_mse(X_train_pf_d2_rfe, X_test_pf_d2_rfe, y_train, y_test)

0.41978661636686554

In [20]:
rfe.ranking_

array([38,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, 10, 27, 17, 15,  1,
       25, 35,  1,  5,  1, 11,  1,  1, 20,  3, 34, 21,  1,  1, 13,  8,  1,
        7,  1, 23, 26,  1,  1, 16, 19, 18,  1, 37, 33,  1,  6, 12, 32,  1,
        4, 31,  1,  1,  1,  1, 36, 39,  1, 22, 14, 28, 40,  1, 24, 29, 30,
        1,  1,  1,  1,  1,  2,  1,  1,  1,  9])

In [32]:
lr = LinearRegression()
rfe = RFE(lr, 25)
X_train_pf_d2_rfe = rfe.fit_transform(X_train_pf_d2, y_train)
X_test_pf_d2_rfe = rfe.transform(X_test_pf_d2)

lr_train_mse(X_train_pf_d2_rfe, X_test_pf_d2_rfe, y_train, y_test)

0.39265701299785827

In [24]:
lr = LinearRegression()
rfe = RFE(lr, 25)
X_train_lof_pf_d2_rfe = rfe.fit_transform(X_train_lof_pf_d2, y_train_lof)
X_test_lof_pf_d2_rfe = rfe.transform(X_test_lof_pf_d2)

lr_train_mse(X_train_lof_pf_d2_rfe, X_test_lof_pf_d2_rfe, y_train_lof, y_test_lof)

0.39751526748651816

In [29]:
lr = LinearRegression()
rfe = RFE(lr, 20)
X_train_lof2_pf_d2_rfe = rfe.fit_transform(X_train_lof2_pf_d2, y_train_lof2)
X_test_lof2_pf_d2_rfe = rfe.transform(X_test_lof2_pf_d2)

lr_train_mse(X_train_lof2_pf_d2_rfe, X_test_lof2_pf_d2_rfe, y_train_lof2, y_test_lof2)

0.4496052647432139

# Dimenson Reduction
- PCA
    - https://data101.oopy.io/easy-understand-pca-lda
- TSNE
    - https://lovit.github.io/nlp/representation/2018/09/28/tsne/
- LDA
    - https://data101.oopy.io/easy-understand-pca-lda

### PCA

In [33]:
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
lr_train_mse(X_train_pca, X_test_pca, y_train, y_test)

0.4174369001330111

In [34]:
pca = PCA(n_components=30)
X_train_pf_d2_pca = pca.fit_transform(X_train_pf_d2)
X_test_pf_d2_pca = pca.transform(X_test_pf_d2)

lr_train_mse(X_train_pf_d2_pca, X_test_pf_d2_pca, y_train, y_test)

0.4047120613354118

In [35]:
pca = PCA(n_components=7)
X_train_pf_d2_rfe_pca = pca.fit_transform(X_train_pf_d2_rfe)
X_test_pf_d2_rfe_pca = pca.transform(X_test_pf_d2_rfe)

lr_train_mse(X_train_pf_d2_rfe_pca, X_test_pf_d2_rfe_pca, y_train, y_test)

0.41451787058889955

In [36]:
pca = PCA(n_components=15)
X_train_lof_pf_d2_rfe_pca = pca.fit_transform(X_train_lof_pf_d2_rfe)
X_test_lof_pf_d2_rfe_pca = pca.transform(X_test_lof_pf_d2_rfe)

lr_train_mse(X_train_lof_pf_d2_rfe_pca, X_test_lof_pf_d2_rfe_pca, y_train_lof, y_test_lof)

0.4031626711701112

In [37]:
pca = PCA(n_components=20)
X_train_lof2_pf_d2_rfe_pca = pca.fit_transform(X_train_lof2_pf_d2_rfe)
X_test_lof2_pf_d2_rfe_pca = pca.transform(X_test_lof2_pf_d2_rfe)

lr_train_mse(X_train_lof2_pf_d2_rfe_pca, X_test_lof2_pf_d2_rfe_pca, y_train_lof2, y_test_lof2)

0.44960526474344725

# Simple GridSearch for Preprocessing Pipeline

In [38]:
def grid_search_with_pipeline(Xtr, Xte, ytr, yte, pipe, params):
    grid = GridSearchCV(pipe, params,
                         scoring = 'neg_mean_squared_error',
                         cv=StratifiedKFold(n_splits=5,
                                              shuffle=True,
                                              random_state=2021),
                         verbose=1, n_jobs=-1)
    grid.fit(Xtr, ytr)
    print(grid.best_params_)
    print('Best CV MSE', -1 * grid.best_score_)
#     print('Mean CV MSE', -1 * grid.cv_results_['mean_test_score'])
    print('Test MSE', np.mean(np.square(grid.predict(Xte) - yte)))
    return grid

In [39]:
pipe = Pipeline([
                ('scale', MinMaxScaler()),
                ('regressor', LinearRegression())
                ])

param_grid = [              
              {'regressor': [LinearRegression()],
               'scale':[MinMaxScaler(), StandardScaler(), RobustScaler()],
              }
             ]

grd = grid_search_with_pipeline(X_train, X_test, y_train, y_test, pipe, param_grid)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


{'regressor': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False), 'scale': StandardScaler(copy=True, with_mean=True, with_std=True)}
Best CV MSE 0.44863702716400455
Test MSE 0.41909764612337114


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    1.6s finished


In [40]:
pipe = Pipeline([
                ('scale', MinMaxScaler()),
                ('poly', PolynomialFeatures()),
                ('feature_selection', RFE(LinearRegression())),
                ('regressor', LinearRegression())
                ])

param_grid = [              
              {'regressor': [LinearRegression()],
               'scale':[MinMaxScaler(), StandardScaler(), RobustScaler()],
               'poly':[PolynomialFeatures()],
               'poly__degree':[2],
              'feature_selection' : [RFE(LinearRegression())],
                'feature_selection__n_features_to_select' : range(20, 50)
              }
             ]

grd = grid_search_with_pipeline(X_train, X_test, y_train, y_test, pipe, param_grid)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   17.5s
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed:   17.7s finished


{'feature_selection': RFE(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                               normalize=False),
    n_features_to_select=39, step=1, verbose=0), 'feature_selection__n_features_to_select': 39, 'poly': PolynomialFeatures(degree=2, include_bias=True, interaction_only=False,
                   order='C'), 'poly__degree': 2, 'regressor': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False), 'scale': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True)}
Best CV MSE 0.44484501456046965
Test MSE 0.4263630438483234


### RandomizedSearchCV
- Best CV MSE 0.412
- Test MSE 0.404

In [42]:
def random_grid_search_with_pipeline(Xtr, Xte, ytr, yte, pipe, params, iter_num):
    r_grid = RandomizedSearchCV(pipe, params,
                         scoring = 'neg_mean_squared_error',
                         cv=StratifiedKFold(n_splits=5,
                                              shuffle=True,
                                              random_state=2021),
                         verbose=1, n_jobs=-1, n_iter=iter_num)
    r_grid.fit(Xtr, ytr)
    print(r_grid.best_params_)
    print('Best CV MSE', -1 * r_grid.best_score_)
    print('Test MSE', np.mean(np.square(r_grid.predict(Xte) - yte)))
    return r_grid

In [44]:
pipe = Pipeline([
                ('scale', MinMaxScaler()),
                ('poly', PolynomialFeatures()),
                ('feature_selection', RFE(LinearRegression())),
                ('dimension_reduce', PCA()),
                ('regressor', LinearRegression())
                ])

param_grid = [              
              {'regressor': [LinearRegression()],
               'scale':[MinMaxScaler(), StandardScaler(), RobustScaler()],
               'poly':[PolynomialFeatures()],
               'poly__degree':[2],
              'feature_selection' : [RFE(LinearRegression())],
              'feature_selection__n_features_to_select' : randint(low=20, high=60),
              'dimension_reduce' : [PCA()],
              'dimension_reduce__n_components' : randint(low=20, high=40)
              }
             ]

r_grd = random_grid_search_with_pipeline(X_train, X_test, y_train, y_test,
                                pipe, param_grid, iter_num=100)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   16.1s


{'dimension_reduce': PCA(copy=True, iterated_power='auto', n_components=30, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False), 'dimension_reduce__n_components': 30, 'feature_selection': RFE(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                               normalize=False),
    n_features_to_select=38, step=1, verbose=0), 'feature_selection__n_features_to_select': 38, 'poly': PolynomialFeatures(degree=2, include_bias=True, interaction_only=False,
                   order='C'), 'poly__degree': 2, 'regressor': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False), 'scale': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True)}
Best CV MSE 0.44307943909701136
Test MSE 0.4246970235102481


[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   18.1s finished


In [45]:
r_grd_lof = random_grid_search_with_pipeline(X_train_lof, X_test_lof, y_train_lof, y_test_lof,
                                pipe, param_grid, iter_num=100)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   13.5s


{'dimension_reduce': PCA(copy=True, iterated_power='auto', n_components=20, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False), 'dimension_reduce__n_components': 20, 'feature_selection': RFE(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                               normalize=False),
    n_features_to_select=52, step=1, verbose=0), 'feature_selection__n_features_to_select': 52, 'poly': PolynomialFeatures(degree=2, include_bias=True, interaction_only=False,
                   order='C'), 'poly__degree': 2, 'regressor': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False), 'scale': MinMaxScaler(copy=True, feature_range=(0, 1))}
Best CV MSE 0.44337678347123166
Test MSE 0.4256334483039431


[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   18.1s finished


In [46]:
r_grd_lof2 = random_grid_search_with_pipeline(X_train_lof2, X_test_lof2, y_train_lof2, y_test_lof2,
                                pipe, param_grid, iter_num=100)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   13.5s


{'dimension_reduce': PCA(copy=True, iterated_power='auto', n_components=22, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False), 'dimension_reduce__n_components': 22, 'feature_selection': RFE(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                               normalize=False),
    n_features_to_select=43, step=1, verbose=0), 'feature_selection__n_features_to_select': 43, 'poly': PolynomialFeatures(degree=2, include_bias=True, interaction_only=False,
                   order='C'), 'poly__degree': 2, 'regressor': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False), 'scale': MinMaxScaler(copy=True, feature_range=(0, 1))}
Best CV MSE 0.4413861465377984
Test MSE 0.40729732323872686


[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   17.2s finished


# Final Result
Model performance(MSE) using best model from RandomizedSearchCV
- 원본 데이터
    - MSE -> 0.404
- LocalOutlierFactor 데이터 (non-scaled 데이터에 적용)
    - MSE -> 0.393
- LocalOutlierFactor 데이터 (scaled 데이터에 적용)
    - MSE -> 0.375

In [47]:
def get_best_grid_model(grid_param_dict, Xtr, Xte, ytr, yte):
    
    def pipeline_application(pipeline, dataX, datay=None, test=False):
        result = dataX
        if not test:
            for pipe in pipeline:
                try:
                    result = pipe.fit_transform(result)
                except:
                    result = pipe.fit_transform(result, datay)
            return result, pipeline
        else:
            for pipe in pipeline:
                result = pipe.transform(result)
            return result

    scaler = grid_param_dict['scale']
    poly = grid_param_dict['poly']
    feature_selector = grid_param_dict['feature_selection']
    dim_reduce = grid_param_dict['dimension_reduce']
    model = grid_param_dict['regressor']
    pipeline = [scaler, poly, feature_selector, dim_reduce]

    Xtr_preprocess, pipeline = pipeline_application(pipeline, Xtr, datay=ytr)
    Xte_preprocess = pipeline_application(pipeline, Xte, test=True)
    
    model.fit(Xtr_preprocess, ytr)
    
    return model, np.mean(np.square(model.predict(Xte_preprocess) - yte))

In [48]:
get_best_grid_model(r_grd.best_params_, X_train, X_test, y_train, y_test)

(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 0.4262724695720125)

In [51]:
get_best_grid_model(r_grd_lof.best_params_,
                    X_train_lof, X_test_lof, y_train_lof, y_test_lof)

(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 0.392158043032267)

In [54]:
r_grd_lof.best_params_

{'dimension_reduce': PCA(copy=True, iterated_power='auto', n_components=22, random_state=None,
     svd_solver='auto', tol=0.0, whiten=False),
 'dimension_reduce__n_components': 20,
 'feature_selection': RFE(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                normalize=False),
     n_features_to_select=43, step=1, verbose=0),
 'feature_selection__n_features_to_select': 52,
 'poly': PolynomialFeatures(degree=2, include_bias=True, interaction_only=False,
                    order='C'),
 'poly__degree': 2,
 'regressor': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 'scale': MinMaxScaler(copy=True, feature_range=(0, 1))}

In [52]:
get_best_grid_model(r_grd_lof2.best_params_,
                    X_train_lof2, X_test_lof2, y_train_lof2, y_test_lof2)

(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 0.4072949307415827)

In [53]:
print(get_best_grid_model(r_grd_lof2.best_params_, X_train, X_test, y_train, y_test))
print(get_best_grid_model(r_grd_lof2.best_params_,
                    X_train_lof, X_test_lof, y_train_lof, y_test_lof))
print(get_best_grid_model(r_grd_lof2.best_params_,
                    X_train_lof2, X_test_lof2, y_train_lof2, y_test_lof2))

(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False), 0.4028665761899813)
(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False), 0.3921580262048367)
(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False), 0.40730241230916975)
