In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_boston
import pickle
import time

# Data

In [2]:
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs with scikit-learn < 1.2.
data = load_boston()
# Pin the shuffle seed so that Restart & Run All reproduces the same split
# (and therefore the same scores) on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data['data'], data['target'], random_state=42
)

## Hyperparameter tuning libraries

In [3]:
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge

### Pipeline components

In [4]:
# Stand-alone instances of the three stages, used below to walk through
# the pipeline one step at a time.
scaler, pca, ridge = StandardScaler(), PCA(), Ridge()

In [5]:
X_train = scaler.fit_transform(X_train)
X_train

array([[-0.35020242,  0.42548259, -1.06733892, ..., -2.5393948 ,
         0.38267811, -0.79034907],
       [-0.41666333, -0.48033345, -1.37211174, ..., -1.19093175,
         0.36169277, -0.55313025],
       [-0.33612891, -0.48033345, -0.19844332, ..., -0.0284636 ,
         0.43562082, -0.91505839],
       ...,
       [-0.40493821, -0.48033345, -0.0973408 , ...,  0.06453385,
         0.43562082, -0.25220124],
       [-0.38252961, -0.48033345, -0.19844332, ..., -0.0284636 ,
         0.42859019, -0.37419949],
       [ 0.11626863, -0.48033345,  1.00306492, ...,  0.80851347,
         0.30981531,  0.61534185]])

In [6]:
X_train = pca.fit_transform(X_train)
X_train

array([[-1.14474580e+00,  2.05569221e+00,  9.99357518e-01, ...,
        -4.34319806e-02,  1.77732345e-01, -1.38310401e-01],
       [-1.89740698e+00,  2.43674362e-01, -4.90238186e-02, ...,
         8.27581910e-01,  2.02357710e-03,  9.34370987e-01],
       [-8.12001332e-01,  5.70342588e-01, -6.94000352e-01, ...,
        -3.08028084e-01, -6.34433390e-02,  2.23794246e-02],
       ...,
       [-1.25000009e+00, -1.84546430e-01, -1.16918900e+00, ...,
        -4.72339255e-01, -3.17421298e-01, -9.62911304e-02],
       [-4.21469007e-01,  6.81541759e-01, -9.69394116e-01, ...,
         1.72508139e-02,  1.81040576e-01, -1.37216989e-02],
       [ 2.75936206e+00, -8.44644902e-02,  2.46838602e-01, ...,
         9.50481234e-02, -3.11298686e-01, -8.47572307e-02]])

In [7]:
ridge.fit(X_train, y_train)

Ridge()

## Pipeline

In [8]:
from sklearn.pipeline import Pipeline

# Same three stages as the manual walk-through, chained into one estimator.
# The step names are what the 'step__param' keys of the search grids refer to.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('reduce_dim', PCA()),
    ('regressor', Ridge()),
]
pipe = Pipeline(steps=pipeline_steps)

In [9]:
# Display the freshly built (not yet fitted) pipeline to check its steps.
pipe

Pipeline(steps=[('scaler', StandardScaler()), ('reduce_dim', PCA()),
                ('regressor', Ridge())])

In [10]:
# fit() trains every step in order and returns the pipeline itself,
# so no re-assignment is needed before displaying it.
pipe.fit(X_train, y_train)
pipe

Pipeline(steps=[('scaler', StandardScaler()), ('reduce_dim', PCA()),
                ('regressor', Ridge())])

In [11]:
# Score the fitted pipeline on the held-out split.
test_score = pipe.score(X_test, y_test)
print('Testing score: ', test_score)

Testing score:  -18318.560092604534


In [12]:
# Look the fitted PCA step up by name rather than by position in the step list.
fitted_pca = pipe.named_steps['reduce_dim']
print(fitted_pca.explained_variance_)

[1.0026455 1.0026455 1.0026455 1.0026455 1.0026455 1.0026455 1.0026455
 1.0026455 1.0026455 1.0026455 1.0026455 1.0026455 1.0026455]


### Hyperparameter tuning

In [13]:
# Candidate numbers of features/components to keep: 1 through 10 inclusive.
n_features_to_test = 1 + np.arange(10)
n_features_to_test

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [14]:
# Candidate ridge penalties: powers of two from 2**-6 up to 2**5.
alpha_exponents = np.arange(-6, 6)
alpha_to_test = np.power(2.0, alpha_exponents)
alpha_to_test

array([1.5625e-02, 3.1250e-02, 6.2500e-02, 1.2500e-01, 2.5000e-01,
       5.0000e-01, 1.0000e+00, 2.0000e+00, 4.0000e+00, 8.0000e+00,
       1.6000e+01, 3.2000e+01])

In [32]:
# Search grid for the basic run; each key is '<pipeline step>__<parameter>'.
params = {
    'reduce_dim__n_components': n_features_to_test,
    'regressor__alpha': alpha_to_test,
}

### GridSearch (Basic)

In [16]:
# Show the pipeline that is handed to the grid search in the next cell.
pipe

Pipeline(steps=[('scaler', StandardScaler()), ('reduce_dim', PCA()),
                ('regressor', Ridge())])

In [17]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `params`; verbose=1 logs the fold/candidate counts.
gridsearch = GridSearchCV(pipe, params, verbose=1)
gridsearch.fit(X_train, y_train)

# Score the search object on the held-out split.
final_score = gridsearch.score(X_test, y_test)
print('Final score is: ', final_score)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Final score is:  -1332.1176773186367


[Parallel(n_jobs=1)]: Done 600 out of 600 | elapsed:    1.3s finished


In [18]:
# Parameter combination selected by the basic grid search above.
gridsearch.best_params_

{'reduce_dim__n_components': 10, 'regressor__alpha': 0.015625}

### GridSearch (Advanced)

In [28]:
# Candidate estimators for the pipeline's 'scaler' step.
scalers_to_test = [
    StandardScaler(),
    RobustScaler(),
    QuantileTransformer(),
]

In [29]:
# Two sub-grids that differ only in the 'reduce_dim' step (PCA vs. SelectKBest)
# and in the name of the parameter that controls how many features it keeps.
shared_grid = {
    'scaler': scalers_to_test,
    'regressor__alpha': alpha_to_test,
}
params = [
    {
        **shared_grid,
        'reduce_dim': [PCA()],
        'reduce_dim__n_components': n_features_to_test,
    },
    {
        **shared_grid,
        'reduce_dim': [SelectKBest(f_regression)],
        'reduce_dim__k': n_features_to_test,
    },
]

In [30]:
# Re-run the search, this time also swapping the scaler and the reducer.
gridsearch = GridSearchCV(pipe, params)
gridsearch.fit(X_train, y_train)

final_score = gridsearch.score(X_test, y_test)
print('Final score is: ', final_score)





































































Final score is:  -17912.462459762046




In [31]:
# Parameter combination (including the chosen scaler and reducer) selected by the advanced search.
gridsearch.best_params_

{'reduce_dim': SelectKBest(k=9, score_func=<function f_regression at 0x7ffaf065dca0>),
 'reduce_dim__k': 9,
 'regressor__alpha': 8.0,
 'scaler': StandardScaler()}