In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_boston
import pickle
import time

# Data

In [3]:
data = load_boston()
X_train, X_test, y_train, y_test = train_test_split(data['data'], data['target'])

## hyperparam tuning libraries

In [5]:
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge

### Pipeline components

In [6]:
scaler = StandardScaler()
pca = PCA()
ridge = Ridge()

In [12]:
X_train = scaler.fit_transform(X_train)
X_train

array([[ 0.66603934, -0.81263032,  0.24615936, ..., -0.21831035,
        -0.71285917, -0.61838153],
       [-0.85611071,  0.52162691,  0.1177767 , ..., -1.11301841,
         0.36523404, -0.09942835],
       [ 0.21513772,  1.14045386, -0.5175098 , ..., -2.01642436,
        -1.18605694, -0.34029554],
       ...,
       [-0.91874767, -0.13869132, -0.54592243, ..., -1.07086412,
         0.75301663, -0.17478847],
       [ 1.28164595, -0.4678556 , -0.33581692, ...,  0.00568857,
        -0.2803595 , -0.75318945],
       [ 1.60212666, -1.27209079, -0.25502214, ..., -1.00722695,
        -2.58266965, -1.01726047]])

In [13]:
X_train = pca.fit_transform(X_train)
X_train

array([[-0.75054808,  0.18321727,  0.57180042, ..., -0.97682665,
         0.66603934, -1.27419997],
       [ 0.04468248,  0.7666066 , -0.55934106, ...,  0.41210066,
        -0.85611071,  0.16669858],
       [ 1.78095664,  1.31765407, -0.95175562, ..., -2.14552174,
         0.21513772,  0.14983561],
       ...,
       [ 0.42462917,  1.39419359,  0.4223281 , ...,  1.16417192,
        -0.91874767,  0.34433432],
       [ 0.35673064,  1.2481798 ,  0.82630665, ...,  0.06096835,
         1.28164595, -0.0749759 ],
       [ 1.88178228,  2.80928898,  0.8356954 , ..., -0.8968215 ,
         1.60212666, -0.10830999]])

In [14]:
ridge.fit(X_train, y_train)

Ridge()

## Pipeline

In [15]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('reduce_dim', PCA()),
        ('regressor', Ridge())
        ])

In [17]:
pipe

Pipeline(steps=[('scaler', StandardScaler()), ('reduce_dim', PCA()),
                ('regressor', Ridge())])

In [18]:
pipe = pipe.fit(X_train, y_train)
pipe

Pipeline(steps=[('scaler', StandardScaler()), ('reduce_dim', PCA()),
                ('regressor', Ridge())])

In [19]:
print('Testing score: ', pipe.score(X_test, y_test))

Testing score:  -71362.66505410972


In [20]:
print(pipe.steps[1][1].explained_variance_)

[1.0026455 1.0026455 1.0026455 1.0026455 1.0026455 1.0026455 1.0026455
 1.0026455 1.0026455 1.0026455 1.0026455 1.0026455 1.0026455]


### Hyperparameter tuning

In [28]:
n_features_to_test = np.arange(1, 11)
n_features_to_test

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [29]:
alpha_to_test = 2.0**np.arange(-6, +6)
alpha_to_test

array([1.5625e-02, 3.1250e-02, 6.2500e-02, 1.2500e-01, 2.5000e-01,
       5.0000e-01, 1.0000e+00, 2.0000e+00, 4.0000e+00, 8.0000e+00,
       1.6000e+01, 3.2000e+01])

In [30]:
params = {'reduce_dim__n_components': n_features_to_test,\
              'regressor__alpha': alpha_to_test}

### GridSearch

In [32]:
pipe

Pipeline(steps=[('scaler', StandardScaler()), ('reduce_dim', PCA()),
                ('regressor', Ridge())])

In [31]:
from sklearn.model_selection import GridSearchCV
gridsearch = GridSearchCV(pipe, params, verbose=1).fit(X_train, y_train)
print('Final score is: ', gridsearch.score(X_test, y_test))

Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Final score is:  -65379.34597156049


[Parallel(n_jobs=1)]: Done 600 out of 600 | elapsed:    1.3s finished
