### Exhaustive search over specified pipeline estimator parameters
HPO with sklearn for linear regression on the auto-mpg dataset

***
#### Environment
`conda activate sklearn-env`

***
#### Goals
***

- Build a pipeline
- Use the pipeline to transform data
- Use the pipeline to predict
- Gready search for best pipeline parameter pair using gread search cross validation technique

***
#### References

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html


In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import random

# Make numpy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)

#### Dataset load from CSV located on UCI website.

http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data  
If the URL does not work the dataset can be loaded from the data folder `./data/auto-mpg.data`.

In [2]:
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                'Acceleration', 'Model Year', 'Origin']

raw_dataset = pd.read_csv(url, names=column_names,
                          na_values='?', comment='\t',
                          sep=' ', skipinitialspace=True)
dataset = raw_dataset.copy()
dataset.sample(5)

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
230,15.5,8,350.0,170.0,4165.0,11.4,77,1
357,32.9,4,119.0,100.0,2615.0,14.8,81,3
244,43.1,4,90.0,48.0,1985.0,21.5,78,2
121,15.0,8,318.0,150.0,3399.0,11.0,73,1
63,14.0,8,400.0,175.0,4385.0,12.0,72,1


### Dataset split
- row base in test and train datasets
- column base in features and labels

In [3]:
train_dataset = dataset.sample(frac=0.8, random_state=random.randint(0, 1000))
test_dataset = dataset.drop(train_dataset.index)

train_features = train_dataset.copy()
test_features = test_dataset.copy()

train_labels = train_features.pop('MPG')
test_labels = test_features.pop('MPG')

test_labels.reset_index(drop=True, inplace=True)

#### Build the pipeline

In [4]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

numeric_features = ['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model Year']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['Origin']
categorical_transformer = OneHotEncoder(handle_unknown='error')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

data_prep_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

#### Use the pipeline to transform data

In [5]:
data_prep_data = data_prep_pipeline.fit_transform(train_features, train_labels)

transformed_df = pd.DataFrame(data = data_prep_data)
transformed_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-0.861904,-0.991885,-1.150362,-1.378623,0.297752,0.572728,0.0,0.0,1.0
1,1.485723,1.950488,1.840024,1.758271,-1.449075,-1.313207,1.0,0.0,0.0
2,-0.861904,-0.92523,-1.020345,-0.695666,1.830682,1.380985,1.0,0.0,0.0
3,1.485723,1.950488,3.270208,1.539254,-2.162066,-0.774368,1.0,0.0,0.0
4,-0.861904,-0.525295,-0.396265,-0.254098,0.083855,0.303308,1.0,0.0,0.0


#### Build the full pipeline

In [6]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', ElasticNet(random_state=0))])
pipeline_model = pipeline.fit(train_features, train_labels)
print(pipeline_model)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Cylinders', 'Displacement',
                                                   'Horsepower', 'Weight',
                                                   'Acceleration',
                                                   'Model Year']),
                                                 ('cat', OneHotEncoder(),
                                                  ['Origin'])])),
                ('regressor', ElasticNet(random_state=0))])


In [7]:
from sklearn import set_config

set_config(display='diagram')
pipeline

#### Use pipeline to predict values

In [8]:
scored_data = pipeline.predict(test_features)

#### Show predicted values

In [9]:
scored_df = pd.DataFrame(data = scored_data)

scored_df = pd.concat([scored_df, test_labels], axis=1)
scored_df.head(5)

Unnamed: 0,0,MPG
0,16.173961,16.0
1,13.360672,15.0
2,14.54993,15.0
3,12.889736,10.0
4,12.373267,9.0


### Model evaluation

#### Mean squared error

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html    
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_error.html  
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_log_error.html#sklearn.metrics.mean_squared_log_error  

In [10]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_log_error

def printMetrics(test_labels, scored_data):
    print( "RMSE:", mean_squared_error(test_labels, scored_data))
    print( "MAE: ", mean_absolute_error(test_labels, scored_data))
    print( "MSLE:", mean_squared_log_error(test_labels, scored_data))

In [11]:
printMetrics(test_labels, scored_data)

RMSE: 18.307138109819437
MAE:  3.2543371254539153
MSLE: 0.026774521632362887


In [12]:
pipeline.score(test_features, test_labels)

0.7104719939899797

In [13]:
from sklearn.model_selection import GridSearchCV


param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'preprocessor__cat__drop': ['first', None],
    'regressor__tol': [0.1, 0.01, 0.001, 0.0001],
    'regressor__selection': ["cyclic", 'random'],
    'regressor__alpha': [1.0, 0.5, 0.1],
    'regressor__l1_ratio': [0.1, 0.5, 1.0],  
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(train_features, train_labels)

print(f"Best params:")
print(grid_search.best_params_)

print(f"Internal CV score: {grid_search.best_score_:.3f}")

grid_search

Best params:
{'preprocessor__cat__drop': None, 'preprocessor__num__imputer__strategy': 'mean', 'regressor__alpha': 0.1, 'regressor__l1_ratio': 1.0, 'regressor__selection': 'cyclic', 'regressor__tol': 0.01}
Internal CV score: 0.809
