### Model evaluation for linear regression
Model metrics with sklearn for linear regression on the auto-mpg dataset

***
#### Environment
`conda activate sklearn-env`

***
#### Goals
- Build a pipeline
- Use the pipeline to transform data
- Use the pipeline to predict
- Compute various metrics specific to regression models

***
#### References

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

https://scikit-learn.org/stable/modules/model_evaluation.html


In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import random

# Make numpy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)

#### Dataset load from CSV located on UCI website.

http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data  
If the URL does not work the dataset can be loaded from the data folder `./data/auto-mpg.data`.

In [2]:
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                'Acceleration', 'Model Year', 'Origin']

raw_dataset = pd.read_csv(url, names=column_names,
                          na_values='?', comment='\t',
                          sep=' ', skipinitialspace=True)
dataset = raw_dataset.copy()
dataset.sample(5)

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
228,18.5,6,250.0,98.0,3525.0,19.0,77,1
58,25.0,4,97.5,80.0,2126.0,17.0,72,1
162,15.0,6,258.0,110.0,3730.0,19.0,75,1
24,21.0,6,199.0,90.0,2648.0,15.0,70,1
153,18.0,6,250.0,105.0,3459.0,16.0,75,1


### Dataset split
- row base in test and train datasets
- column base in features and labels

In [3]:
train_dataset = dataset.sample(frac=0.8, random_state=random.randint(0, 1000))
test_dataset = dataset.drop(train_dataset.index)

train_features = train_dataset.copy()
test_features = test_dataset.copy()

train_labels = train_features.pop('MPG')
test_labels = test_features.pop('MPG')

test_labels.reset_index(drop=True, inplace=True)

#### Build the pipeline

In [4]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

numeric_features = ['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model Year']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['Origin']
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

data_prep_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

#### Use the pipeline to transform data

In [5]:
data_prep_data = data_prep_pipeline.fit_transform(train_features, train_labels)

transformed_df = pd.DataFrame(data = data_prep_data)
transformed_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.344029,0.568093,-0.413884,0.214554,-0.402578,-1.36966,1.0,0.0,0.0
1,-0.845115,-1.019814,-1.020405,-1.010501,0.835384,1.105067,0.0,0.0,1.0
2,0.344029,0.083975,-0.413884,0.120502,0.544099,1.380036,1.0,0.0,0.0
3,1.533174,1.071575,0.931011,0.727672,-1.021559,0.555127,1.0,0.0,0.0
4,1.533174,1.226493,1.221086,1.4658,-0.948738,0.005188,1.0,0.0,0.0


#### Build the full pipeline

In [6]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', LinearRegression())])
pipeline_model = pipeline.fit(train_features, train_labels)
print(pipeline_model)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Cylinders', 'Displacement',
                                                   'Horsepower', 'Weight',
                                                   'Acceleration',
                                                   'Model Year']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Origin'])])),
                ('regressor', LinearRegression())])


In [7]:
from sklearn import set_config

set_config(display='diagram')
pipeline

#### Use pipeline to predict values

In [8]:
scored_data = pipeline.predict(test_features)

#### Show predicted values

In [9]:
scored_df = pd.DataFrame(data = scored_data)

scored_df = pd.concat([scored_df, test_labels], axis=1)
scored_df.head(5)

Unnamed: 0,0,MPG
0,14.72883,18.0
1,14.643889,17.0
2,21.536621,14.0
3,18.487941,22.0
4,18.969361,18.0


### Model evaluation

#### Mean squared error

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html    
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_error.html  
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_log_error.html#sklearn.metrics.mean_squared_log_error  

In [10]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error, mean_squared_log_error

def printMetrics(test_labels, scored_data):
    print( "RMSE:", mean_squared_error(test_labels, scored_data))
    print( "MAE: ", mean_absolute_error(test_labels, scored_data))
    print( "MSLE:", mean_squared_log_error(test_labels, scored_data))
    print( "RMSLE:", np.sqrt(mean_squared_log_error(test_labels, scored_data)))

In [11]:
printMetrics(test_labels, scored_data)

RMSE: 10.904181474440874
MAE:  2.6474773908768863
MSLE: 0.023185600631875913
RMSLE: 0.1522681865390007


### Build basic model without feature engineering and compare metrics

In [12]:
train_features_new = train_dataset[['Weight','Acceleration']].copy()
test_features_new = test_dataset[['Weight','Acceleration']].copy()

train_labels_new = train_labels.copy()
test_labels_new = test_labels.copy()

In [13]:
from sklearn.linear_model import LinearRegression

linear_regressor_new = LinearRegression().fit(train_features_new, train_labels_new)
scored_data_new = linear_regressor_new.predict(test_features_new)

In [14]:
scored_df = pd.DataFrame(data = scored_data_new)

scored_df = pd.concat([scored_df, test_labels_new], axis=1)
scored_df.head(5)

Unnamed: 0,0,MPG
0,18.508252,18.0
1,18.467751,17.0
2,20.936307,14.0
3,24.359014,22.0
4,24.783892,18.0


In [15]:
printMetrics(test_labels_new, scored_data_new)

RMSE: 18.759450456007006
MAE:  3.344734192206181
MSLE: 0.03314089402390515
RMSLE: 0.18204640623727003
