### Model evaluation for linear regression
Model metrics with sklearn for linear regression on the auto-mpg dataset

***
#### Environment
`conda activate sklearn-env`

***
#### Goals
- Build a pipeline
- Use the pipeline to transform data
- Use the pipeline to predict
- Compute various metrics specific to regression models

***
#### References

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

https://scikit-learn.org/stable/modules/model_evaluation.html


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import random

# Compact numpy output: three decimals and no scientific notation for small values.
np.set_printoptions(suppress=True, precision=3)

#### Dataset load from CSV located on UCI website.

http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data  
If the URL is unreachable, load the dataset from the local copy at `./data/auto-mpg.data` instead.

In [None]:
# Primary source: the auto-mpg file hosted by the UCI repository.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
# Local copy used when the download fails.
local_path = './data/auto-mpg.data'
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                'Acceleration', 'Model Year', 'Origin']

# Parsing options shared by both sources:
# - names=...            : column names are supplied explicitly
# - na_values='?'        : '?' entries are read as missing values
# - comment='\t'         : everything after a tab on a line is discarded
#                          (presumably the trailing car-name field -- verify against the file)
# - sep=' ' + skipinitialspace : fields are separated by one or more spaces
read_options = dict(names=column_names,
                    na_values='?', comment='\t',
                    sep=' ', skipinitialspace=True)

try:
    raw_dataset = pd.read_csv(url, **read_options)
except OSError as download_error:
    # Network failures (urllib's URLError / HTTPError) are OSError subclasses.
    # Fall back to the copy shipped in the data folder instead of aborting.
    print(f"Could not read {url} ({download_error}); loading {local_path} instead.")
    raw_dataset = pd.read_csv(local_path, **read_options)

# Work on a copy so the unmodified data stays available as raw_dataset.
dataset = raw_dataset.copy()
dataset.sample(5)

### Dataset split
- Row-wise: split the rows into train and test datasets
- Column-wise: split the columns into features and labels

In [None]:
# Fixed seed so the 80/20 split, and every metric computed from it, is
# identical on each Restart & Run All. Change the value to draw a different split.
SPLIT_SEED = 42

# Row-wise split: 80% of the rows for training, the remaining rows for testing.
train_dataset = dataset.sample(frac=0.8, random_state=SPLIT_SEED)
test_dataset = dataset.drop(train_dataset.index)

# Column-wise split: copy first so that pop() leaves train_dataset/test_dataset intact.
train_features = train_dataset.copy()
test_features = test_dataset.copy()

train_labels = train_features.pop('MPG')
# Renumber the test labels 0..n-1 so they can be aligned by position with
# the array of predictions produced later.
test_labels = test_features.pop('MPG').reset_index(drop=True)

#### Build the pipeline

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

# Numeric columns: fill missing values with the column median, then standardize.
numeric_features = [
    'Cylinders', 'Displacement', 'Horsepower',
    'Weight', 'Acceleration', 'Model Year',
]
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical column: one-hot encode; handle_unknown='ignore' avoids an error
# when transform() meets a category that was not seen during fit().
categorical_features = ['Origin']
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each group of columns to its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Preprocessing-only pipeline (no estimator), used to inspect the transformed data.
data_prep_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
    ]
)

#### Use the pipeline to transform data

In [None]:
# Fit the preprocessing steps on the training features and transform them in one call.
data_prep_data = data_prep_pipeline.fit_transform(train_features, train_labels)

# Wrap the transformed array in a DataFrame to preview the first rows.
transformed_df = pd.DataFrame(data_prep_data)
transformed_df.head()

#### Build the full pipeline

In [None]:
# Full pipeline: the shared preprocessor followed by an ordinary least-squares regressor.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ]
)

# Train preprocessing and regressor together on the training split.
pipeline_model = pipeline.fit(train_features, train_labels)
print(pipeline_model)

In [None]:
from sklearn import set_config

# Switch scikit-learn's estimator display to the 'diagram' mode.
set_config(display='diagram')
# Bare last expression: the notebook renders the pipeline using the display mode set above.
pipeline

#### Use pipeline to predict values

In [None]:
# Predict MPG for the held-out test rows; the fitted pipeline runs its
# 'preprocessor' step before the 'regressor' step.
scored_data = pipeline.predict(test_features)

#### Show predicted values

In [None]:
# Pair each prediction with the true MPG of the same test row.
# The two columns are combined by position (plain arrays) rather than by
# index label, so the table is correct regardless of the index carried by
# test_labels, and the prediction column gets a readable name instead of `0`.
scored_df = pd.DataFrame({
    'Predicted MPG': np.ravel(scored_data),
    'MPG': np.asarray(test_labels),
})
scored_df.head(5)

### Model evaluation

#### Regression error metrics (MSE, MAE, MSLE)

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html    
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_error.html  
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_log_error.html#sklearn.metrics.mean_squared_log_error  

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error, mean_squared_log_error

def printMetrics(test_labels, scored_data):
    """Print regression error metrics for a set of predictions.

    Parameters
    ----------
    test_labels : array-like
        Ground-truth target values.
    scored_data : array-like
        Predicted values, in the same order as ``test_labels``.

    Returns
    -------
    None
        The metrics are written to standard output.

    Notes
    -----
    Per the scikit-learn documentation, ``mean_squared_log_error`` cannot be
    used when the targets or predictions contain negative values; in that
    case the call is expected to raise ``ValueError``.
    """
    mse = mean_squared_error(test_labels, scored_data)
    mae = mean_absolute_error(test_labels, scored_data)
    # Computed once and reused for both the MSLE and RMSLE lines.
    msle = mean_squared_log_error(test_labels, scored_data)

    print( "MSE: ", mse)
    # RMSE is the square root of the MSE; previously the raw MSE was printed
    # under the "RMSE:" label.
    print( "RMSE:", np.sqrt(mse))
    print( "MAE: ", mae)
    print( "MSLE:", msle)
    print( "RMSLE:", np.sqrt(msle))

In [None]:
# Report the error metrics of the full pipeline's predictions on the test split.
printMetrics(test_labels, scored_data)

### Build a basic model on the raw `Weight` and `Acceleration` columns (no feature engineering) and compare metrics

In [None]:
# Baseline inputs: only the raw Weight and Acceleration columns, with no
# imputation, scaling or encoding applied.
train_features_new = train_dataset.loc[:, ['Weight', 'Acceleration']].copy()
test_features_new = test_dataset.loc[:, ['Weight', 'Acceleration']].copy()

# Independent copies of the targets used by the full pipeline.
train_labels_new, test_labels_new = train_labels.copy(), test_labels.copy()

In [None]:
from sklearn.linear_model import LinearRegression

# Plain linear regression on the two raw columns.
linear_regressor_new = LinearRegression()
linear_regressor_new.fit(train_features_new, train_labels_new)

# Predictions for the held-out rows of the baseline feature set.
scored_data_new = linear_regressor_new.predict(test_features_new)

In [None]:
# Pair each baseline prediction with the true MPG of the same test row.
# The columns are combined by position (plain arrays) rather than by index
# label, and the prediction column gets a readable name instead of `0`.
scored_df = pd.DataFrame({
    'Predicted MPG': np.ravel(scored_data_new),
    'MPG': np.asarray(test_labels_new),
})
scored_df.head(5)

In [None]:
# Same metrics for the two-feature baseline, for comparison with the full pipeline above.
printMetrics(test_labels_new, scored_data_new)