### Pipeline for linear regression
A scikit-learn pipeline for regularised linear (ridge) regression on the auto-mpg dataset.

***
#### Environment
`conda activate sklearn-env`

***
#### Goals

- Build a pipeline
- Use the pipeline to transform data
- Use the pipeline to predict
- Save the pipeline in various formats, then load it and use it to score

***
#### References

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

https://scikit-learn.org/stable/modules/model_persistence.html

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


# Make numpy printouts easier to read: show floats with 3 digits of precision
# and suppress scientific notation for small values.
np.set_printoptions(precision=3, suppress=True)

#### Load the dataset from the space-delimited data file hosted on the UCI website

http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data  
If the URL does not work, the dataset can be loaded from the local copy at `./data/auto-mpg.data`.

In [None]:
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
# Local copy of the same file, used when the download is not possible.
local_path = './data/auto-mpg.data'
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                'Acceleration', 'Model Year', 'Origin']

# Parsing options shared by both sources:
#   - the file has no header row, so column names are supplied explicitly
#   - '?' is read as a missing value
#   - fields are space separated; repeated spaces after a delimiter are skipped
#   - anything following a tab character is treated as a comment and ignored
read_options = dict(names=column_names,
                    na_values='?', comment='\t',
                    sep=' ', skipinitialspace=True)

try:
    raw_dataset = pd.read_csv(url, **read_options)
except OSError:
    # Network failures (urllib's URLError / HTTPError are OSError subclasses)
    # fall back to the copy shipped in the data folder.
    raw_dataset = pd.read_csv(local_path, **read_options)

# Work on a copy so the raw frame stays available unchanged.
dataset = raw_dataset.copy()
dataset.sample(5)

### Dataset split
- Row-wise, into train and test datasets
- Column-wise, into features and labels

In [None]:
# Row split: a reproducible 80% sample is the training set, the remaining rows are the test set.
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(index=train_dataset.index)

# Column split: 'MPG' is the label, every other column is a feature.
train_features = train_dataset.drop(columns='MPG')
test_features = test_dataset.drop(columns='MPG')

train_labels = train_dataset['MPG'].copy()
# The test labels get a fresh 0..n-1 index so they line up with a frame
# of predictions when the two are concatenated side by side.
test_labels = test_dataset['MPG'].reset_index(drop=True)

#### Build the pipeline

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

# Numeric columns: impute missing values with the median, expand to
# degree-2 polynomial terms, then standardise.
# NOTE(review): PolynomialFeatures presumably also emits a constant bias column
# by default, which carries no information after scaling - confirm whether
# include_bias=False is wanted.
numeric_features = ['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model Year']
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('poly', PolynomialFeatures(2)),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical column: one-hot encode; categories unseen at fit time are ignored.
categorical_features = ['Origin']
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each group of columns to its own transformer.
column_transformers = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Preprocessing-only pipeline (no estimator yet).
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

#### Use the pipeline to transform data

In [None]:
# Fit the preprocessing-only pipeline on the training split and keep the
# transformed feature matrix it returns.
pipeline_model = pipeline.fit_transform(train_features, train_labels)

# Wrap the matrix in a DataFrame so the first rows render as a table.
transformed_df = pd.DataFrame(pipeline_model)
transformed_df.head(10)

#### Build the full pipeline

In [None]:
# Full pipeline: the preprocessing stage followed by a ridge regressor.
regression_steps = [
    ('preprocessor', preprocessor),
    ('regressor', Ridge()),
]
pipeline = Pipeline(steps=regression_steps)

# Train preprocessing and regressor together on the training split.
pipeline_model = pipeline.fit(train_features, train_labels)
print(pipeline_model)

In [None]:
from sklearn import set_config

# Switch scikit-learn's estimator display to 'diagram' mode, then show the
# pipeline as the cell's output.
set_config(display='diagram')
pipeline

#### Use pipeline to predict values

In [None]:
# Predict MPG for the held-out test features with the fitted pipeline.
scored_data = pipeline.predict(test_features)

#### Show predicted values

In [None]:
from sklearn.metrics import mean_squared_error

scored_df = pd.DataFrame(data=scored_data)

# mean_squared_error returns the MSE; take the square root so the value
# reported below really is the root-mean-squared error, in MPG units.
rmse = np.sqrt(mean_squared_error(test_labels, scored_df))
print("RMSE is %f for %d records" % (rmse, len(test_features)))

# Put predictions next to the true labels. test_labels carries a 0..n-1 index,
# so the two frames align row by row.
scored_df = pd.concat([scored_df, test_labels], axis=1)
scored_df = scored_df.rename(columns={0: 'Predicted'})

scored_df.head(10)

### Save then load model in pickle format

In [None]:
import pickle
# Round-trip the fitted pipeline through an in-memory pickle byte string.
# NOTE: only unpickle data you trust - pickle.loads can execute arbitrary code.
s = pickle.dumps(pipeline)
pipeline_new = pickle.loads(s)

scored_test = pipeline_new.predict(test_features)
# The restored pipeline must reproduce the original pipeline's predictions.
np.testing.assert_allclose(scored_test, pipeline.predict(test_features))

# Label the prediction column 'Predicted', matching the earlier results table,
# instead of leaving it with the default integer label 0.
scored_df = pd.concat(
    [pd.DataFrame(data=scored_test, columns=['Predicted']), test_labels],
    axis=1)
scored_df.head(10)

### Alternative to Python's built-in pickle format: joblib

In [None]:
from joblib import dump, load
    
# Persist the fitted pipeline to disk with joblib, then load it back.
dump(pipeline, 'mypipeline.joblib')

pipeline2 = load('mypipeline.joblib')

scored_test = pipeline2.predict(test_features)
# The reloaded pipeline must reproduce the original pipeline's predictions.
np.testing.assert_allclose(scored_test, pipeline.predict(test_features))

# Label the prediction column 'Predicted', matching the earlier results table,
# instead of leaving it with the default integer label 0.
scored_df = pd.concat(
    [pd.DataFrame(data=scored_test, columns=['Predicted']), test_labels],
    axis=1)
scored_df.head(10)