### Feature engineering - Scalers
StandardScaler and MinMaxScaler with sklearn on the auto-mpg dataset

***
#### Environment
`conda activate sklearn-env`

***
#### Goals
***
- Replace continuous features with their scaled version

***
#### References

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html  
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html

#### Basic python imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import random 


# Make numpy printouts easier to read: show 3 digits of precision and
# (suppress=True) print small values in fixed-point instead of scientific notation.
np.set_printoptions(precision=3, suppress=True)

#### Dataset load from CSV located on UCI website.

http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data  
If the URL does not work, the dataset can be loaded from the local data folder: `./data/auto-mpg.data`.

In [None]:
# Location of the raw auto-mpg file; column names are supplied explicitly below.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
column_names = [
    'MPG',
    'Cylinders',
    'Displacement',
    'Horsepower',
    'Weight',
    'Acceleration',
    'Model Year',
    'Origin',
]

# Parsing rules: '?' marks a missing value, fields are separated by spaces
# (extra spaces after a delimiter are skipped) and the remainder of a line
# after a tab character is treated as a comment.
raw_dataset = pd.read_csv(
    url,
    names=column_names,
    na_values='?',
    comment='\t',
    sep=' ',
    skipinitialspace=True,
)

# Keep the freshly parsed frame untouched; later cells work on this copy.
dataset = raw_dataset.copy()
dataset.tail(2)

### Dataset split
- row-wise into train and test datasets
- column-wise into features and labels

In [None]:
# Target (MPG) plus the features used in this run.
# Other column subsets that can be swapped in for experiments:
#   ['MPG', 'Displacement', 'Acceleration', 'Model Year', 'Origin']
#   ['MPG', 'Weight', 'Displacement', 'Model Year', 'Origin']
#   ['MPG', 'Weight', 'Acceleration', 'Model Year', 'Origin']
#   ['MPG', 'Weight']
selected_columns = ['MPG', 'Weight', 'Displacement', 'Acceleration', 'Model Year', 'Origin']

# dropna() runs before the column selection, so a row with a missing value in
# any column of `dataset` is removed, not only rows incomplete in the selection.
dataset = dataset.dropna()[selected_columns].copy()

# Row-wise split: a seeded 80 % sample is the training set, the rest is the test set.
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)

# Column-wise split: copy each part, then pop the MPG target out of the features.
train_features, test_features = train_dataset.copy(), test_dataset.copy()
train_labels, test_labels = train_features.pop('MPG'), test_features.pop('MPG')

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# scaler to the test features so both splits share one transformation.
std_scaler = StandardScaler()
std_train_features = std_scaler.fit_transform(train_features)
std_test_features = std_scaler.transform(test_features)
print(std_train_features)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max normalisation: the scaler is fitted on the training features only and
# the same fitted scaler is then applied to the test features.
# (Previously a StandardScaler was instantiated here, which made the "norm"
# features an exact duplicate of the "std" features.)
norm_scaler = MinMaxScaler().fit(train_features)
norm_train_features = norm_scaler.transform(train_features)
norm_test_features = norm_scaler.transform(test_features)
print(norm_train_features)

In [None]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression

def predictRidge(train, test, labels, col, scaling, random_state=0):
    """Fit a Ridge regressor, report its iteration count and store test predictions.

    Parameters
    ----------
    train : array-like
        Training features passed to ``Ridge.fit``.
    test : array-like
        Features to predict on; the predictions are stored row by row in the
        module-level ``test_dataset``.
    labels : array-like
        Training targets.
    col : str
        Name of the ``test_dataset`` column that receives the predictions.
    scaling : str
        Label describing the feature scaling; only used in the printed message.
    random_state : int or None, default 0
        Seed forwarded to ``Ridge``. The 'saga' solver shuffles the data, so a
        fixed seed makes the iteration count and predictions reproducible.

    Returns
    -------
    The predictions for ``test`` (also written to ``test_dataset[col]``).
    """
    linear_regressor = Ridge(
        alpha=0.01, max_iter=1000, solver='saga', random_state=random_state
    ).fit(train, labels)

    # n_iter_ is documented as an array with one entry per target; reduce it to
    # a plain int before formatting it with %d.
    # NOTE(review): a count equal to max_iter presumably means the iteration
    # cap was hit rather than convergence reached -- confirm against warnings.
    n_iter = int(np.max(linear_regressor.n_iter_))
    print("Training with %s data converged in %d iterations" % (scaling, n_iter))

    scored_test = linear_regressor.predict(test)
    test_dataset[col] = scored_test
    return scored_test

def predictOLS(train, test, labels, col, scaling):
    """Fit an ordinary least squares model and store its test predictions.

    The model is fitted on ``train``/``labels`` and its predictions for
    ``test`` are written to column ``col`` of the module-level
    ``test_dataset``. Nothing is returned. ``scaling`` is accepted so the
    signature matches ``predictRidge`` but it is not used here.
    """
    ols_model = LinearRegression()
    ols_model.fit(train, labels)
    test_dataset[col] = ols_model.predict(test)

# One entry per feature variant: (train features, test features, result column, tag).
ridge_runs = [
    (train_features, test_features, 'Ridge', 'ridge raw'),
    (std_train_features, std_test_features, 'Ridge Std', 'ridge std'),
    (norm_train_features, norm_test_features, 'Ridge Norm', 'ridge norm'),
]
ols_runs = [
    (train_features, test_features, 'OLS', 'raw'),
    (std_train_features, std_test_features, 'OLS Std', 'ols std'),
    (norm_train_features, norm_test_features, 'OLS Norm', 'ols norm'),
]

# Ridge first, then OLS; each call adds one prediction column to test_dataset.
for fit_and_store, runs in ((predictRidge, ridge_runs), (predictOLS, ols_runs)):
    for train_x, test_x, column, tag in runs:
        fit_and_store(train_x, test_x, train_labels, column, tag)

test_dataset.sample(10)

#### Predict and compare results

In [None]:
# https://machinelearningmastery.com/prediction-intervals-for-machine-learning/
from numpy import sum as arraysum
from numpy import sqrt
from numpy import mean
from sklearn.metrics import mean_squared_error

def predictionInterval(y, y_pred, name):
    """Print a prediction-interval half-width and the RMSE of a set of predictions.

    Parameters
    ----------
    y : array-like
        Observed target values. More than two observations are required,
        because the error standard deviation divides by ``len(y) - 2``.
    y_pred : array-like
        Predicted values, element-wise comparable with ``y``.
    name : str
        Label of the model, used in the printed messages.

    The interval is 1.96 times the standard deviation of the prediction errors
    (1.96 being the usual 95 % multiplier under a Gaussian assumption). The
    second line reports the root mean squared error; the previous version
    printed the plain mean squared error under the "RMSE" label.
    """
    n_obs = len(y)
    sum_errs = arraysum((y - y_pred)**2)

    # Standard deviation of the errors with n - 2 degrees of freedom.
    stdev = sqrt(1/(n_obs-2) * sum_errs)
    interval = 1.96 * stdev

    # RMSE = sqrt(mean squared error) = sqrt(sum of squared errors / n).
    rmse = sqrt(sum_errs / n_obs)

    print("Prediction interval for %s is %f" % (name, interval))
    print("RMSE for %s is %f" % (name, rmse))

# Heading -> list of (prediction column in test_dataset, label used in the report).
reports = {
    "\nRIDGE": [
        ('Ridge', 'Ridge Raw'),
        ('Ridge Std', 'Ridge Std'),
        ('Ridge Norm', 'Ridge Norm'),
    ],
    "\nOLS": [
        ('OLS', 'OLS Raw'),
        ('OLS Std', 'OLS Std'),
        ('OLS Norm', 'OLS Norm'),
    ],
}

# Every prediction column is compared against the observed MPG values.
for heading, entries in reports.items():
    print(heading)
    for column, label in entries:
        predictionInterval(test_dataset['MPG'], test_dataset[column], label)