# Preprocessing and scaling
Follows _Introduction to Machine Learning with Python_  
- [Chapter 3](https://github.com/amueller/introduction_to_ml_with_python/blob/master/03-unsupervised-learning.ipynb) Section 3.3 Preprocessing and Scaling


Some algorithms are sensitive to the scaling of numerical features:
- PCA
- SVM
- Neural networks
- Regularized models

Preprocessing is used to bring numerical features to a similar scale.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import mglearn

### Different scalers 

In [None]:
# Render mglearn's scaling demo figure -- presumably the original data next to
# the output of each scaler; confirm against the mglearn source.
mglearn.plots.plot_scaling()

1. StandardScaler: Mean of zero, variance of one
2. MinMaxScaler: Minimum of zero, maximum of one
3. RobustScaler: Median of zero, interquartile range of one
4. Normalizer: Each sample (row) has unit norm.



In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html

# Normalizer example: the original row [4, 1, 2, 2] has Euclidean length 5,
# so dividing it by 5 gives the unit-length row below.
row = np.array([0.8, 0.2, 0.4, 0.4])
# Euclidean (L2) length of the transformed row: sum the squares, take the root
squared_sum = np.sum(row ** 2)
np.sqrt(squared_sum)

### Applying data transformations

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Breast-cancer dataset that ships with scikit-learn
cancer = load_breast_cancer()

# A fixed seed keeps the train/validation partition reproducible
X_train, X_val, y_train, y_val = train_test_split(
    cancer.data, cancer.target, random_state=1)

# Report the shape of each feature partition
for features in (X_train, X_val):
    print(features.shape)

**Using a minmax scaler**

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Create the scaler; it is not fitted to any data yet
scaler = MinMaxScaler()

In [None]:
# Learn the scaling parameters from the training data only
scaler.fit(X_train)

In [None]:
# Apply the scaling learned by the fitted scaler to the training data
X_train_scaled = scaler.transform(X_train)

# Dataset properties before and after scaling, listed in print order
summary = (
    ("transformed shape: {}", X_train_scaled.shape),
    ("per-feature minimum before scaling:\n {}", X_train.min(axis=0)),
    ("per-feature maximum before scaling:\n {}", X_train.max(axis=0)),
    ("per-feature minimum after scaling:\n {}", X_train_scaled.min(axis=0)),
    ("per-feature maximum after scaling:\n {}", X_train_scaled.max(axis=0)),
)
for template, value in summary:
    print(template.format(value))

**Transform the validation data with the _trained_ scaler**

In [None]:
# Reuse the scaler fitted on the training data; it is NOT refit here
X_val_scaled = scaler.transform(X_val)

# Per-feature extremes of the validation data after that transformation
val_minimum = X_val_scaled.min(axis=0)
val_maximum = X_val_scaled.max(axis=0)
print("per-feature minimum after scaling:\n{}".format(val_minimum))
print("per-feature maximum after scaling:\n{}".format(val_maximum))

### Important: Scalers are learned - do not call fit with validation/test data

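Otherwise the validation/test data is transformed differently from the training data, so the model sees validation/test points in a different coordinate system than the one it was trained on.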

In [None]:
from sklearn.datasets import make_blobs

# Synthetic two-feature blobs; the cluster labels are discarded
X, _ = make_blobs(n_samples=50, centers=5, random_state=4, cluster_std=2)
# Hold out 10% of the points as the validation set
X_train, X_val = train_test_split(X, random_state=5, test_size=.1)

# Correct approach: fit the scaler on the training set and apply that same
# fitted scaler to both the training and the validation set
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Wrong approach: a second scaler fitted on the validation set itself,
# so validation min is 0 and validation max is 1.
# DO NOT DO THIS! For illustration purposes only.
val_scaler = MinMaxScaler()
val_scaler.fit(X_val)
X_val_scaled_badly = val_scaler.transform(X_val)

# One panel per scenario: (title, training points, validation points,
# training-set label)
fig, axes = plt.subplots(1, 3, figsize=(13, 4))
panels = [
    ("Original Data", X_train, X_val, "Training set"),
    ("Scaled Data", X_train_scaled, X_val_scaled, "Training set"),
    ("Improperly Scaled Data", X_train_scaled, X_val_scaled_badly,
     "training set"),
]
for index, ax in enumerate(axes):
    title, train_points, val_points, train_label = panels[index]
    ax.scatter(train_points[:, 0], train_points[:, 1],
               color=mglearn.cm2(0), label=train_label, s=60)
    ax.scatter(val_points[:, 0], val_points[:, 1], marker='^',
               color=mglearn.cm2(1), label="Validation set", s=60)
    if index == 0:
        # Only the first panel carries a legend
        ax.legend(loc='upper left')
    ax.set_title(title)
    ax.set_xlabel("Feature 0")
    ax.set_ylabel("Feature 1")
fig.tight_layout()

### Side note: `fit_transform()` for efficiency

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Two-step form: fit() is chained directly into transform() on the same data
# (method chaining)
X_scaled = scaler.fit(X_train).transform(X_train)
# One-step form: fit and transform in a single call -- same result, but more
# efficient computation
X_scaled_d = scaler.fit_transform(X_train)

### Effect of scaling on supervised learning

**Without feature scaling**

In [None]:
from sklearn.svm import SVC

# Fresh split of the cancer data (seed 0); features are left unscaled
X_train, X_val, y_train, y_val = train_test_split(
    cancer.data, cancer.target, random_state=0)

# Baseline: SVM trained directly on the raw features
svm = SVC(C=100)
svm.fit(X_train, y_train)
raw_accuracy = svm.score(X_val, y_val)
print("Validation set accuracy: {:.2f}".format(raw_accuracy))

**With MinMax feature scaling**

In [None]:
# 0-1 scaling: fit the scaler on the training data and transform it in one
# call, then apply the same fitted scaler to the validation data
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Retrain the same SVM object on the scaled training data
svm.fit(X_train_scaled, y_train)

# Score on the validation set scaled with the training-fitted scaler
minmax_accuracy = svm.score(X_val_scaled, y_val)
print("Scaled Validation set accuracy: {:.2f}".format(minmax_accuracy))

**With Standard feature scaling**

In [None]:
# Zero-mean / unit-variance scaling, fitted on the training data only
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Retrain the same SVM object on the standardized training data
svm.fit(X_train_scaled, y_train)

# Score on the validation set scaled with the training-fitted scaler
standard_accuracy = svm.score(X_val_scaled, y_val)
print("SVM validation accuracy: {:.2f}".format(standard_accuracy))

## Exercise: apply feature scaling to energy dataset

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

def get_regressor_mse(model, X, y, scaler):
    '''Calculate train and validation mean-squared error (mse) of regressor (model)
        
        Splits feature matrix X and target vector y 
        with sklearn train_test_split() and random_state=956.
        
        Applies a sklearn scaler: the scaler is fitted on the training split
        only, and the fitted scaler is then used to transform the validation
        split. The passed scaler and model objects are fitted in place.
        
        model (sklearn regressor): Regressor to train and evaluate
        X (numpy.array or pandas.DataFrame): Feature matrix
        y (numpy.array or pandas.Series): Target vector
        scaler (sklearn scaler or None): Object with fit_transform() and
            transform(); None skips scaling (unscaled baseline)
        
        returns: training mse, validation mse
    
    '''
    # Split first, so the scaler never sees validation data while fitting
    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=956)
    
    if scaler is None:
        # No scaler given: train and evaluate on the raw features
        X_train_scaled, X_val_scaled = X_train, X_val
    else:
        # Learn scaler on training set (and scale the training set with it)
        X_train_scaled = scaler.fit_transform(X_train)
        # Transform validation set with learned scaler -- no refit here
        X_val_scaled = scaler.transform(X_val)
    
    model.fit(X_train_scaled, y_train)
    y_train_pred = model.predict(X_train_scaled)
    y_val_pred = model.predict(X_val_scaled)

    return mean_squared_error(y_train, y_train_pred), mean_squared_error(y_val, y_val_pred)

In [None]:
from yellowbrick.datasets import load_energy

# Energy dataset: feature matrix X and target y
X, y = load_energy()

# Summarize shape and container type of the features and the target
features_summary = 'X.shape={}, type(X)={}'.format(X.shape, type(X))
target_summary = 'y.shape={}, type(y)={}'.format(y.shape, type(y))
print(features_summary)
print(target_summary)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

models = [LinearRegression(), RandomForestRegressor(random_state=88)]

# Scalers to compare; each one is passed to get_regressor_mse() in turn
scalers = [StandardScaler(), MinMaxScaler(), RobustScaler()]

# Iterate all models
for model in models:
    # Iterate all scalers
    for scaler in scalers:
        # The reported "score" values are mean-squared errors (lower is better)
        train, val = get_regressor_mse(model, X, y, scaler)
        print("{} - {}, train_score={:.1f}, validation_score={:.1f}".format(
            model.__class__.__name__,
            scaler.__class__.__name__,
            train,
            val))


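Scaling does not seem to make a difference for these two models: the predictions of an unregularized linear regression and of tree-based models such as random forests are essentially unaffected by per-feature rescaling.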