# Standard Scaler 
(centering mean on zero and std dev on 1 for each variable)

In [1]:
#The function scale provides a quick and easy way to perform this operation on a single array-like dataset:


from sklearn import preprocessing
import numpy as np

#Build dataset with three columns and three rows
#Structure is visually the same as a typical dataframe (i.e.-columns are up and down and rows side to side)

X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])
X_scaled = preprocessing.scale(X_train)

X_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

To calculate the standard scaler subtract mean from observation value and divide resutl by standard deviation.
Scaled data has zero mean and unit variance:

In [2]:
print(X_scaled.mean(axis=0)) # axis=0 refers to columns
print(X_scaled.std(axis=0))



[0. 0. 0.]
[1. 1. 1.]


# Using StandardScaler()
The preprocessing module further provides a utility class StandardScaler that implements the Transformer API to compute the mean and standard deviation on a training set so as to be able to later reapply the same transformation on the testing set. This class is hence suitable for use in the early steps of a sklearn.pipeline.Pipeline:

In [3]:
#Here we set up the the standard scaler to the X_train data using fit()

scaler = preprocessing.StandardScaler().fit(X_train)
print(scaler) # show details of scaler object




#Here we apply the fit standard scaler to the X_train data using transform()

scaler.transform(X_train)                         

# print(scaler.mean_) #print the means  per column                           

     

StandardScaler(copy=True, with_mean=True, with_std=True)


array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [4]:
#The scaler instance can then be used on new data to transform it the same way it did on the training set:
#Note that we are scaling new data to the scale built from the training data.  

X_test = [[-1., 1., 0.]]
scaler.transform(X_test) # Transform x_test before running a model, for example        

#It is also possible to disable either centering or scaling by either passing with_mean=False or with_std=False
#to the constructor of StandardScaler.

array([[-2.44948974,  1.22474487, -0.26726124]])

In [32]:
# Use transformed data with a model:

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

boston = load_boston()
X, y = boston.data, boston.target

X_train, X_test, y_train, y_test = train_test_split(X, y)
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
ridge = Ridge().fit(X_train_scaled, y_train)
X_test_scaled = scaler.transform(X_test)
ridge.score(X_test_scaled, y_test)


0.6280531523093563

# Using MinMaxScaler
An alternative standardization is scaling features to lie between a given minimum and maximum value, often between zero and one, or so that the maximum absolute value of each feature is scaled to unit size. This can be achieved using MinMaxScaler.

In [5]:
# Here is an example to scale an example data matrix to the [0, 1] range:

X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])

min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X_train) # fit_transform does both at once.  It's a little faster.
X_train_minmax

array([[0.5       , 0.        , 1.        ],
       [1.        , 0.5       , 0.33333333],
       [0.        , 1.        , 0.        ]])

In [19]:
# And again we can then use the scaler to transform new data.
#Note once more that we are scaling new data to the scale built from the training data.  

X_test = np.array([[ -3., -1.,  4.]])
X_test_minmax = min_max_scaler.transform(X_test)
X_test_minmax

array([[-1.5       ,  0.        ,  1.66666667]])

The formula for a min-max transformation is as follows:


For each value in a column of X, subtract the minimium value of the column and divide result by the the result of the maximum value minus the minimum value of the column
(X observation value - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))

In [6]:
# Use transformed data with a model:

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

boston = load_boston()
X, y = boston.data, boston.target

X_train, X_test, y_train, y_test = train_test_split(X, y)
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
ridge = Ridge().fit(X_train_scaled, y_train)
X_test_scaled = scaler.transform(X_test)
ridge.score(X_test_scaled, y_test)

NameError: name 'Ridge' is not defined

# Using scikit-learn Pipelines

In [7]:
#Here is the code for a ridge model with the standard transformation...

from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler #now we don't need to add preprocessing. before calls to StandardScaler()

X, y = boston.data, boston.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
ridge = Ridge().fit(X_train_scaled, y_train)
X_test_scaled = scaler.transform(X_test)
ridge.score(X_test_scaled, y_test)


0.6344884687786745

In [8]:
# Here is the much cleaner pipeline version:

from sklearn.pipeline import make_pipeline

pipe = make_pipeline(StandardScaler(), Ridge())
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.6344884687786745

# Pipeline and GridSearchCV


In [None]:
# We need to pay attention to names of pipeline steps when we use GridSearchCV

In [9]:
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsRegressor

knn_pipe = make_pipeline(StandardScaler(), KNeighborsRegressor())
print(knn_pipe.steps) # names in single quotes (i.e.-'standardscaler' and 'kneighborsregressor')

[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('kneighborsregressor', KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform'))]


In [10]:
from sklearn.model_selection import GridSearchCV

knn_pipe = make_pipeline(StandardScaler(), KNeighborsRegressor())

#refer to step name with two underscores before argument name when...
#you build a parameter grid

param_grid = {'kneighborsregressor__n_neighbors': range(1, 10)}
grid = GridSearchCV(knn_pipe, param_grid, cv=10)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.score(X_test, y_test))

{'kneighborsregressor__n_neighbors': 7}
0.6000157533909847
