In [None]:
import numpy as np
import pandas as pd

In [4]:
# import wine data
df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None)
df_wine.columns = ['Class label', 'Alcohol','Malic acid', 'Ash','Alcalinity of ash', 'Magnesium','Total phenols', 'Flavanoids', 
                  'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue','OD280/OD315 of diluted wines','Proline']
df_wine

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [11]:
X = df_wine.values[:, 1:]
y  = df_wine.values[:,0].astype(int)

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, stratify=y, random_state=1)

print("X shapes", X_train.shape, X_test.shape)
print("Y shapes", y_train.shape, y_test.shape)

X shapes (124, 13) (54, 13)
Y shapes (124,) (54,)


## Feature Scaling
Feature scaling is a crucial step in our preprocessing pipeline that can easily be forgotten. Decision trees and random forests are two of the very few machine learning algorithms where we don’t need to worry about feature scaling. Those algorithms are scale-invariant. However, the majority of machine learning and optimization algorithms behave much better if features are on the same scale

Now, there are two common approaches to bringing different features onto the same scale: normalization and standardization.
Normalization refers to rescaling of features in the range [0,1]. One such technique is min-max scaling where we scale as follows:

$$
x^{(i)}_{norm} = \frac{x^{(i)} - x_{min} }{x_{max} - x_{min}}
$$

In [21]:
# The min-max scaling procedure is implemented in scikit-learn and can be used as follows:

from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
X_train_std = mms.fit_transform(X_train)
print("non-normalized: ", X_train[0], "\n", "normalized: ",X_train_std[0])




non-normalized:  [1.437e+01 1.950e+00 2.500e+00 1.680e+01 1.130e+02 3.850e+00 3.490e+00
 2.400e-01 2.180e+00 7.800e+00 8.600e-01 3.450e+00 1.480e+03] 
 normalized:  [0.89784946 0.24643585 0.60962567 0.31958763 0.5308642  0.98965517
 0.66455696 0.19230769 0.55835962 0.62273161 0.30894309 0.79704797
 0.85734665]


The procedure for standardization can be expressed by the following equation:
$$
    x_{j}= \frac{{x_{j} - u_{j}}}{{\mu_{j}}}
$$

In [25]:
ex = np.array([0, 1, 2, 3, 4, 5])
print("standardized: ", (ex - ex.mean()/ex.std()))
print("normalized: ", ((ex - ex.min())/(ex.max()- ex.min())))

standardized:  [-1.46385011 -0.46385011  0.53614989  1.53614989  2.53614989  3.53614989]
normalized:  [0.  0.2 0.4 0.6 0.8 1. ]


In [27]:
#from sklearn
from sklearn.preprocessing import StandardScaler  #more optimized
ss = StandardScaler()
ss.fit(ex.reshape(-1,1))
ss.transform(ex.reshape(-1, 1))

array([[-1.46385011],
       [-0.87831007],
       [-0.29277002],
       [ 0.29277002],
       [ 0.87831007],
       [ 1.46385011]])

standardization maintains useful information about outliers and makes the algorithm less sensitive to them in contrast to min-max scaling, which scales the data to a limited range of values.

Do check out RobustScaler