In [1]:
### --- IMPORTANT --- ###
# scikit-learn preprocess:
# http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing

# After finishing the book once, 
# during review of the book, 
# read the above page and make additions below

In [2]:
# 7.1 Need for Data Pre-processing

# - You almost always need to pre-process your data. It is a required step. 

# - A difficulty is that different algorithms make different assumptions about your data 
# and may require different transforms. 

### --- VERY IMPORTANT TO REMEMBER -- ###
# - Generally, it is recommended to create many different views and transforms of data, 
# then exercise a handful of algorithms on each view of dataset. 

# - This will help to flush out which data transforms might be better at exposing the structure of 
# the problem in general.

In [3]:
from pandas import read_csv

In [4]:
# from numpy import set_printoptions

In [5]:
# set_printoptions(precision=3)

In [6]:
# set_printoptions is not used here: it limits the number of decimal places shown,
# but by default it does not suppress scientific notation.
# Instead, numpy.savetxt is used with an explicit format string, as below.

In [7]:
import sys

In [8]:
import numpy

In [9]:
def print_data(_data, fmt='%5.3f'):
    """Write an array to standard output as plain formatted text.

    numpy.savetxt is used so every value is rendered with an explicit
    printf-style format; with the default format, values appear as
    fixed-point numbers with three decimals rather than in scientific
    notation.

    Parameters
    ----------
    _data : array-like
        Values to print. Each row is written on its own line with the
        columns separated by a single space.
    fmt : str, optional
        Format specifier applied to every value. Defaults to '%5.3f',
        which is the format this helper has always used.

    Returns
    -------
    None
        The text is written to sys.stdout; nothing is returned.
    """
    # sys.stdout is looked up at call time so redirected output is honoured.
    numpy.savetxt(sys.stdout, _data, fmt)

In [10]:
# Remote location of the Pima Indians diabetes data file; read_csv fetches it over HTTPS below.
# NOTE(review): this UCI path may no longer be served — confirm it still resolves before a fresh run.
_uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'

In [11]:
# Column labels handed to read_csv, in file order: eight input attributes followed by the 'class' label.
# The order matters because the array is later split positionally (columns 0-7 vs. column 8).
_col_names = ['preg','plas','pres','skin','test','mass','pedi','age','class']

In [12]:
# Load the remote CSV into a DataFrame; names= supplies the column labels.
_dataframe = read_csv(_uri, names=_col_names)

In [13]:
type(_dataframe)

pandas.core.frame.DataFrame

In [14]:
# Take the DataFrame's underlying numpy.ndarray so rows and columns can be sliced by position.
_array = _dataframe.values

In [15]:
type(_array)

numpy.ndarray

In [16]:
print_data(_array[0:5,:])

6.000 148.000 72.000 35.000 0.000 33.600 0.627 50.000 1.000
1.000 85.000 66.000 29.000 0.000 26.600 0.351 31.000 0.000
8.000 183.000 64.000 0.000 0.000 23.300 0.672 32.000 1.000
1.000 89.000 66.000 23.000 94.000 28.100 0.167 21.000 0.000
0.000 137.000 40.000 35.000 168.000 43.100 2.288 33.000 1.000


In [17]:
# separate _array into input and output components

In [18]:
# Input component: every row, first eight columns ('preg' through 'age').
_X = _array[:, :8]

In [19]:
print_data(_X[0:5,:])

6.000 148.000 72.000 35.000 0.000 33.600 0.627 50.000
1.000 85.000 66.000 29.000 0.000 26.600 0.351 31.000
8.000 183.000 64.000 0.000 0.000 23.300 0.672 32.000
1.000 89.000 66.000 23.000 94.000 28.100 0.167 21.000
0.000 137.000 40.000 35.000 168.000 43.100 2.288 33.000


In [20]:
# Output component: the final column ('class').
# Slicing with 8: (rather than indexing with 8) keeps the column axis, so _Y is 2-D with one column;
# the print_data call in the next cell relies on that when it indexes _Y[0:5,:].
# NOTE(review): estimators commonly expect a 1-D target — flatten (e.g. _Y.ravel()) if _Y is
# ever passed to a fit() call; confirm against actual usage.
_Y = _array[:,8:]

In [21]:
print_data(_Y[0:5,:])

1.000
0.000
1.000
0.000
1.000


In [22]:
# 7.2 Data Transforms

# The scikit-learn library provides two standard idioms for transforming data.
# 1. Fit and Multiple Transform.
    # - This is the preferred approach.
    # - You call the fit() function to prepare the parameters of the transform once on your data.
    # - Then later you can use the transform() function on the same data to prepare it for modeling
    # and again on the test or validation dataset or new data that you may see in the future.
# 2. Combined Fit-And-Transform.
    # - The Combined Fit-And-Transform is a convenience that you can use for one off tasks. 
    # - This might be useful if you are interested in plotting or summarizing the transformed data.

In [23]:
# 7.3 Rescale Data

# - When your data is comprised of attributes with varying scales, many machine learning algorithms 
# can benefit from rescaling the attributes to all have the same scale. 

# - Often this is referred to as normalization and attributes are often rescaled into the range 
# between 0 and 1. 

# - This is useful for optimization algorithms used in the core of machine learning algorithms 
# like gradient descent. 

# - It is also useful for algorithms that weight inputs like regression and neural networks and 
# algorithms that use distance measures like k-Nearest Neighbors. 

In [24]:
from sklearn.preprocessing import MinMaxScaler

In [25]:
print_data(_X[:5,:])

6.000 148.000 72.000 35.000 0.000 33.600 0.627 50.000
1.000 85.000 66.000 29.000 0.000 26.600 0.351 31.000
8.000 183.000 64.000 0.000 0.000 23.300 0.672 32.000
1.000 89.000 66.000 23.000 94.000 28.100 0.167 21.000
0.000 137.000 40.000 35.000 168.000 43.100 2.288 33.000


In [26]:
# Build the scaler first, then fit it to _X and transform _X in one call.
# Each attribute (column) is rescaled into the range 0..1.
_minmax_scaler = MinMaxScaler(feature_range=(0, 1))
_X_rescaled = _minmax_scaler.fit_transform(_X)

In [27]:
print_data(_X_rescaled[:5,:]) # rescaled values are between 0 and 1

0.353 0.744 0.590 0.354 0.000 0.501 0.234 0.483
0.059 0.427 0.541 0.293 0.000 0.396 0.117 0.167
0.471 0.920 0.525 0.000 0.000 0.347 0.254 0.183
0.059 0.447 0.541 0.232 0.111 0.419 0.038 0.000
0.000 0.688 0.328 0.354 0.199 0.642 0.944 0.200


In [28]:
# 7.4 Standardize Data

In [29]:
# - Standardization is a useful technique to transform attributes with a Gaussian distribution 
# and differing means and standard deviations to a standard Gaussian distribution with a mean 
# of 0 and a standard deviation of 1. 

# - It is most suitable for techniques that assume a Gaussian distribution in the input variables 
# and work better with rescaled data, such as linear regression, logistic regression and linear 
# discriminant analysis.

In [30]:
from sklearn.preprocessing import StandardScaler

In [31]:
print_data(_X[:5,:])

6.000 148.000 72.000 35.000 0.000 33.600 0.627 50.000
1.000 85.000 66.000 29.000 0.000 26.600 0.351 31.000
8.000 183.000 64.000 0.000 0.000 23.300 0.672 32.000
1.000 89.000 66.000 23.000 94.000 28.100 0.167 21.000
0.000 137.000 40.000 35.000 168.000 43.100 2.288 33.000


In [32]:
_X_rescaled = StandardScaler().fit_transform(_X)

In [33]:
print_data(_X_rescaled[:5,:])

0.640 0.848 0.150 0.907 -0.693 0.204 0.468 1.426
-0.845 -1.123 -0.161 0.531 -0.693 -0.684 -0.365 -0.191
1.234 1.944 -0.264 -1.288 -0.693 -1.103 0.604 -0.106
-0.845 -0.998 -0.161 0.155 0.123 -0.494 -0.921 -1.042
-1.142 0.504 -1.505 0.907 0.766 1.410 5.485 -0.020


In [34]:
# 7.5 Normalize Data

In [35]:
# - Normalizing in scikit-learn refers to rescaling each observation (row) to have a length of 1 
# (called a unit norm or a vector with the length of 1 in linear algebra). 

# - This pre-processing method can be useful for sparse datasets (lots of zeros) with attributes 
# of varying scales when using algorithms that weight input values such as neural networks and 
# algorithms that use distance measures such as k-Nearest Neighbors.

In [36]:
from sklearn.preprocessing import Normalizer

In [37]:
print_data(_X[:5,:])

6.000 148.000 72.000 35.000 0.000 33.600 0.627 50.000
1.000 85.000 66.000 29.000 0.000 26.600 0.351 31.000
8.000 183.000 64.000 0.000 0.000 23.300 0.672 32.000
1.000 89.000 66.000 23.000 94.000 28.100 0.167 21.000
0.000 137.000 40.000 35.000 168.000 43.100 2.288 33.000


In [38]:
# Build the normalizer first, then apply it: each observation (row) is rescaled to length 1.
_row_normalizer = Normalizer()
_X_normalized = _row_normalizer.fit_transform(_X)

In [39]:
print_data(_X_normalized[:5,:])

0.034 0.828 0.403 0.196 0.000 0.188 0.004 0.280
0.008 0.716 0.556 0.244 0.000 0.224 0.003 0.261
0.040 0.924 0.323 0.000 0.000 0.118 0.003 0.162
0.007 0.588 0.436 0.152 0.622 0.186 0.001 0.139
0.000 0.596 0.174 0.152 0.731 0.188 0.010 0.144


In [40]:
# for explanation of unit-norm or unit-vector see:
# http://www.algebralab.org/lessons/lesson.aspx?file=Trigonometry_TrigVectorUnits.xml

# To check that a normalized row is a unit vector (has length 1):
# - square each column value of the normalized row
# - add the squared column values
# - take the square root of the sum
# - the result should come out to 1 (up to rounding)

In [41]:
# 7.6 Binarize Data (make binary)

In [42]:
# - You can transform your data using a binary threshold. 
# All values above the threshold are marked 1 and all equal to or below are marked as 0. 

# - It can be useful when you have probabilities that you want to make crisp values. 

# - It is also useful when feature engineering and you want to add new features that indicate something meaningful. 

In [43]:
from sklearn.preprocessing import Binarizer

In [44]:
print_data(_X[0:5,:])

6.000 148.000 72.000 35.000 0.000 33.600 0.627 50.000
1.000 85.000 66.000 29.000 0.000 26.600 0.351 31.000
8.000 183.000 64.000 0.000 0.000 23.300 0.672 32.000
1.000 89.000 66.000 23.000 94.000 28.100 0.167 21.000
0.000 137.000 40.000 35.000 168.000 43.100 2.288 33.000


In [45]:
# Threshold 0.0: values greater than 0 become 1; everything else becomes 0.
_X_binarized = Binarizer(threshold=0.0).fit_transform(_X)

In [46]:
print_data(_X_binarized[0:5,:])

1.000 1.000 1.000 1.000 0.000 1.000 1.000 1.000
1.000 1.000 1.000 1.000 0.000 1.000 1.000 1.000
1.000 1.000 1.000 0.000 0.000 1.000 1.000 1.000
1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
0.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000


In [47]:
# Same transform with threshold 0.5: values of 0.5 or less now map to 0 as well
# (e.g. the 'pedi' values 0.351 and 0.167 in the rows printed below).
# NOTE: this rebinds _X_binarized, replacing the threshold-0.0 result from the previous cell.
_X_binarized = Binarizer(threshold=0.5).fit_transform(_X)

In [48]:
print_data(_X_binarized[0:5,:])

1.000 1.000 1.000 1.000 0.000 1.000 1.000 1.000
1.000 1.000 1.000 1.000 0.000 1.000 0.000 1.000
1.000 1.000 1.000 0.000 0.000 1.000 1.000 1.000
1.000 1.000 1.000 1.000 1.000 1.000 0.000 1.000
0.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
