In [1]:
# Rescale every feature to the same range using MinMaxScaler
# Useful for algorithms that weigh inputs by magnitude, such as regression and k-NN

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the diabetes CSV into a DataFrame. The column labels are supplied
# explicitly through `names`, so no line of the file is consumed as a header.
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class',
]
dataframe = pd.read_csv(
    'pima-indians-diabetes.data.csv',
    header=None,
    names=names,
)

In [6]:
# Preview the first rows (head() shows five by default) to check that the
# columns were loaded under the labels given in `names`.
dataframe.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [7]:
# Convert the DataFrame to a NumPy array so columns can be sliced by position.
# to_numpy() is the accessor the pandas documentation recommends in place of
# the older .values attribute; the resulting array is the same.
array_df = dataframe.to_numpy()

In [8]:
# Display the converted array; the notebook prints an abbreviated view
# (leading/trailing rows and columns, with "..." for the rest).
array_df

array([[  6.   , 148.   ,  72.   , ...,   0.627,  50.   ,   1.   ],
       [  1.   ,  85.   ,  66.   , ...,   0.351,  31.   ,   0.   ],
       [  8.   , 183.   ,  64.   , ...,   0.672,  32.   ,   1.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,   0.245,  30.   ,   0.   ],
       [  1.   , 126.   ,  60.   , ...,   0.349,  47.   ,   1.   ],
       [  1.   ,  93.   ,  70.   , ...,   0.315,  23.   ,   0.   ]])

In [11]:
# Separate the array column-wise into X (inputs/predictors) and Y (output).
# X takes columns 0 through 7 (the slice end, 8, is exclusive);
# Y is the single column at index 8.
X, Y = array_df[:, :8], array_df[:, 8]

In [18]:
# Min-max scaling in two explicit steps:
#   1. fit      - learn each column's minimum and maximum from X
#   2. transform - map every column of X onto the range [0, 1]
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X)
rescaledX = scaler.transform(X)

In [19]:
# Inspect the min-max scaled inputs; with feature_range=(0, 1) every value
# should lie between 0 and 1.
rescaledX

array([[0.35294118, 0.74371859, 0.59016393, ..., 0.50074516, 0.23441503,
        0.48333333],
       [0.05882353, 0.42713568, 0.54098361, ..., 0.39642325, 0.11656704,
        0.16666667],
       [0.47058824, 0.91959799, 0.52459016, ..., 0.34724292, 0.25362938,
        0.18333333],
       ...,
       [0.29411765, 0.6080402 , 0.59016393, ..., 0.390462  , 0.07130658,
        0.15      ],
       [0.05882353, 0.63316583, 0.49180328, ..., 0.4485842 , 0.11571307,
        0.43333333],
       [0.05882353, 0.46733668, 0.57377049, ..., 0.45305514, 0.10119556,
        0.03333333]])

In [20]:
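# Standardize data
# Standardizing rescales each feature (column) to mean 0 and variance 1.
# It only shifts and scales the values; it does not change the shape of the
# distribution, so a non-Gaussian feature stays non-Gaussian.
# Suitable for techniques that assume roughly Gaussian, zero-centered inputs
# of comparable scale (e.g. linear regression, logistic regression, LDA).

# Use scikit-learn's StandardScaler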

from sklearn.preprocessing import StandardScaler

In [21]:
# Standardize X: fit_transform first learns each column's mean and standard
# deviation from X, then applies the centering and scaling to that same X.
scaler2 = StandardScaler()
std_X = scaler2.fit_transform(X)

In [27]:
# Normalize data
# Normalizing: rescale each observation (row) to unit norm, i.e. length 1
# (scikit-learn's Normalizer uses the L2 norm by default).
# Useful for sparse datasets (lots of zeros) with attributes of varying scales,
# and for algorithms that weigh input values, such as neural networks and k-NN.
# Scaling inputs to unit norm is a common operation in, for instance, text classification and clustering.
from sklearn.preprocessing import Normalizer

In [28]:
# Row-wise normalization: each observation (row) of X is rescaled to unit norm.
# The fitted Normalizer object is kept in scaler3 for later reuse.
scaler3 = Normalizer()
normalized_x = scaler3.fit_transform(X)

In [29]:
# Inspect the row-normalized inputs (abbreviated array view).
normalized_x

array([[0.03355237, 0.82762513, 0.40262844, ..., 0.18789327, 0.00350622,
        0.27960308],
       [0.008424  , 0.71604034, 0.55598426, ..., 0.22407851, 0.00295683,
        0.26114412],
       [0.04039768, 0.92409698, 0.32318146, ..., 0.11765825, 0.00339341,
        0.16159073],
       ...,
       [0.02691539, 0.65135243, 0.38758161, ..., 0.14103664, 0.00131885,
        0.16149234],
       [0.00665306, 0.83828547, 0.39918356, ..., 0.20025708, 0.00232192,
        0.31269379],
       [0.00791454, 0.73605211, 0.55401772, ..., 0.24060198, 0.00249308,
        0.18203439]])

In [30]:
# Binarizing data
# using a binary threshold. 
# All values above the threshold are marked 1 and all equal to or below are marked as 0.

from sklearn.preprocessing import Binarizer

In [32]:
# Binarize X with a threshold of 0.0:
# values <= 0 become 0, values > 0 become 1.
binarizer = Binarizer(threshold=0.0)
binaryX = binarizer.fit_transform(X)

In [33]:
# Inspect the binarized inputs; entries are 1. where the original value was
# above the 0.0 threshold and 0. otherwise (abbreviated array view).
binaryX

array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])