# Scaling - Normalization - PCA

In [1]:
import pandas as pd

In [2]:
# Read the Pima Indians diabetes CSV (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='pima-indians-diabetes.csv')

In [3]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


The columns in this table are measured in different units: Age is in years, SkinThickness is in millimetres, and so on. Scaling is the process of transforming these values onto a common scale so that the columns become directly comparable.

In [4]:
from sklearn.preprocessing import normalize, scale

In [5]:
# Feature matrix: every column except the target ('Outcome'),
# because the target column must not be scaled or normalized.
x = df.drop(columns='Outcome')

In [6]:
# Preview the first five rows of the feature matrix.
x.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [7]:
# Standardize each feature column to zero mean and unit variance
# (these keyword values are scale()'s defaults, spelled out for clarity).
# NOTE: this rebinds `x` from a DataFrame to a NumPy array, so the column
# names are lost and every later cell that uses `x` sees the scaled values.
x = scale(x, axis=0, with_mean=True, with_std=True)

In [8]:
# Wrap the scaled array in a DataFrame for display; columns show as
# integer positions because the array carries no feature names.
pd.DataFrame(data=x)

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.639947,0.848324,0.149641,0.907270,-0.692891,0.204013,0.468492,1.425995
1,-0.844885,-1.123396,-0.160546,0.530902,-0.692891,-0.684422,-0.365061,-0.190672
2,1.233880,1.943724,-0.263941,-1.288212,-0.692891,-1.103255,0.604397,-0.105584
3,-0.844885,-0.998208,-0.160546,0.154533,0.123302,-0.494043,-0.920763,-1.041549
4,-1.141852,0.504055,-1.504687,0.907270,0.765836,1.409746,5.484909,-0.020496
...,...,...,...,...,...,...,...,...
763,1.827813,-0.622642,0.356432,1.722735,0.870031,0.115169,-0.908682,2.532136
764,-0.547919,0.034598,0.046245,0.405445,-0.692891,0.610154,-0.398282,-0.531023
765,0.342981,0.003301,0.149641,0.154533,0.279594,-0.735190,-0.685193,-0.275760
766,-0.844885,0.159787,-0.470732,-1.288212,-0.692891,-0.240205,-0.371101,1.170732


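The resulting table contains the same data as the original table, but each column has been standardized to zero mean and unit variance. The column names appear as 0–7 because `scale` returns a plain NumPy array rather than a DataFrame.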

In [9]:
# Rescale each row (sample) to unit L2 norm. Unlike scale(), which works
# column by column, normalize() works row by row by default.
# NOTE: `x` at this point is the already-standardized array, not the raw data.
norm = normalize(x, norm='l2', axis=1)

In [10]:
# Display the row-normalized array as a DataFrame.
pd.DataFrame(data=norm)

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.293647,0.389263,0.068664,0.416311,-0.317941,0.093614,0.214973,0.654334
1,-0.458093,-0.609101,-0.087047,0.287852,-0.375682,-0.371091,-0.197934,-0.103381
2,0.408951,0.644218,-0.087479,-0.426959,-0.229648,-0.365657,0.200318,-0.034994
3,-0.425010,-0.502137,-0.080761,0.077736,0.062026,-0.248523,-0.463179,-0.523940
4,-0.186954,0.082528,-0.246360,0.148546,0.125389,0.230816,0.898036,-0.003356
...,...,...,...,...,...,...,...,...
763,0.474619,-0.161678,0.092553,0.447334,0.225917,0.029905,-0.235953,0.657508
764,-0.412899,0.026072,0.034849,0.305535,-0.522147,0.459799,-0.300137,-0.400167
765,0.297611,0.002864,0.129846,0.134092,0.242609,-0.637939,-0.594556,-0.239282
766,-0.391110,0.073968,-0.217909,-0.596333,-0.320749,-0.111194,-0.171788,0.541949


## Principal Component Analysis - PCA

In [11]:
from sklearn.decomposition import PCA

In [12]:
# Configure PCA to keep only the first two principal components.
pca = PCA(n_components=2)

In [14]:
# Rebuild the feature frame from the raw data: `x` was overwritten with the
# scaled array earlier in the notebook. The target ('Outcome') is excluded.
x = df.drop(columns='Outcome')

In [15]:
# Confirm that `x` again holds the raw (unscaled) feature values.
x.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [16]:
# Fit PCA on `x` and project every row onto the two retained components.
# NOTE(review): `x` here is the raw, unscaled feature frame, so the components
# are driven by the features with the largest numeric spread. Standardize
# first (e.g. with `scale`) if every feature should contribute comparably —
# confirm which behaviour is intended.
x_2 = pca.fit_transform(x)

In [17]:
# Show the projected data: a NumPy array with one row per sample and
# one column per principal component.
x_2

array([[-75.71465491, -35.95078264],
       [-82.3582676 ,  28.90821322],
       [-74.63064344, -67.90649647],
       ...,
       [ 32.11319827,   3.3766648 ],
       [-80.21449431, -14.18601977],
       [-81.30814972,  21.62149606]])

In [19]:
# Present the two principal components as a labelled DataFrame.
pd.DataFrame(data=x_2, columns=['PCA1', 'PCA2'])

Unnamed: 0,PCA1,PCA2
0,-75.714655,-35.950783
1,-82.358268,28.908213
2,-74.630643,-67.906496
3,11.077423,34.898486
4,89.743788,-2.746937
...,...,...
763,99.237881,25.080927
764,-78.641239,-7.688010
765,32.113198,3.376665
766,-80.214494,-14.186020


We reduced the original data to two columns. This table is a representation of the original table; only the number of columns has been reduced.

In [21]:
# Fraction of the total variance captured by each retained component.
pca.explained_variance_ratio_

array([0.88854663, 0.06159078])

In [22]:
# Total fraction of the variance retained by the two components together.
pca.explained_variance_ratio_.sum()

0.9501374183215007

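We performed the reduction while retaining about 95% of the variance of the original data (the sum of the explained variance ratios of the two components). Note that PCA was fit on the unscaled features here, so this figure largely reflects the columns with the widest numeric ranges.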