In [1]:
import pandas as pd
df = pd.read_csv('iris.data.txt')
df.columns=['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class']
df.shape

(149, 5)

In [2]:
df.head()

Unnamed: 0,sepal_len,sepal_wid,petal_len,petal_wid,class
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [3]:
X = df[['sepal_len','sepal_wid','petal_len','petal_wid']]

In [4]:
X.corr()

Unnamed: 0,sepal_len,sepal_wid,petal_len,petal_wid
sepal_len,1.0,-0.103784,0.871283,0.816971
sepal_wid,-0.103784,1.0,-0.415218,-0.350733
petal_len,0.871283,-0.415218,1.0,0.962314
petal_wid,0.816971,-0.350733,0.962314,1.0


In [25]:
X.describe()

Unnamed: 0,sepal_len,sepal_wid,petal_len,petal_wid
count,149.0,149.0,149.0,149.0
mean,5.848322,3.051007,3.774497,1.205369
std,0.828594,0.433499,1.759651,0.761292
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.4,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [5]:
# compute covariance matrix
import numpy as np
mean_vec = np.mean(X, axis=0)
cov_mat = (X - mean_vec).T.dot((X - mean_vec)) / (X.shape[0]-1)
print('Covariance matrix \n%s' %cov_mat)

Covariance matrix 
           sepal_len  sepal_wid  petal_len  petal_wid
sepal_len   0.686568  -0.037279   1.270362   0.515347
sepal_wid  -0.037279   0.187921  -0.316731  -0.115749
petal_len   1.270362  -0.316731   3.096372   1.289124
petal_wid   0.515347  -0.115749   1.289124   0.579566


In [6]:
# engenvector and engienvalues
cov_mat = np.cov(X.T)

print('Covariance matrix \n%s' %cov_mat)

eig_vals, eig_vecs = np.linalg.eig(cov_mat)

print('Eigenvectors \n%s' %eig_vecs)
print('\nEigenvalues \n%s' %eig_vals)

Covariance matrix 
[[ 0.68656811 -0.0372787   1.27036233  0.51534691]
 [-0.0372787   0.18792128 -0.31673091 -0.11574868]
 [ 1.27036233 -0.31673091  3.09637221  1.28912434]
 [ 0.51534691 -0.11574868  1.28912434  0.57956557]]
Eigenvectors 
[[ 0.36263433 -0.6558202  -0.58115529  0.3172613 ]
 [-0.08122848 -0.73001455  0.59619427 -0.32408808]
 [ 0.85629752  0.17703033  0.07265649 -0.47972477]
 [ 0.35868209  0.07509244  0.54911925  0.75111672]]

Eigenvalues 
[4.20438706 0.24314579 0.07905128 0.02384304]


In [7]:
from sklearn.decomposition import PCA
pca = PCA()  
pca.fit(X)    
EV_List = pca.explained_variance_  
EV_List # eigenvalues

array([4.20438706, 0.24314579, 0.07905128, 0.02384304])

In [8]:
EVR_List = [i/sum(EV_List) for i in EV_List]
EVR_List # percent of explained variance

[0.9239543681440451,
 0.053433619322753645,
 0.017372275759703706,
 0.005239736773497557]

In [9]:
# explain 90% variance, need 1 principal component (pc1)
pca = PCA(n_components=1)
pca.fit(X)

PCA(n_components=1)

In [10]:
# loading factors at each principal component (pc1)
pca.components_

array([[ 0.36263433, -0.08122848,  0.85629752,  0.35868209]])

In [11]:
pca.explained_variance_ 

array([4.20438706])

In [12]:
# Apply PCA to the data set
data_pca = pca.transform(X)
data_pca

array([[-2.73363445],
       [-2.90803676],
       [-2.76491784],
       [-2.7461081 ],
       [-2.29679724],
       [-2.83904793],
       [-2.64423265],
       [-2.90682876],
       [-2.69199575],
       [-2.52354747],
       [-2.63112977],
       [-2.80576609],
       [-3.24397251],
       [-2.65975154],
       [-2.39988069],
       [-2.63931625],
       [-2.66585361],
       [-2.21575231],
       [-2.6045924 ],
       [-2.32791942],
       [-2.56060135],
       [-3.23368084],
       [-2.32098224],
       [-2.37424051],
       [-2.52611151],
       [-2.48686648],
       [-2.57982864],
       [-2.65733554],
       [-2.6511475 ],
       [-2.60676122],
       [-2.42744251],
       [-2.66443393],
       [-2.61352803],
       [-2.69199575],
       [-2.88487621],
       [-2.64229784],
       [-2.69199575],
       [-3.00058136],
       [-2.60796922],
       [-2.7877468 ],
       [-2.87158978],
       [-3.01682706],
       [-2.42325291],
       [-2.22620519],
       [-2.73402967],
       [-2

In [13]:
# explain more than 97% variance, need 2 principal components (pc1,pc2)
pca = PCA(n_components=2)  
pca.fit(X)

PCA(n_components=2)

In [14]:
# loading at each principal component
pca.components_

array([[ 0.36263433, -0.08122848,  0.85629752,  0.35868209],
       [ 0.6558202 ,  0.73001455, -0.17703033, -0.07509244]])

In [15]:
pca.explained_variance_  

array([4.20438706, 0.24314579])

In [16]:
# Apply PCA to the data set
data_pca = pca.transform(X)
data_pca

array([[-2.73363445, -0.16331092],
       [-2.90803676, -0.13076902],
       [-2.76491784, -0.30475856],
       [-2.7461081 ,  0.34027983],
       [-2.29679724,  0.75348469],
       [-2.83904793, -0.0755604 ],
       [-2.64423265,  0.17657389],
       [-2.90682876, -0.56422248],
       [-2.69199575, -0.10050325],
       [-2.52354747,  0.65790634],
       [-2.63112977,  0.02770681],
       [-2.80576609, -0.2213837 ],
       [-3.24397251, -0.4961847 ],
       [-2.65975154,  1.19234788],
       [-2.39988069,  1.3506441 ],
       [-2.63931625,  0.82429682],
       [-2.66585361,  0.32535115],
       [-2.21575231,  0.88473854],
       [-2.6045924 ,  0.52665249],
       [-2.32791942,  0.4034959 ],
       [-2.56060135,  0.44614179],
       [-3.23368084,  0.14876388],
       [-2.32098224,  0.11122065],
       [-2.37424051, -0.02540228],
       [-2.52611151, -0.13313497],
       [-2.48686648,  0.14385237],
       [-2.57982864,  0.38073938],
       [-2.65733554,  0.32544096],
       [-2.6511475 ,

In [17]:
df_pca=pd.DataFrame(data_pca, columns=['PC1','PC2'])

In [18]:
df_pca.corr()

Unnamed: 0,PC1,PC2
PC1,1.0,-1.516508e-15
PC2,-1.516508e-15,1.0


In [19]:
pca = PCA(n_components=3)
pca.fit(X)

PCA(n_components=3)

In [20]:
data_pca = pca.transform(X)
data_pca

array([[-2.73363445, -0.16331092, -0.20387761],
       [-2.90803676, -0.13076902,  0.02432666],
       [-2.76491784, -0.30475856,  0.03735406],
       [-2.7461081 ,  0.34027983,  0.09572342],
       [-2.29679724,  0.75348469,  0.17374038],
       [-2.83904793, -0.0755604 ,  0.26385861],
       [-2.64423265,  0.17657389, -0.01624978],
       [-2.90682876, -0.56422248,  0.02708061],
       [-2.69199575, -0.10050325, -0.19190446],
       [-2.52354747,  0.65790634, -0.06985362],
       [-2.63112977,  0.02770681,  0.10724693],
       [-2.80576609, -0.2213837 , -0.200674  ],
       [-3.24397251, -0.4961847 ,  0.0681067 ],
       [-2.65975154,  1.19234788, -0.1452544 ],
       [-2.39988069,  1.3506441 ,  0.28295963],
       [-2.63931625,  0.82429682,  0.14467779],
       [-2.66585361,  0.32535115,  0.03290039],
       [-2.21575231,  0.88473854, -0.11513756],
       [-2.6045924 ,  0.52665249,  0.21902432],
       [-2.32791942,  0.4034959 , -0.2341806 ],
       [-2.56060135,  0.44614179,  0.214

In [21]:
df_pca=pd.DataFrame(data_pca, columns=['PC1','PC2','PC3'])

In [22]:
df_pca.corr()

Unnamed: 0,PC1,PC2,PC3
PC1,1.0,-1.516508e-15,-4.124791e-16
PC2,-1.516508e-15,1.0,5.485189e-16
PC3,-4.124791e-16,5.485189e-16,1.0


In [23]:
pca.components_

array([[ 0.36263433, -0.08122848,  0.85629752,  0.35868209],
       [ 0.6558202 ,  0.73001455, -0.17703033, -0.07509244],
       [-0.58115529,  0.59619427,  0.07265649,  0.54911925]])

In [24]:
pca.explained_variance_  

array([4.20438706, 0.24314579, 0.07905128])