<a href="https://colab.research.google.com/github/coen2812023/PCA/blob/main/PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

from scipy import linalg as la

## Array 

In [2]:
data=np.array([
[6., 3., 2.],
[3., 2., 7.],
[5., 4., 2.],
[1., 4., 3.],
[7., 3., 1.0],
[5., 1., 8.],
[4., 2., 2.],
[8., 6., 6.],
[6., 3., 2.],
[7., 1., 1.]])

In [3]:
# Calculate the covariance matrix
# Center your data
data -= data.mean(axis=0)
cov = np.cov(data, rowvar=False)

In [4]:
cov

array([[ 4.4       ,  0.46666667, -0.97777778],
       [ 0.46666667,  2.32222222, -0.06666667],
       [-0.97777778, -0.06666667,  6.71111111]])

# Calculate the eigenvalues and eigenvector of the covariance matrix:

In [5]:
evals, evecs = la.eig(cov)

In [6]:
evals

array([7.07986812+0.j, 4.13602373+0.j, 2.21744149+0.j])

In [7]:
evecs

array([[ 0.34964832,  0.90980802, -0.22359656],
       [ 0.04740747,  0.22117341,  0.97408154],
       [-0.93568081,  0.35118612, -0.03420122]])

In [8]:
# Sort the Eigen values and vector and select components
num_components=2
sorted_key = np.argsort(evals)[::-1][:num_components]
evals, evecs = evals[sorted_key], evecs[:, sorted_key]

# sorted eigen values


In [9]:
evals

array([7.07986812+0.j, 4.13602373+0.j])

## sorted eigenvectors

In [10]:
evecs

array([[ 0.34964832,  0.90980802],
       [ 0.04740747,  0.22117341],
       [-0.93568081,  0.35118612]])

In [11]:
# Multiply original data and Eigen vectors to get principal components
principal_components=np.dot(data,evecs)
print("Principal Components:", principal_components)

Principal Components: [[ 1.59441254  0.25830318]
 [-4.18034395 -0.93636366]
 [ 1.29217169 -0.43033142]
 [-1.04210241 -3.71837737]
 [ 2.87974168  0.81692508]
 [-4.46413559  1.01326508]
 [ 0.84770843 -1.78248626]
 [-1.30679167  4.14618395]
 [ 1.59441254  0.25830318]
 [ 2.78492674  0.37457825]]


## Implementation in Scikit Learn

In [12]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.datasets import load_wine

In [13]:
data = load_wine()
df = pd.DataFrame(data.data, columns=data.feature_names)

In [14]:
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [15]:
# scale
x=StandardScaler().fit_transform(df)

In [16]:
# apply PCA
pca=PCA(n_components=2)
principal_components=pca.fit_transform(x)

In [17]:
principalDf = pd.DataFrame(data = principal_components, columns = ['principal component 1', 'principal component 2'])

In [18]:
principalDf

Unnamed: 0,principal component 1,principal component 2
0,3.316751,-1.443463
1,2.209465,0.333393
2,2.516740,-1.031151
3,3.757066,-2.756372
4,1.008908,-0.869831
...,...,...
173,-3.370524,-2.216289
174,-2.601956,-1.757229
175,-2.677839,-2.760899
176,-2.387017,-2.297347
