# 차원 축소 (Dimension Reduction)

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
from sklearn.datasets import load_iris

iris = load_iris()

# Wrap the NumPy feature matrix in a pandas DataFrame with readable column
# names, then attach the class labels as a 'target' column.
columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
irisDF = pd.DataFrame(iris.data, columns=columns).assign(target=iris.target)
irisDF.head(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0


In [5]:
# Remove the label column in place so only the feature columns remain.
irisDF.drop(columns='target', inplace=True)
irisDF.head(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2


### 공분산 행렬

In [27]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column (fit the scaler, then apply it to the
# same data) and preview the first three rows.
iris_std = StandardScaler().fit(irisDF).transform(irisDF)
iris_std[:3]

array([[-0.90068117,  1.01900435, -1.34022653, -1.3154443 ],
       [-1.14301691, -0.13197948, -1.34022653, -1.3154443 ],
       [-1.38535265,  0.32841405, -1.39706395, -1.3154443 ]])

In [40]:
# Population covariance matrix of the standardized features: X^T X / n.
# This relies on the columns of iris_std already being centered by the
# StandardScaler step, so no explicit mean subtraction is needed here.
iris_cov = np.dot(iris_std.T, iris_std) / len(iris_std)
iris_cov

array([[ 1.        , -0.11756978,  0.87175378,  0.81794113],
       [-0.11756978,  1.        , -0.4284401 , -0.36612593],
       [ 0.87175378, -0.4284401 ,  1.        ,  0.96286543],
       [ 0.81794113, -0.36612593,  0.96286543,  1.        ]])

In [41]:
# Sample covariance matrix of the standardized features: X^T X / (n - 1).
# Dividing by n - 1 instead of n is the only difference from iris_cov; the
# result is the same matrix that np.cov(iris_std.T) returns.
iris_cov1 = np.dot(iris_std.T, iris_std) / (len(iris_std) - 1)
iris_cov1

array([[ 1.00671141, -0.11835884,  0.87760447,  0.82343066],
       [-0.11835884,  1.00671141, -0.43131554, -0.36858315],
       [ 0.87760447, -0.43131554,  1.00671141,  0.96932762],
       [ 0.82343066, -0.36858315,  0.96932762,  1.00671141]])

In [42]:
# Cross-check with NumPy's covariance routine, treating each column of
# iris_std as one variable (rowvar=False) rather than transposing first.
np.cov(iris_std, rowvar=False)

array([[ 1.00671141, -0.11835884,  0.87760447,  0.82343066],
       [-0.11835884,  1.00671141, -0.43131554, -0.36858315],
       [ 0.87760447, -0.43131554,  1.00671141,  0.96932762],
       [ 0.82343066, -0.36858315,  0.96932762,  1.00671141]])

### 고유벡터(eigenvector), 고유값(eigenvalue) 구하기

In [43]:
# Eigen-decomposition of the population covariance matrix. eig_vecs holds
# one eigenvector per column; column i corresponds to eig_vals[i].
# NOTE(review): np.linalg.eig does not guarantee any ordering of the
# eigenvalues. Later cells take columns 0 and 1 as the leading components,
# which holds for the output shown here but should be confirmed (or the
# pairs sorted by descending eigenvalue) if the data changes.
eig_vals, eig_vecs = np.linalg.eig(iris_cov)

In [44]:
# Display the eigenvalues of iris_cov.
eig_vals

array([2.91849782, 0.91403047, 0.14675688, 0.02071484])

In [45]:
# Display the eigenvectors of iris_cov (one eigenvector per column).
eig_vecs

array([[ 0.52106591, -0.37741762, -0.71956635,  0.26128628],
       [-0.26934744, -0.92329566,  0.24438178, -0.12350962],
       [ 0.5804131 , -0.02449161,  0.14212637, -0.80144925],
       [ 0.56485654, -0.06694199,  0.63427274,  0.52359713]])

In [46]:
# Same decomposition for the sample (n - 1) covariance matrix. iris_cov1 is
# iris_cov multiplied by a constant, so only the eigenvalues are rescaled.
eig_vals1, eig_vecs1 = np.linalg.eig(iris_cov1)

In [47]:
# Display the eigenvalues of iris_cov1.
eig_vals1

array([2.93808505, 0.9201649 , 0.14774182, 0.02085386])

In [48]:
# Display the eigenvectors of iris_cov1 (one per column); compare with
# eig_vecs above.
eig_vecs1

array([[ 0.52106591, -0.37741762, -0.71956635,  0.26128628],
       [-0.26934744, -0.92329566,  0.24438178, -0.12350962],
       [ 0.5804131 , -0.02449161,  0.14212637, -0.80144925],
       [ 0.56485654, -0.06694199,  0.63427274,  0.52359713]])

In [69]:
# Project the standardized data onto the first two eigenvectors. Slicing a
# single column with a range keeps it 2-D, so each result is one column
# with a row per sample.
PC1 = iris_std.dot(eig_vecs[:, 0:1])
PC2 = iris_std.dot(eig_vecs[:, 1:2])

In [70]:
# First three projections onto the first eigenvector.
PC1[:3]

array([[-2.26470281],
       [-2.08096115],
       [-2.36422905]])

In [71]:
# These values have the opposite sign from the second column of the
# scikit-learn PCA result shown further down. This is expected: an
# eigenvector is only defined up to sign (v and -v are both valid), so the
# two results describe the same axis with opposite orientation.
PC2[:3]

array([[-0.4800266 ],
       [ 0.67413356],
       [ 0.34190802]])

### PCA 변환

In [53]:
from sklearn.decomposition import PCA

# Keep only the two leading principal components.
pca = PCA(n_components=2)

# Fit and transform on the same standardized matrix (iris_std) so the
# projection is directly comparable with the manual PC1 / PC2 computed above.
pca.fit(iris_std)
iris_pca = pca.transform(iris_std)
print(iris_pca.shape)

(150, 2)


In [54]:
# First three rows of the scikit-learn projection; compare column 0 with
# PC1 and column 1 with PC2 from the manual computation.
iris_pca[:3, :]

array([[-2.26470281,  0.4800266 ],
       [-2.08096115, -0.67413356],
       [-2.36422905, -0.34190802]])