# CH08. 차원 축소

### 설정

In [None]:
# Python >= 3.5 is required (Python 3.7 recommended)
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn >= 0.20 is required.
# Compare the numeric (major, minor) components: comparing the raw version
# strings is lexicographic, so e.g. "0.9" would wrongly satisfy >= "0.20".
import re
import sklearn
_sk_match = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sk_match is not None and tuple(map(int, _sk_match.groups())) >= (0, 20)

# 공통 모듈 임포트
import numpy as np
import os

In [None]:
# Build a small noisy 3D dataset that lies close to a 2D plane.
np.random.seed(4)   # fixed seed so the generated points are reproducible

m = 60        # number of samples
noise = 0.1   # noise scale
w1, w2 = 0.1, 0.3   # plane coefficients used for the z coordinate

angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5

# The random draws below must stay in this order (x noise, y noise, z noise).
x_coord = np.cos(angles) + np.sin(angles)/2 + noise * np.random.randn(m) / 2   # x coordinate
y_coord = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2                # y coordinate
z_coord = x_coord * w1 + y_coord * w2 + noise * np.random.randn(m)             # z coordinate (hyperplane + noise)

X = np.column_stack((x_coord, y_coord, z_coord))

#### 8.3.4 사이킷런 사용하기

In [None]:
# Center the data: subtract each column's mean from that column.
X_centered = X - np.mean(X, axis=0)

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA model that keeps 2 components and project X onto them in one step.
pca = PCA(n_components=2)
X2D = pca.fit_transform(X)

In [None]:
# Per-column mean of the projected data (displayed as the cell output).
X2D.mean(axis=0)

#### 8.3.6 적절한 차원 수 선택하기

In [None]:
from sklearn.datasets import fetch_openml

# Download (or load from the local cache) version 1 of the "mnist_784"
# dataset from OpenML, requesting array output rather than a DataFrame.
mnist = fetch_openml(
    "mnist_784",
    version=1,
    as_frame=False,
)

In [None]:
# Store the labels as unsigned 8-bit integers.
mnist["target"] = mnist["target"].astype(np.uint8)

# Feature matrix and label vector used by the following cells.
X, y = mnist["data"], mnist["target"]

차원축소를 위한 적절한 차원을 확인하기 위해 설명 분산 비율이 95%가 되는
지점까지 몇 개의 주성분이 필요한가를 계산한다.
MNIST 데이터셋의 경우 설명 분산 비율이 95%가 되도록 하려면
154개의 주성분이 필요함이 아래와 같이 확인된다.

In [None]:
from sklearn.model_selection import train_test_split

# Split with train_test_split's default proportions; no random_state is
# passed here, so the shuffle is not pinned by this call itself.
split = train_test_split(X, y)
X_train, X_test, y_train, y_test = split

In [None]:
# Fit PCA with all components kept, then find the smallest number of
# components whose cumulative explained variance ratio reaches 95%.
pca = PCA().fit(X_train)

cumsum = pca.explained_variance_ratio_.cumsum()
d = (cumsum >= 0.95).argmax() + 1   # argmax gives the first True index (0-based)
d

In [None]:
# A float n_components between 0 and 1 asks PCA to keep enough components
# to reach that fraction of explained variance.
variance_to_keep = 0.95
pca = PCA(n_components=variance_to_keep)
X_reduced = pca.fit_transform(X_train)

#### 8.3.7 압축을 위한 PCA

In [None]:
# Compress the training set to 154 components, then map it back to the
# original feature space.
pca = PCA(n_components=154)
X_reduced = pca.fit_transform(X_train)
X_recovered = pca.inverse_transform(X_reduced)

# Reconstruction error: squared differences summed per sample, averaged
# over all samples (displayed as the cell output).
residual = X_recovered - X_train
(residual ** 2).sum(axis=1).mean()

#### 8.3.8 랜덤 PCA

In [None]:
# PCA with the randomized SVD solver; random_state pins the solver's randomness.
rnd_pca = PCA(
    n_components=154,
    svd_solver="randomized",
    random_state=42,
)
X_reduced = rnd_pca.fit_transform(X_train)

#### 8.3.9 점진적 PCA

In [None]:
from sklearn.decomposition import IncrementalPCA

# Number of mini-batches; each one holds about len(X_train) / n_batches rows
# (525 rows if X_train has 52,500 rows).
n_batches = 100
# Model that keeps 154 principal components.
inc_pca = IncrementalPCA(n_components=154)

# Feed the training set to the model one mini-batch at a time.
batches = np.array_split(X_train, n_batches)
for X_batch in batches:
    print(".", end="")               # progress indicator
    inc_pca.partial_fit(X_batch)     # incremental update with this batch

# Project the full training set onto the 154 learned components.
X_reduced = inc_pca.transform(X_train)

#### 8.4 커널 PCA

In [None]:
from sklearn.decomposition import KernelPCA

rbf_pca = KernelPCA(n_components = 2, kernel="rbf", gamma=0.04)
X_reduced = rbf_pca.fit_transform(X)

#### 8.4.1 커널 선택과 하이퍼 파라미터 튜닝

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

clf = Pipeline([
        ("kpca", KernelPCA(n_components=2)),
        ("log_reg", LogisticRegression(solver="lbfgs"))
    ])

param_grid = [{
        "kpca__gamma": np.linspace(0.03, 0.05, 10),
        "kpca__kernel": ["rbf", "sigmoid"]
    }]

grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(X, y)

In [None]:
rbf_pca = KernelPCA(n_components = 2, kernel="rbf", gamma=0.0433,
                    fit_inverse_transform=True)
X_reduced = rbf_pca.fit_transform(X)

X_preimage = rbf_pca.inverse_transform(X_reduced)   # 재구성

In [None]:
from sklearn.metrics import mean_squared_error

mean_squared_error(X, X_preimage)   # 재구성 오차가 거의 0임

#### 8.5 LLE

In [None]:
# make_swiss_roll is not imported or defined before this cell, so bring it
# into scope here to avoid a NameError.
from sklearn.datasets import make_swiss_roll

# 1000 noisy swiss-roll points; t is each point's position along the roll.
X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=41)

In [None]:
from sklearn.manifold import LocallyLinearEmbedding

# Unroll X into 2 dimensions with LLE, using 10 neighbours per point.
lle = LocallyLinearEmbedding(
    n_components=2,
    n_neighbors=10,
    random_state=42,
)
X_reduced = lle.fit_transform(X)

In [None]:
def make_swiss_roll(n_samples=100, *, noise=0.0, random_state=None):
    """Generate a 3D swiss-roll dataset.

    The construction follows the same recipe as scikit-learn's
    ``make_swiss_roll`` (results are not guaranteed to be bit-identical to it).

    Parameters
    ----------
    n_samples : int, default=100
        Number of points to generate.
    noise : float, default=0.0
        Standard deviation of the Gaussian noise added to every coordinate.
    random_state : int, numpy.random.RandomState or None, default=None
        Seed or generator for the random draws. ``None`` uses NumPy's
        global random state.

    Returns
    -------
    X : ndarray of shape (n_samples, 3)
        The generated points.
    t : ndarray of shape (n_samples,)
        Position of each point along the roll (the unrolled coordinate).
    """
    if random_state is None:
        rng = np.random                      # module-level functions share the global state
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # t sweeps the spiral angle over [1.5*pi, 4.5*pi); the second axis is the
    # roll's width in [0, 21).
    t = 1.5 * np.pi * (1 + 2 * rng.uniform(size=n_samples))
    width = 21 * rng.uniform(size=n_samples)

    X = np.vstack((t * np.cos(t), width, t * np.sin(t)))
    X += noise * rng.standard_normal(size=(3, n_samples))
    return X.T, t