차원축소 - Breast Cancer Data

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
# Load the breast cancer dataset bundled with scikit-learn and build a
# DataFrame with one column per feature plus the class label as `target`.
cancer = load_breast_cancer()
df = pd.DataFrame(
    data=cancer.data,
    columns=cancer.feature_names,
).assign(target=cancer.target)
# Show the last three rows as a quick sanity check.
df.tail(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
566,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,0.159,0.05648,...,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782,0
567,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,0.2397,0.07016,...,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124,0
568,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,0.1587,0.05884,...,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039,1


In [3]:
# Standardize the feature matrix with StandardScaler.
# NOTE(review): the scaler is fitted on every row, so rows that later land in
# the test split also contribute to the scaling statistics. Consider fitting
# on the training split only if a leak-free estimate is required.
cancer_std = StandardScaler().fit(cancer.data).transform(cancer.data)
cancer_std.shape

(569, 30)

- PCA로 차원축소

In [4]:
# For each target dimensionality, fit PCA on the standardized features and
# report the summed explained-variance ratio of the kept components.
for n_dim in (2, 5, 10):
    pca = PCA(n_components=n_dim)
    cancer_pca = pca.fit_transform(cancer_std)
    total_ratio = sum(pca.explained_variance_ratio_)
    print(f'PCA {n_dim} - 설명력', total_ratio.round(5))

PCA 2 - 설명력 0.63243
PCA 5 - 설명력 0.84734
PCA 10 - 설명력 0.95157


- 원본 데이터(차원축소 전, 표준화된 전체 특성)로 분류

In [5]:
# Baseline: logistic regression on the full standardized feature matrix
# (no dimensionality reduction). The split is stratified on the label and
# uses a fixed seed so the result is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    cancer_std,
    cancer.target,
    random_state=2021,
    stratify=cancer.target,
)
# `fit` returns the estimator itself, so construction and training chain.
lr = LogisticRegression(random_state=2021).fit(X_train, y_train)
score = lr.score(X_test, y_test)
print(f'정확도 : {score:.4f}')

정확도 : 0.9930


- 2, 5, 10 차원으로 축소한 데이터로 분류

In [6]:
# Helper: hold-out accuracy of a logistic regression on a given feature matrix
def pca_accuracy(X, y, test_size=None, random_state=2021):
    """Return the hold-out accuracy of a logistic regression trained on ``X``.

    The rows are divided into stratified train/test subsets, a
    ``LogisticRegression`` is fitted on the training subset, and its score
    on the test subset is returned.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample (for example PCA-reduced features).
    y : array-like
        Class labels, one per row of ``X``; also used to stratify the split.
    test_size : float or int, optional
        Forwarded to ``train_test_split``. ``None`` (the default) leaves the
        library's default hold-out size in effect, matching the original
        behaviour of this helper.
    random_state : int, optional
        Seed passed to both the split and the classifier. Defaults to 2021,
        the value this helper previously hard-coded.

    Returns
    -------
    float
        The value of ``LogisticRegression.score`` on the test subset.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    lr = LogisticRegression(random_state=random_state)
    lr.fit(X_train, y_train)
    return lr.score(X_test, y_test)

In [7]:
# Reduce the standardized features to 2, 5 and 10 principal components and,
# for each, report the retained variance ratio next to the classifier accuracy.
for n_dim in (2, 5, 10):
    pca = PCA(n_components=n_dim)
    cancer_pca = pca.fit_transform(cancer_std)
    variance_kept = sum(pca.explained_variance_ratio_)
    accuracy = pca_accuracy(cancer_pca, cancer.target)
    print(f'PCA {n_dim} - 설명력 : {variance_kept:.4f} 정확도 : {accuracy:.4f}')

PCA 2 - 설명력 : 0.6324 정확도 : 0.9441
PCA 5 - 설명력 : 0.8473 정확도 : 0.9930
PCA 10 - 설명력 : 0.9516 정확도 : 0.9860
