# Filter Methods

Breast Cancer Dataset를 이용하여 특성 선택 후 정확도를 비교합니다.

In [13]:
from sklearn.datasets import load_breast_cancer # 데이터 세트
from sklearn.preprocessing import StandardScaler # 전처리 : 스케일러
from sklearn.model_selection import train_test_split # 분할
from sklearn.neighbors import KNeighborsClassifier # 모델
from sklearn.metrics import accuracy_score # 평가

cancer = load_breast_cancer() # 데이터 가져오기
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=66
)

scaler = StandardScaler() # 스케일링
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_train.shape

(426, 30)

In [14]:
# 모델 만들기
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [15]:
y_train_hat = clf.predict(X_train)
print("훈련 정확도 : ", accuracy_score(y_train, y_train_hat))
y_test_hat = clf.predict(X_test)
print("테스트 정확도 : ", accuracy_score(y_test,y_test_hat))

훈련 정확도 :  0.9882629107981221
테스트 정확도 :  0.9440559440559441


일변량 분석을 위해 `F-값`을 나타냅니다.

In [16]:
from sklearn.feature_selection import SelectKBest, f_classif
select = SelectKBest(f_classif, k=20)
select.fit(X_train, y_train)
select.scores_

array([4.83462863e+02, 7.30185143e+01, 5.28179103e+02, 4.47936141e+02,
       8.31684872e+01, 2.82587643e+02, 4.03951190e+02, 6.77339474e+02,
       6.14526934e+01, 6.01858277e-01, 2.31866351e+02, 1.04667244e+00,
       2.34005578e+02, 2.98127849e+02, 1.51979304e+00, 4.25157374e+01,
       2.41668418e+01, 7.54074868e+01, 4.75614984e-03, 2.31919142e+00,
       6.53070215e+02, 1.08039666e+02, 7.04457051e+02, 5.24963982e+02,
       1.37073609e+02, 2.95079487e+02, 3.73197273e+02, 7.97341589e+02,
       1.26635084e+02, 7.49205074e+01])

In [17]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif
select = SelectKBest(mutual_info_classif, k=20)
select.fit(X_train, y_train)
select.scores_

array([3.78863410e-01, 5.78605164e-02, 3.96182469e-01, 3.63430316e-01,
       7.69746994e-02, 2.52575888e-01, 3.92934036e-01, 4.31602803e-01,
       6.69654526e-02, 9.56310915e-03, 2.39691599e-01, 0.00000000e+00,
       2.39577920e-01, 3.31316715e-01, 1.16813610e-04, 7.00022462e-02,
       1.56576929e-01, 8.77306625e-02, 2.02226981e-02, 4.80665303e-02,
       4.64394479e-01, 1.13355545e-01, 4.92123199e-01, 4.70964083e-01,
       1.17586025e-01, 2.84393372e-01, 3.24732980e-01, 4.60988443e-01,
       1.43285956e-01, 9.17416235e-02])

In [18]:
from sklearn.feature_selection import SelectKBest, f_classif
select = SelectKBest(f_classif, k=20)
select.fit(X_train, y_train)

X_train_selected = select.transform(X_train)
X_test_selected = select.transform(X_test)

In [19]:
# 모델 만들기
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train_selected, y_train)

KNeighborsClassifier(n_neighbors=3)

In [20]:
y_train_hat = clf.predict(X_train_selected)
print("훈련 정확도 : ", accuracy_score(y_train, y_train_hat))
y_test_hat = clf.predict(X_test_selected)
print("테스트 정확도 : ", accuracy_score(y_test,y_test_hat))

훈련 정확도 :  0.9859154929577465
테스트 정확도 :  0.958041958041958


# SelectFromModel
- `get_support()` : 어떤 특성이 선택되었는지 불린 값으로 나타냅니다.

In [21]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
fmodel = RandomForestClassifier()
select = SelectFromModel(fmodel, threshold="mean")
select.fit(X_train, y_train)
X_train_selected = select.transform(X_train)
X_test_selected = select.transform(X_test)
select.get_support()

array([ True, False,  True,  True, False, False,  True,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False,  True,  True, False, False, False,
        True, False, False])

- 중요도를 지정한 임계치보다 큰 모든 특성을 선택했습니다.

In [23]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression
fmodel = LinearRegression()
select = SelectFromModel(fmodel, threshold="0.5*mean")
select.fit(X_train, y_train)
X_train_selected = select.transform(X_train)
X_test_selected = select.transform(X_test)
select.get_support()

array([ True, False,  True,  True, False,  True,  True, False, False,
       False,  True, False,  True, False,  True, False,  True, False,
       False, False,  True, False,  True,  True, False, False, False,
       False, False,  True])

In [24]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
fmodel = RandomForestClassifier()
fmodel.fit(X_train, y_train)
fmodel.feature_importances_

array([0.02290933, 0.01109211, 0.07960371, 0.05586273, 0.00437963,
       0.02049359, 0.05432639, 0.06645534, 0.00346024, 0.00263507,
       0.00498712, 0.0037581 , 0.00863558, 0.03030707, 0.00212948,
       0.00380491, 0.0113842 , 0.00243758, 0.00200863, 0.00488179,
       0.1314674 , 0.01774142, 0.14040145, 0.07583133, 0.01444498,
       0.01515403, 0.02748709, 0.1683061 , 0.00932534, 0.0042883 ])

In [26]:
select = SelectFromModel(fmodel, prefit=True, threshold="mean")
X_train_selected = select.transform(X_train)
X_test_selected = select.transform(X_test)
select.get_support()

array([False, False,  True,  True, False, False,  True,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False,  True,  True, False, False, False,
        True, False, False])