# Sequential Feature Selection(SFS)
# 逐一選擇特徵

# 以兩種方式訓練模型（使用全部特徵、僅使用選取後的特徵），比較特徵選取前後的準確率差異

In [None]:
from sklearn.datasets import load_wine
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Prefer a CJK-capable sans-serif font so Chinese text renders in plots.
# 'Arial Unicode MS' is normally present only on macOS, so further candidates
# are listed; matplotlib tries the entries in order and uses the first one
# that is installed. 'DejaVu Sans' (bundled with matplotlib) is the last resort.
plt.rcParams['font.sans-serif'] = [
    'Arial Unicode MS',    # macOS
    'Microsoft JhengHei',  # Windows (Traditional Chinese)
    'Noto Sans CJK TC',    # often available on Linux
    'DejaVu Sans',         # always shipped with matplotlib
]
# Draw the minus sign as ASCII '-' instead of U+2212, which some fonts lack.
plt.rcParams['axes.unicode_minus'] = False

## 載入資料

從 `sklearn.datasets` 載入葡萄酒資料集（wine，用於分類問題）。

In [None]:
# Load the wine classification dataset as pandas objects:
# X holds the feature columns, y holds the class labels.
wine = load_wine(as_frame=True)
X, y = wine.data, wine.target

## 資料分割

In [None]:
# Hold out half of the samples for testing; the fixed seed keeps the split
# reproducible. Features are passed as a plain NumPy array, not a DataFrame.
X_train, X_test, y_train, y_test = train_test_split(
    X.to_numpy(),
    y,
    test_size=0.5,
    random_state=42,
)

## 特徵縮放

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only.
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
# Reuse the already-fitted scaler for the test split; never re-fit on test data.
X_test_std = scaler.transform(X_test)


## 選擇演算法

使用 scikit-learn 的 LogisticRegression 模型。這是一種線性分類模型，除了二元分類之外也支援多元分類；本例的葡萄酒資料集共有 3 個類別。

In [None]:
from sklearn.linear_model import LogisticRegression
# Create a logistic-regression classifier with scikit-learn's default settings.
clf = LogisticRegression()

## 模型訓練

In [None]:
# Train the model on the standardized training features (X_train_std)
# and their corresponding labels (y_train).
clf.fit(X_train_std, y_train)

## 模型評估

In [None]:
# Accuracy on the standardized test split, using every feature.
clf.score(X_test_std, y_test)

0.9887640449438202

## 以循序前向選取（SFS）挑選 3 個特徵（貪婪搜尋，每次加入一個特徵，並非測試所有 3 個特徵的組合）

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SequentialFeatureSelector

# KNN measures distances in feature space, so it suits a small number of features.
knn = KNeighborsClassifier(n_neighbors=11)
# Forward selection: greedily add one feature at a time until 3 are kept,
# scoring candidates with the KNN estimator.
sfs = SequentialFeatureSelector(
    knn,
    n_features_to_select=3,
    direction='forward',
).fit(X_train_std, y_train)
# Boolean mask over the input columns; True marks a selected feature.
sfs.get_support()

array([False, False, False, False, False, False,  True, False, False,
        True,  True, False, False])

In [None]:
# Names of the selected features, obtained by masking the dataset's column names.
column_list = np.array(X.columns.to_list())
selected_mask = sfs.get_support()
column_list[selected_mask]

array(['flavanoids', 'color_intensity', 'hue'], dtype='<U28')

In [None]:
# Names of the selected features, looked up through the selector itself.
sfs.get_feature_names_out(column_list)

array(['flavanoids', 'color_intensity', 'hue'], dtype=object)

In [None]:
# Shape of the training matrix after keeping only the selected features.
sfs.transform(X_train_std).shape

(89, 3)

## 選擇演算法

In [None]:
from sklearn.linear_model import LogisticRegression
# Fresh, unfitted classifier; rebinding `clf` replaces any model it held before.
clf = LogisticRegression()

## 模型訓練

In [None]:
# Fit on the training split reduced to the features chosen by SFS.
X_train_sel = sfs.transform(X_train_std)
clf.fit(X_train_sel, y_train)

## 模型評估

In [None]:
# Accuracy on the test split, restricted to the features chosen by SFS.
X_test_sel = sfs.transform(X_test_std)
clf.score(X_test_sel, y_test)

0.8539325842696629

## 結論：特徵由 13 個減為 3 個，模型得以簡化，但測試準確率由約 0.989 降至約 0.854