In [1]:
from functools import partial

from sklearn import datasets
from sklearn import feature_selection as fs

# Removing features with low variance

**設定變異數門檻值，剔除變異過低的特徵**

$$Var(X)=E[(X-\mu)^2]$$

Bernoulli distribution: $$ Var(X)=p(1-p)$$

In [2]:
# Toy data set: six samples (rows), each holding three 0/1 features (columns).
X = [
    [0, 0, 1],
    [0, 1, 0],
    [1, 0, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
]

假設某個布林特徵有 80% 的樣本具有值 1，20% 的樣本具有值 0，則其變異數為 $0.8 \times (1-0.8) = 0.16$；以此作為門檻值，變異數低於門檻（也就是超過 80% 的樣本都取同一個值）的特徵會被剔除

這個假設的目的在於根據經驗或數據特徵來設置適合的閾值

In [3]:
# The threshold is the Bernoulli variance p * (1 - p) evaluated at p = 0.8;
# low-variance columns of X are filtered out by the selector.
sel = fs.VarianceThreshold(threshold=0.8 * (1 - 0.8))

# Learn the per-column variances first, then apply the resulting column mask.
# The transformed array is the cell's last expression, so it is displayed.
sel.fit(X)
sel.transform(X)

array([[0, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1]])

# Univariate feature selection

**透過單獨計算每個特徵的統計值來決定重要特徵**
1. SelectKBest: 選取排名前K個重要特徵
2. SelectPercentile: 選取排名前K%的重要特徵

For regression:
- f_regression, mutual_info_regression

For classification:
- chi2, f_classif, mutual_info_classif

In [4]:
import numpy as np
from sklearn import datasets
from sklearn import feature_selection as fs

# Load the Iris data set and unpack it into the feature matrix and class labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Rank features with the chi-squared statistic and keep the three best ones.
# chi2 requires non-negative feature values and is meant for classification.
select_k_best_chi2 = fs.SelectKBest(score_func=fs.chi2, k=3)
select_k_best_chi2.fit(X, y)
X_new_chi2 = select_k_best_chi2.transform(X)

# Report the shape of the reduced matrix, then the reduced matrix itself.
print("使用chi2選擇的特徵數量：", X_new_chi2.shape)
print("使用chi2選擇的特徵：", X_new_chi2, sep="\n")

使用chi2選擇的特徵數量： (150, 3)
使用chi2選擇的特徵：
[[5.1 1.4 0.2]
 [4.9 1.4 0.2]
 [4.7 1.3 0.2]
 [4.6 1.5 0.2]
 [5.  1.4 0.2]
 [5.4 1.7 0.4]
 [4.6 1.4 0.3]
 [5.  1.5 0.2]
 [4.4 1.4 0.2]
 [4.9 1.5 0.1]
 [5.4 1.5 0.2]
 [4.8 1.6 0.2]
 [4.8 1.4 0.1]
 [4.3 1.1 0.1]
 [5.8 1.2 0.2]
 [5.7 1.5 0.4]
 [5.4 1.3 0.4]
 [5.1 1.4 0.3]
 [5.7 1.7 0.3]
 [5.1 1.5 0.3]
 [5.4 1.7 0.2]
 [5.1 1.5 0.4]
 [4.6 1.  0.2]
 [5.1 1.7 0.5]
 [4.8 1.9 0.2]
 [5.  1.6 0.2]
 [5.  1.6 0.4]
 [5.2 1.5 0.2]
 [5.2 1.4 0.2]
 [4.7 1.6 0.2]
 [4.8 1.6 0.2]
 [5.4 1.5 0.4]
 [5.2 1.5 0.1]
 [5.5 1.4 0.2]
 [4.9 1.5 0.2]
 [5.  1.2 0.2]
 [5.5 1.3 0.2]
 [4.9 1.4 0.1]
 [4.4 1.3 0.2]
 [5.1 1.5 0.2]
 [5.  1.3 0.3]
 [4.5 1.3 0.3]
 [4.4 1.3 0.2]
 [5.  1.6 0.6]
 [5.1 1.9 0.4]
 [4.8 1.4 0.3]
 [5.1 1.6 0.2]
 [4.6 1.4 0.2]
 [5.3 1.5 0.2]
 [5.  1.4 0.2]
 [7.  4.7 1.4]
 [6.4 4.5 1.5]
 [6.9 4.9 1.5]
 [5.5 4.  1.3]
 [6.5 4.6 1.5]
 [5.7 4.5 1.3]
 [6.3 4.7 1.6]
 [4.9 3.3 1. ]
 [6.6 4.6 1.3]
 [5.2 3.9 1.4]
 [5.  3.5 1. ]
 [5.9 4.2 1.5]
 [6.  4.  1. ]
 [6.1 4.7 1.4]
 [5

In [5]:
# Keep the top 50% of features ranked by mutual information with the class label.
#
# mutual_info_classif is stochastic unless random_state is fixed: it adds small
# random noise to continuous features before estimating the scores. The seed is
# pinned here so the scores, and therefore the selected columns, are reproducible
# after Restart Kernel -> Run All.
MI_RANDOM_STATE = 0
select_k_best_mutual_info_classif = fs.SelectPercentile(
    partial(fs.mutual_info_classif, random_state=MI_RANDOM_STATE),
    percentile=50,
)
X_new_mutual_info_classif = select_k_best_mutual_info_classif.fit_transform(X, y)

# Show the shape of the reduced matrix followed by its values.
print("使用mutual_info_classif選擇的特徵數量：", X_new_mutual_info_classif.shape)
print("使用mutual_info_classif選擇的特徵：")
print(X_new_mutual_info_classif)

使用mutual_info_classif選擇的特徵數量： (150, 2)
使用mutual_info_classif選擇的特徵：
[[1.4 0.2]
 [1.4 0.2]
 [1.3 0.2]
 [1.5 0.2]
 [1.4 0.2]
 [1.7 0.4]
 [1.4 0.3]
 [1.5 0.2]
 [1.4 0.2]
 [1.5 0.1]
 [1.5 0.2]
 [1.6 0.2]
 [1.4 0.1]
 [1.1 0.1]
 [1.2 0.2]
 [1.5 0.4]
 [1.3 0.4]
 [1.4 0.3]
 [1.7 0.3]
 [1.5 0.3]
 [1.7 0.2]
 [1.5 0.4]
 [1.  0.2]
 [1.7 0.5]
 [1.9 0.2]
 [1.6 0.2]
 [1.6 0.4]
 [1.5 0.2]
 [1.4 0.2]
 [1.6 0.2]
 [1.6 0.2]
 [1.5 0.4]
 [1.5 0.1]
 [1.4 0.2]
 [1.5 0.2]
 [1.2 0.2]
 [1.3 0.2]
 [1.4 0.1]
 [1.3 0.2]
 [1.5 0.2]
 [1.3 0.3]
 [1.3 0.3]
 [1.3 0.2]
 [1.6 0.6]
 [1.9 0.4]
 [1.4 0.3]
 [1.6 0.2]
 [1.4 0.2]
 [1.5 0.2]
 [1.4 0.2]
 [4.7 1.4]
 [4.5 1.5]
 [4.9 1.5]
 [4.  1.3]
 [4.6 1.5]
 [4.5 1.3]
 [4.7 1.6]
 [3.3 1. ]
 [4.6 1.3]
 [3.9 1.4]
 [3.5 1. ]
 [4.2 1.5]
 [4.  1. ]
 [4.7 1.4]
 [3.6 1.3]
 [4.4 1.4]
 [4.5 1.5]
 [4.1 1. ]
 [4.5 1.5]
 [3.9 1.1]
 [4.8 1.8]
 [4.  1.3]
 [4.9 1.5]
 [4.7 1.2]
 [4.3 1.3]
 [4.4 1.4]
 [4.8 1.4]
 [5.  1.7]
 [4.5 1.5]
 [3.5 1. ]
 [3.8 1.1]
 [3.7 1. ]
 [3.9 1.2]
 [5.1 1.6]
 [4.5 1.5