In [1]:
import numpy as np
import warnings

from sklearn.feature_selection import VarianceThreshold,SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression


In [2]:
# Toy design matrix shared by the feature-selection cells:
# 5 samples (rows) x 4 features (columns), stored as float32.
X = np.array(
    [
        [0.0, 2.0, 0.0, 3.0],
        [0.0, 1.0, 4.0, 3.0],
        [0.1, 1.0, 1.0, 3.0],
        [1.0, 2.0, 3.0, 1.0],
        [2.0, 3.0, 4.0, 3.0],
    ],
    dtype="float32",
)
# One integer target value (1 or 2) per row of X.
Y = np.array((1, 2, 1, 2, 1))

## 方差选择法

In [3]:
# Variance-based feature selection: fit the selector on X, report the
# per-feature variances it computed, then print X reduced to the columns
# that pass the 0.6 threshold.
variance = VarianceThreshold(threshold=0.6)
print(variance)
variance.fit(X)
print("各个特征属性的方差为:", variance.variances_, sep="\n")
print("-----------------", variance.transform(X), sep="\n")

VarianceThreshold(threshold=0.6)
各个特征属性的方差为:
[0.6176 0.56   2.64   0.64  ]
-----------------
[[0.  0.  3. ]
 [0.  4.  3. ]
 [0.1 1.  3. ]
 [1.  3.  1. ]
 [2.  4.  3. ]]


## 相关系数法

In [19]:
# Univariate selection: score every column of X against Y with f_regression
# and keep the k=2 highest-scoring columns.
# NOTE(review): Y is fed to a classifier elsewhere in this notebook, while
# f_regression scores it as a numeric target -- confirm this is intended.
sk1 = SelectKBest(score_func=f_regression, k=2).fit(X, Y)
print(sk1)
print("------------", sk1.scores_, "------------", sep="\n")
print(sk1.transform(X))

SelectKBest(k=2, score_func=<function f_regression at 0x000001D8D6B19F70>)
------------
[0.04736842 0.36       1.32       1.8       ]
------------
[[0. 3.]
 [4. 3.]
 [1. 3.]
 [3. 1.]
 [4. 3.]]


## 卡方检验

In [20]:
# 使用chi2的时候要求特征属性的取值为非负数
sk2 = SelectKBest(chi2, k=2)
sk2.fit(X, Y)
print(sk2)
print(sk2.scores_)
print(sk2.transform(X))

SelectKBest(k=2, score_func=<function chi2 at 0x000001D8D6B19E50>)
[0.07741936 0.16666667 1.68055556 0.46153846]
[[0. 3.]
 [4. 3.]
 [1. 3.]
 [3. 1.]
 [4. 3.]]


## Wrapper-递归特征消除法

In [4]:
# Feature selection by recursive feature elimination (RFE), using a
# default-configured logistic regression as the wrapped estimator.
estimator = LogisticRegression()
selector = RFE(estimator, n_features_to_select=3, step=2).fit(X, Y)
# Printed in order: boolean mask of kept features, number of kept features,
# per-feature ranking, and X reduced to the kept columns.
print(
    selector.support_,
    selector.n_features_,
    selector.ranking_,
    selector.transform(X),
    sep="\n",
)

[False  True  True  True]
3
[2 1 1 1]
[[2. 0. 3.]
 [1. 4. 3.]
 [1. 1. 3.]
 [2. 3. 1.]
 [3. 4. 3.]]


## Embedded【嵌入法】-基于惩罚项的特征选择法

In [6]:
# Second toy data set: 4 samples x 4 features (float64) with labels 0 / 2.
X2 = np.array(
    [
        [5.1, 3.5, 1.4, 0.2],
        [4.9, 3.0, 1.4, 0.2],
        [-6.2, 0.4, 5.4, 2.3],
        [-5.9, 0.0, 5.1, 1.8],
    ],
    dtype=np.float64,
)
Y2 = np.array([0, 0, 2, 2])

# Embedded selection: SelectFromModel fits an L1-penalised logistic
# regression (C=0.1, liblinear solver) and selects columns based on it.
estimator = LogisticRegression(solver="liblinear", penalty="l1", C=0.1)
sfm = SelectFromModel(estimator).fit(X2, Y2)
print(sfm.transform(X2))
# Show the fitted coefficients of the inner estimator next to the selection.
print("系数:", sfm.estimator_.coef_, sep="\n")

[[ 5.1]
 [ 4.9]
 [-6.2]
 [-5.9]]
系数:
[[-0.03417754  0.          0.          0.        ]]


## PCA降维

In [30]:
from sklearn.decomposition import PCA

# 4 samples x 6 features. NOTE(review): this rebinds X2, which an earlier
# cell defined with only 4 columns.
X2 = np.array(
    [
        [5.1, 3.5, 1.4, 0.2, 1, 23],
        [4.9, 3.0, 1.4, 0.2, 2.3, 2.1],
        [-6.2, 0.4, 5.4, 2.3, 2, 23],
        [-5.9, 0.0, 5.1, 1.8, 2, 3],
    ],
    dtype=np.float64,
)
# n_components: here a float in (0, 1), i.e. the fraction of variance to
#   retain -- PCA keeps as many leading components as are needed for the
#   cumulative explained-variance ratio to exceed 0.5. (An integer value
#   would instead request that exact number of output dimensions.)
# whiten: whether to additionally rescale the projected components to unit
#   variance; left off here.
pca = PCA(whiten=False, n_components=0.5).fit(X2)
# Per-feature mean, principal axes, and the projected samples.
print(pca.mean_, pca.components_, pca.transform(X2), sep="\n")

[-0.525  1.725  3.325  1.125  1.825 12.775]
[[ 0.02038178 -0.01698103 -0.01350052 -0.0149724   0.03184796 -0.99893718]]
[[-10.11606313]
 [ 10.80754053]
 [-10.34733219]
 [  9.65585479]]


## LDA降维

In [36]:
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# 6 samples x 4 features with three distinct class labels (0, 1, 2).
# NOTE(review): this rebinds X, replacing the 5-row array defined near the top
# of the notebook; re-running earlier cells afterwards would pair this 6-row
# X with the 5-element Y.
X = np.array(
    [
        [-1, -1, 3, 1],
        [-2, -1, 2, 4],
        [-3, -2, 4, 5],
        [1, 1, 5, 4],
        [2, 1, 6, -5],
        [3, 2, 1, 5],
    ]
)
y = np.array([1, 1, 2, 2, 0, 1])
# n_components: target dimensionality; it depends on the number of distinct
# values in y and may not exceed n_classes - 1 (3 classes here, so at most 2).
clf = LinearDiscriminantAnalysis(n_components=2).fit(X, y)
print(clf.transform(X))

[[-3.2688434  -0.38911349]
 [-1.25507558 -1.78088569]
 [ 5.26064254 -0.49688862]
 [ 6.34385833  1.16134391]
 [-4.05800618  3.58297801]
 [-3.02257571 -2.07743411]]
