# Feature Selection 特征选择

特征选择是从原始数据中选择对于预测流水线而言最好的特征的过程。

## 模型选择

### 定义一个模型评估和选择函数

使用 sklearn 的网格搜索模块。

In [1]:
from sklearn.model_selection import GridSearchCV

def get_best_model_and_accuracy(model, params, x, y):
    """Grid-search ``params`` for ``model`` and print a short report.

    Runs a 5-fold cross-validated ``GridSearchCV`` over ``(x, y)`` and
    prints the best score, the parameter combination that produced it, and
    the fit / score times averaged over all parameter combinations.

    Args:
        model: Estimator (or pipeline) to search over.
        params: Parameter grid passed to ``GridSearchCV``.
        x: Feature matrix.
        y: Target values.

    Returns:
        None. The report is written to stdout.
    """
    # error_score=0. scores a parameter combination whose fit fails as 0
    # instead of aborting the whole search.
    search = GridSearchCV(model, params, error_score=0., cv=5)
    search.fit(x, y)

    # Headline metric and the parameters that achieved it.
    print("Best Accuracy:{}".format(search.best_score_))
    print("Best Parameters:{}".format(search.best_params_))

    # cv_results_ holds one mean time per parameter combination; average
    # those again to get a single figure for the whole search.
    results = search.cv_results_
    mean_fit_seconds = results['mean_fit_time'].mean()
    print("Average Time to Fit (s):{}".format(round(mean_fit_seconds, 3)))
    mean_score_seconds = results['mean_score_time'].mean()
    print("Average Time to Score (s):{}".format(round(mean_score_seconds, 3)))

### 准备数据集

信用卡逾期数据集

In [2]:
import pandas as pd

credit_card_default = pd.read_csv('./data/credit_card_default.csv')
credit_card_default.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [3]:
# 检查一下数据集的大小
credit_card_default.shape

(30000, 24)

In [4]:
# 描述性统计
credit_card_default.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
LIMIT_BAL,30000.0,167484.322667,129747.661567,10000.0,50000.0,140000.0,240000.0,1000000.0
SEX,30000.0,1.603733,0.489129,1.0,1.0,2.0,2.0,2.0
EDUCATION,30000.0,1.853133,0.790349,0.0,1.0,2.0,2.0,6.0
MARRIAGE,30000.0,1.551867,0.52197,0.0,1.0,2.0,2.0,3.0
AGE,30000.0,35.4855,9.217904,21.0,28.0,34.0,41.0,79.0
PAY_0,30000.0,-0.0167,1.123802,-2.0,-1.0,0.0,0.0,8.0
PAY_2,30000.0,-0.133767,1.197186,-2.0,-1.0,0.0,0.0,8.0
PAY_3,30000.0,-0.1662,1.196868,-2.0,-1.0,0.0,0.0,8.0
PAY_4,30000.0,-0.220667,1.169139,-2.0,-1.0,0.0,0.0,8.0
PAY_5,30000.0,-0.2662,1.133187,-2.0,-1.0,0.0,0.0,8.0


In [5]:
# 检查缺失值; 这个例子不存在缺失值
credit_card_default.isnull().sum()

LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default payment next month    0
dtype: int64

In [6]:
X = credit_card_default.drop('default payment next month', axis=1)
y = credit_card_default['default payment next month']

In [7]:
# 取基准空准确度
y.value_counts(normalize=True)

0    0.7788
1    0.2212
Name: default payment next month, dtype: float64

### 创建基准机器学习流水线

这里我们评估下面 4 种模型：
- 逻辑回归
- K 最近邻
- 决策树
- 随机森林

In [8]:
from sklearn.linear_model import LogisticRegression # 逻辑回归
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.tree import DecisionTreeClassifier # 决策树
from sklearn.ensemble import RandomForestClassifier # 随机森林

In [9]:
# Candidate hyper-parameters for each baseline model's grid search.

# Logistic regression: regularisation strength C and penalty type.
lr_params = dict(
    C=[0.1, 1.0, 10.0, 100.0],
    penalty=['l1', 'l2'],
)

# KNN: odd neighbour counts from 1 to 7.
knn_params = dict(n_neighbors=list(range(1, 8, 2)))

# Decision tree: unlimited depth (None) plus odd depths from 1 to 9.
tree_params = dict(max_depth=[None, *range(1, 10, 2)])

# Random forest: number of trees crossed with the same depth candidates.
rf_params = dict(
    n_estimators=[10, 50, 100],
    max_depth=[None, *range(1, 10, 2)],
)

In [10]:
# Instantiate the four baseline models. Only logistic regression gets an
# explicit argument: the liblinear solver, which supports both the 'l1' and
# 'l2' penalties listed in lr_params. The others use their defaults.
lr = LogisticRegression(solver='liblinear')
knn = KNeighborsClassifier()
d_tree = DecisionTreeClassifier()
rf = RandomForestClassifier()

#### 逻辑回归

In [11]:
get_best_model_and_accuracy(lr, lr_params, X, y)

Best Accuracy:0.8099
Best Parameters:{'C': 0.1, 'penalty': 'l1'}
Average Time to Fit (s):0.862
Average Time to Score (s):0.003


#### KNN
KNN 是基于距离的模型，对特征的尺度很敏感。为了更准确地评估基准性能，我们需要一个更复杂的流水线：先对特征做标准化，再训练 KNN。

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Standardise the features before they reach the KNN step.
knn_pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('classifier', knn),
])

# Inside a pipeline a parameter is addressed as "<step name>__<parameter>",
# so prefix every KNN grid key with the name of the classifier step.
knn_pipe_params = {
    f'classifier__{name}': candidates
    for name, candidates in knn_params.items()
}

get_best_model_and_accuracy(knn_pipe, knn_pipe_params, X, y)

Best Accuracy:0.7994
Best Parameters:{'classifier__n_neighbors': 7}
Average Time to Fit (s):0.076
Average Time to Score (s):5.674


#### 决策树

In [13]:
get_best_model_and_accuracy(d_tree, tree_params, X, y)

Best Accuracy:0.8206333333333333
Best Parameters:{'max_depth': 3}
Average Time to Fit (s):0.233
Average Time to Score (s):0.002


#### 随机森林

In [14]:
get_best_model_and_accuracy(rf, rf_params, X, y)

Best Accuracy:0.8197
Best Parameters:{'max_depth': 7, 'n_estimators': 100}
Average Time to Fit (s):1.509
Average Time to Score (s):0.039


通过上面的结果，考虑到准确性和预测时间，决策树可能是最适合下一步采用的模型。

## 特征选择

### 基于统计的特征选择

#### 皮尔逊相关系数 

皮尔逊相关系数会测量列之间的线性关系。该系数在 -1~1 变化，0 代表没有线性关系，相关性接近 -1 或 1 代表线性关系很强。

pandas 的 corr() 方法会为所有的列计算皮尔逊相关系数。

In [15]:
from sklearn.base import TransformerMixin, BaseEstimator

# 定义一个应用皮尔逊相关系数的特征选择器
class CustomCorrelationChooser(TransformerMixin, BaseEstimator):
    """Keep the features whose Pearson correlation with the response is strong.

    ``fit`` computes the correlation between every column of ``X`` and
    ``response`` and remembers the columns whose absolute correlation is
    strictly greater than ``threshold``; ``transform`` returns just those
    columns.

    Args:
        response: Target values (a pandas Series) to correlate against.
        cols_to_keep: Column names to select. Overwritten by ``fit``;
            ``None`` (the default) means nothing has been selected yet.
        threshold: Minimum absolute correlation (exclusive) a feature needs
            in order to be kept. Must be set before ``fit`` is called,
            either here or through ``set_params`` / a grid search.
    """

    def __init__(self, response, cols_to_keep=None, threshold=None):
        # Arguments are stored exactly as given. The default for
        # cols_to_keep is None rather than [] so that instances never share
        # one mutable default list.
        self.response = response
        self.threshold = threshold
        self.cols_to_keep = cols_to_keep

    def transform(self, X):
        """Return the subset of ``X`` made of the selected columns.

        Args:
            X: DataFrame containing at least the selected columns.

        Returns:
            ``X`` restricted to ``cols_to_keep``; a frame with no columns
            if nothing has been selected yet.
        """
        selected = [] if self.cols_to_keep is None else self.cols_to_keep
        return X[selected]

    def fit(self, X, *_):
        """Select the columns of ``X`` correlated with the response.

        Any extra positional arguments (such as the ``y`` a pipeline passes)
        are ignored; the response given to the constructor is used instead.

        Args:
            X: DataFrame of features.

        Returns:
            self, with ``cols_to_keep`` set to the list of selected names.
        """
        # Put features and response side by side; concat aligns on the
        # index, and corr() leaves missing values out of each pairwise
        # computation, so only rows present in X contribute.
        df = pd.concat([X, self.response], axis=1)
        # Absolute correlation of every column with the response, which is
        # the last column of the combined frame.
        response_corr = df.corr()[df.columns[-1]].abs()
        strong = df.columns[response_corr > self.threshold]
        # The response correlates perfectly with itself; keep only names
        # that are actual feature columns of X.
        self.cols_to_keep = [c for c in strong if c in X.columns]
        return self

In [16]:
# 实例化特征选择器
ccc = CustomCorrelationChooser(threshold=.2, response=y)
ccc.fit(X)

CustomCorrelationChooser(cols_to_keep=['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4',
                                       'PAY_5'],
                         response=0        1
1        1
2        0
3        0
4        0
        ..
29995    0
29996    0
29997    1
29998    1
29999    1
Name: default payment next month, Length: 30000, dtype: int64,
                         threshold=0.2)

In [17]:
# 查看保留的变量
ccc.cols_to_keep

['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5']

In [18]:
# 查看转换的变量
ccc.transform(X).head()

Unnamed: 0,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5
0,2,2,-1,-1,-2
1,-1,2,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,-1,0,-1,0,0


将特征选择器应用到我们之前的决策树模型（流水线）中：

In [19]:
from copy import deepcopy

# Feature selector with no threshold yet; the grid search below supplies it.
ccc = CustomCorrelationChooser(response=y)

# Pipeline: correlation-based feature selection followed by the decision tree.
ccc_pipe = Pipeline([('correlation_select', ccc),
                     ('classifier', d_tree)])

# Depth candidates for the tree step: unlimited (None) plus every odd depth
# from 1 to 21, the same candidates the SelectKBest search further down uses.
tree_pipe_params = {'classifier__max_depth': [None, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21]}

# Work on a copy so tree_pipe_params stays reusable for other pipelines.
ccc_pipe_params = deepcopy(tree_pipe_params)

# Also search over the selector's correlation threshold.
ccc_pipe_params.update({'correlation_select__threshold': [0, .1, .2, .3]})

get_best_model_and_accuracy(ccc_pipe, ccc_pipe_params, X, y)

Best Accuracy:0.8207
Best Parameters:{'classifier__max_depth': 3, 'correlation_select__threshold': 0.2}
Average Time to Fit (s):0.16
Average Time to Score (s):0.003


看看选择了哪些特征

In [20]:
ccc = CustomCorrelationChooser(threshold=0.2, response=y)
ccc.fit(X)

ccc.cols_to_keep

['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5']

#### 假设检验

在特征选择中，假设检验的零假设为“特征与响应变量没有关系”。

p 值是介于 0 和 1 的小数，表示在零假设成立的前提下，观察到当前数据（或更极端数据）的概率。p 值越低，拒绝零假设的证据就越充分。所以这里，p 值越低，说明特征与响应（target）变量之间越可能存在关联。

下面的示例先用 SelectKBest 和 f_classif 计算每个特征的 p 值，然后列出 p 值小于 0.05 的特征。

In [21]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [22]:
# Fit the univariate F-test selector. Only the fitted p-values are inspected
# in this cell, so fit() is enough; the transformed matrix is not needed.
k_best = SelectKBest(f_classif, k=5)
k_best.fit(X, y)

# One row per feature, sorted so the smallest p-value comes first.
p_values = pd.DataFrame({'column': X.columns,
                         'p_value': k_best.pvalues_}).sort_values('p_value')

# Show the features whose p-value is below 0.05.
p_values[p_values['p_value'] < .05]

Unnamed: 0,column,p_value
5,PAY_0,0.0
6,PAY_2,0.0
7,PAY_3,0.0
8,PAY_4,1.899297e-315
9,PAY_5,1.126608e-279
10,PAY_6,7.29674e-234
0,LIMIT_BAL,1.302244e-157
17,PAY_AMT1,1.146488e-36
18,PAY_AMT2,3.166657e-24
20,PAY_AMT4,6.830942000000001e-23


将 SelectKBest 应用到我们的流水线中：

In [23]:
# Selector without a fixed k; the grid search below chooses it.
k_best = SelectKBest(f_classif)

# Pipeline: univariate feature selection followed by the decision tree.
select_k_pipe = Pipeline(steps=[
    ('k_best', k_best),
    ('classifier', d_tree),
])

# Start from a copy of the tree grid, then add the selector's k
# (1 to 22, or 'all') and set the depths to None plus the odd values 1 to 21.
select_k_best_pipe_params = deepcopy(tree_pipe_params)
select_k_best_pipe_params['k_best__k'] = [*range(1, 23), 'all']
select_k_best_pipe_params['classifier__max_depth'] = [None, *range(1, 22, 2)]

get_best_model_and_accuracy(select_k_pipe, select_k_best_pipe_params, X, y)

Best Accuracy:0.8213333333333334
Best Parameters:{'classifier__max_depth': 3, 'k_best__k': 5}
Average Time to Fit (s):0.147
Average Time to Score (s):0.003


看看选择了哪些特征

In [24]:
k_best = SelectKBest(f_classif, k=5)

In [25]:
p_values.head(5)

Unnamed: 0,column,p_value
5,PAY_0,0.0
6,PAY_2,0.0
7,PAY_3,0.0
8,PAY_4,1.899297e-315
9,PAY_5,1.126608e-279



### 基于模型的特征选择

#### SelectFromModel

SelectFromModel 与 SelectKBest 类似，但它不使用统计检验，而是依据机器学习模型的内部指标（例如决策树的特征重要性）来选择特征。

In [26]:
from sklearn.feature_selection import SelectFromModel

# Build a selector that ranks features by the decision tree's internal
# importance metric and keeps those that reach the 0.05 threshold.
# NOTE(review): the tree has no random_state, so the selected set may vary
# between runs; confirm whether that is acceptable.
select_from_model = SelectFromModel(DecisionTreeClassifier(), threshold=.05)

# Fit on the data and keep only the selected columns.
selected_X = select_from_model.fit_transform(X, y)
selected_X.shape

(30000, 7)