# 此脚本用于学习研究特征工程相关知识

* 特征工程基本包括三个部分：特征提取、特征创造、特征选取
* 此脚本主要研究的是特征选取部分：过滤法、嵌入法、包装法、降维运算（下个脚本）

## 一、过滤法特征选取

 ### 1.1方差过滤 sklearn.feature_selection .VarianceThreshold

* 如果一个特征本身，方差很小，表示在这个特征上区分度不高。我们首要是要消除方差为0或极小的特征
* 适用于需要遍历特征或升维的算法

In [31]:
from sklearn.feature_selection import VarianceThreshold
import pandas as pd
import numpy as np
data = pd.read_csv(r'C:\Users\Mypc\Desktop\菜菜\源文件\03-数据预处理与特征工程\digit recognizor.csv',index_col=0)
data.shape

(42000, 784)

In [32]:
selector = VarianceThreshold() #默认方差为0
data_var_0 = selector.fit_transform(data)
data_var_0.shape  #可见，剔除方差为0的特征后，该样本特征数从784-->708 

(42000, 708)

In [44]:
# 假设我需要剔除一般的样本特征数，可使用中位数来做为方差阈值
data_var_median = VarianceThreshold(np.median(data.var().values)).fit_transform(data)
data_var_median.shape 

(42000, 392)

In [16]:
#KNN vs 随机森林在不同方差过滤效果下的对比
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import cross_val_score

In [61]:
x = data.iloc[:,1:]
y = data.iloc[:,0]

x_var_median =  VarianceThreshold(np.median(x.var().values)).fit_transform(x)

TypeError: 'tuple' object is not callable

In [18]:
# before filtering
#======【TIME WARNING：35mins +】======#
cross_val_score(KNN(),x,y,cv=5).mean()


KeyboardInterrupt: 

In [None]:
#python中的魔法命令，可以直接使用%%timeit来计算运行这个cell中的代码所需的时间
#为了计算所需的时间，需要将这个cell中的代码运行很多次（通常是7次）后求平均值，因此运行%%timeit的时间会/远远超过cell中的代码单独运行的时间
#======【TIME WARNING：4 hours】======#
%%timeit
cross_val_score(KNN(),X,y,cv=5).mean()

In [None]:
# after filtered
#======【TIME WARNING：20 mins+】======#
cross_val_score(KNN(),x_var_median,y,cv=5).mean()

#======【TIME WARNING：2 hours】======#
%%timeit
cross_val_score(KNN(),x_var_median,y,cv=5).mean()

In [45]:
# before filtering
cross_val_score(RFC(n_estimators=10,random_state=0),x,y,cv=5).mean()

1.0

In [20]:
# after filtered
cross_val_score(RFC(n_estimators=10,random_state=0),x_var_median,y,cv=5).mean()

1.0

#### 3.2相关性过滤

* 卡方过滤

In [39]:
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score #
from sklearn.feature_selection import SelectKBest #feature_selection.SelectKBest这个可以输入”评分标准“来选出前K个分数最高的特征的类，我们可以借此除去最可能独立于标签，与我们分类目的无关的特征。
from sklearn.feature_selection import chi2 #卡方检验类feature_selection.chi2计算每个非负特征和标签之间的卡方统计量

#假设只需要300个特征
x_chi  = SelectKBest(chi2,k=300).fit_transform(x_var_median,y)
x_chi.shape

(42000, 300)

In [43]:
cross_val_score(RFC(n_estimators=2,random_state=99999),x_chi,y,cv=5).mean()#验证模型效果如何

1.0

In [29]:
##怎么高效地选择K？？
kafang_value,p_value = chi2(x_var_median,y)
kafang_value

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na

In [30]:
p_value

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na

In [35]:
k = kafang_value.shape[0] - (p_value > 0.05).sum() #只挑选p小于0.05的K个数

  """Entry point for launching an IPython kernel.


In [36]:
k

391

In [38]:
X_fschi = SelectKBest(chi2, k=391).fit_transform(x_var_median, y)
cross_val_score(RFC(n_estimators=10,random_state=0),X_fschi,y,cv=5).mean()

1.0

In [46]:
from sklearn.feature_selection import f_classif
F,pvalues_f =f_classif(x_var_median,y)

  msb = ssbn / float(dfbn)


In [47]:
F

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na

In [48]:
pvalues_f

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na

In [49]:
k = F.shape[0] - (pvalues_f > 0.05).sum()
k

  """Entry point for launching an IPython kernel.


391

In [50]:
F.shape[0]

391

In [52]:
x_fsf = SelectKBest(f_classif,k=k).fit_transform(x_var_median,y)
cross_val_score(RFC(n_estimators=10,random_state=0),x_fsf,y,cv=5).mean()

  msb = ssbn / float(dfbn)


1.0

#### 3.3互信息法

In [53]:
from sklearn.feature_selection import mutual_info_classif,mutual_info_regression
mic = mutual_info_classif(x_var_median,y)
k = mic.shape[0] - sum(mic <= 0)
k

391

In [54]:
X_fsmic= SelectKBest(mutual_info_classif,k).fit_transform(x_var_median,y)
cross_val_score(RFC(n_estimators=10,random_state=0),X_fsmic,y,cv=5).mean()

1.0

## 四、嵌入法

* 嵌入法是一种让算法自己决定使用哪些特征的方法，即特征选择和算法训练同时进行
* 在使用嵌入法时，我们先用某些机器学习的算法和模型进行训练，得到各个特征的权值系数，根据权值系数从大到小选择特征
* 无关的特征（需要相关性过滤的特征）和无区分度的特征（需要方差过滤的特征）都会因为缺乏对模型的贡献而被删除掉，可谓是过滤法的进化版

In [56]:
from sklearn.feature_selection import SelectFromModel

* SelectFromModel是一个元变换器，可以与任何在拟合后具有coef_,feature_importances_属性或 
* 参数中可选惩罚项的评估器一起使用：随机森林和树模型就具有feature_importances_属性；
* 逻辑回归就带有L1和L2惩罚项，线性支持向量机也支持L2惩罚项
* 对于有feature_importances_的模型来说，若重要性低于提供的阈值参数，则认为这些特征不重要并被移除。阈值范围：【0,1】
* 而对于使用惩罚项的模型来说，正则化惩罚项越大，特征在模型中对应的系数就会越小。当正则化惩罚项大到一定的程度的时候，部分特征系数会变成0。
* 但是我们会发现一部分特征系数会更容易先变成0，这部分系数就是可以筛掉的。也就是说，我们选择特征系数较大的特征。

In [57]:
RFC_ = RFC(n_estimators =10,random_state=0)

In [59]:
X_embedded = SelectFromModel(RFC_,threshold=0.00001).fit_transform(x,y)
X_embedded.shape



(42000, 0)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
RFC_.fit(x,y).feature_importances_
threshold = np.linspace(0,(RFC_.fit(x,y).feature_importances_).max(),20)
score = []
for i in threshold:
    X_embedded = SelectFromModel(RFC_,threshold=i).fit_transform(x,y)
    once = cross_val_score(RFC_,X_embedded,y,cv=5).mean()
    score.append(once)
plt.plot(threshold,score)
plt.show()