# 1.特征提取
 + ### 1.1 字典提取  [DictVectorizer]
  + 1.1.1 类别字段会转换成one-hot编码
  + 1.1.2 转换后的结果可以是二维数组（sparse=False），也可以是稀疏矩阵（默认，sparse=True）
  + 1.1.3 稀疏矩阵只记录二维数组中非0元素的位置和数值，这样可以节省空间

In [1]:
from sklearn.feature_extraction import DictVectorizer

# 1. Instantiate a DictVectorizer. sparse=False makes fit_transform return a
#    dense 2-D array instead of the default sparse matrix.
tranfer = DictVectorizer(sparse=False)

In [2]:
# Sample records: 'city' and 'detail' hold string (categorical) values,
# 'wendu' holds a numeric value.
dictDic = [
    {'city': '北京', 'wendu': 1, 'detail': '雾霾'},
    {'city': '珠海', 'wendu': 20, 'detail': '大海'},
    {'city': '本溪', 'wendu': -5, 'detail': '白雪'},
]

In [3]:
# 2. Fit on the sample dicts and transform them in one step. DictVectorizer
#    returns a sparse matrix by default; this instance was created with
#    sparse=False, so the result here is a dense 2-D array.
result = tranfer.fit_transform(dictDic)

In [4]:
# Print the encoded array: one row per input dict, one column per feature.
print(result)

[[ 1.  0.  0.  0.  0.  1.  1.]
 [ 0.  0.  1.  1.  0.  0. 20.]
 [ 0.  1.  0.  0.  1.  0. -5.]]


In [27]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Its replacement, get_feature_names_out(), returns an ndarray, so .tolist()
# keeps the printed output a plain Python list of column names.
print(tranfer.get_feature_names_out().tolist())

['city=北京', 'city=本溪', 'city=珠海', 'detail=大海', 'detail=白雪', 'detail=雾霾', 'wendu']


 + ### 1.2 文本提取  [CountVectorizer]
  + 1.2.1 英文提取+停用词
  + 1.2.2 中文提取+jieba分词

In [102]:
from sklearn.feature_extraction.text import CountVectorizer

# Words listed as stop words are excluded from the learned vocabulary.
stopword = ['my']
count = CountVectorizer(stop_words=stopword)

dictCount = ['I like python,I like my mother','I dislike .net,I dislike beijing']

# Learn the vocabulary and build the document-term count matrix (sparse).
# The default token_pattern only keeps tokens of two or more characters,
# which is why 'I' never shows up in the vocabulary.
countResult = count.fit_transform(dictCount)
print(countResult)            # sparse representation
print(countResult.toarray())  # same counts as a dense 2-D array
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# get_feature_names_out() returns an ndarray; .tolist() keeps the printed
# output a plain Python list.
print(count.get_feature_names_out().tolist())

  (0, 3)	1
  (0, 5)	1
  (0, 2)	2
  (1, 0)	1
  (1, 4)	1
  (1, 1)	2
[[0 0 2 1 0 1]
 [1 2 0 0 1 0]]
['beijing', 'dislike', 'like', 'mother', 'net', 'python']


In [6]:
import jieba
def cuttext(text):
    """Segment *text* with jieba and return the words joined by single spaces.

    Chinese text has no word delimiters, so it has to be segmented before it
    is handed to a whitespace/punctuation based text vectorizer.
    """
    words = jieba.cut(text)
    return " ".join(words)

In [101]:
chineseCount = CountVectorizer()

dictchineseCount = ['我爱北京天安门,天安门上太阳升','伟大领袖毛主席,指引我们向前进']

# Segment every sentence into space-separated words first; CountVectorizer
# cannot split unsegmented Chinese text into words on its own.
temp = [cuttext(s) for s in dictchineseCount]

# Single-character words are dropped by the default token_pattern, so only
# words of two or more characters end up in the vocabulary.
countchineseResult = chineseCount.fit_transform(temp)
print(countchineseResult)            # sparse representation
print(countchineseResult.toarray())  # same counts as a dense 2-D array
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# get_feature_names_out() returns an ndarray; .tolist() keeps the printed
# output a plain Python list.
print(chineseCount.get_feature_names_out().tolist())

  (0, 4)	1
  (0, 3)	2
  (0, 2)	1
  (1, 1)	1
  (1, 5)	1
  (1, 6)	1
  (1, 7)	1
  (1, 0)	1
[[0 0 1 2 1 0 0 0]
 [1 1 0 0 0 1 1 1]]
['伟大领袖', '前进', '北京', '天安门', '太阳升', '我们', '指引', '毛主席']


+ ### 1.3 文本提取  [TfidfVectorizer]

In [103]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [106]:
tf = TfidfVectorizer()

textDic = ['我爱北京天安门,天安门上太阳升','伟大领袖毛主席,指引我们向前进']

# Segment every sentence into space-separated words before vectorizing.
temp = [cuttext(s) for s in textDic]

# Learn the vocabulary and compute TF-IDF weights; with the default settings
# each row of the result is L2-normalized.
result = tf.fit_transform(temp)
print(result.toarray())
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# get_feature_names_out() returns an ndarray; .tolist() keeps the printed
# output a plain Python list.
print(tf.get_feature_names_out().tolist())

[[ 0.          0.          0.40824829  0.81649658  0.40824829  0.          0.
   0.        ]
 [ 0.4472136   0.4472136   0.          0.          0.          0.4472136
   0.4472136   0.4472136 ]]
['伟大领袖', '前进', '北京', '天安门', '太阳升', '我们', '指引', '毛主席']


# 2.特征预处理
 + ### 2.1 归一化
 + ### 2.2 标准化

In [1]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler

In [5]:
# Min-max normalization: with default settings each feature (column) is
# rescaled to the [0, 1] range. Note this rebinds `tranfer`, which earlier in
# the notebook referred to the DictVectorizer.
tranfer = MinMaxScaler()

In [6]:
# Sample numeric matrix: 3 samples (rows) x 3 features (columns). The first
# column contains a large outlier (99999).
textDic = [
    [99999, 15, 9],
    [26, 84, 5],
    [42, 62, 1],
]

In [7]:
# Fit the scaler on textDic and scale it in one step; every column is scaled
# independently of the others.
result = tranfer.fit_transform(textDic)

In [8]:
# Notebook echo: display the scaled array produced by the previous cell.
result

array([[  1.00000000e+00,   0.00000000e+00,   1.00000000e+00],
       [  0.00000000e+00,   1.00000000e+00,   5.00000000e-01],
       [  1.60043212e-04,   6.81159420e-01,   0.00000000e+00]])

In [9]:
# Standardization: with default settings each feature (column) is shifted to
# zero mean and scaled to unit variance.
tranfer = StandardScaler()

# Fit on textDic and transform it in one step; this overwrites the previous
# `result`.
result = tranfer.fit_transform(textDic)

In [10]:
# Notebook echo: display the standardized array produced by the previous cell.
result

array([[ 1.41421355, -1.34357927,  1.22474487],
       [-0.70727654,  1.05401477,  0.        ],
       [-0.70693701,  0.2895645 , -1.22474487]])

# 3.特征选择
 + ### 3.1 过滤
     + #### 3.1.1 基于方差
     + #### 3.1.2 基于相关系数

In [11]:
from sklearn.feature_selection import VarianceThreshold