利用sklearn中feature_extraction.text的CountVectorizer进行OneHot向量化<br>
OneHot向量化基于词袋模型：词典中的每个词对应向量的一个维度；CountVectorizer默认统计的是词频，设置binary=True时才是严格的0/1表示

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

#### 英文文本的处理

In [5]:
"""
CountVectorizer(input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None,
                 lowercase=True, preprocessor=None, tokenizer=None,
                 stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1), analyzer='word',
                 max_df=1.0, min_df=1, max_features=None,
                 vocabulary=None, binary=False, dtype=np.int64)
常用参数说明：
input：序列对象，序列元素可以为文件地址、文件指针或文本
lowercase: 对于英文，默认先进行lower处理
tokenizer: 分词器，None或callable对象，在analyzer='word'时可覆写
stop_words: 停用词
token_pattern: 分词正则表达式，作用和分词器一样, 默认为r"(?u)\b\w\w+\b"，即至少两个字符
ngram_range: 采用n-gram模型的范围
analyzer: word级别或char级别, 具体有 单词为单位的word， 字符为单位的char， 以及先预先进行word再在word范围内进行char级别的n-gram划分
max_df: 最大文档频率， 若vocabulary=None忽略该参数
min_df: 最小文档频率， 若vocabulary=None忽略该参数
max_features：最大特征个数
vocabulary：可指定词典{word: index}, 从而只对感兴趣的词语进行向量化，默认对所有的input内容进行向量化
binary: 默认False, 即count会累计计数；若为True，出现则为1，不计具体count
"""
corpus = ["I come to China to travel",
    "This is a car polupar in China",
    "I love tea and Apple ",
    "The work is to write some papers in science"]


vector = CountVectorizer()
result = vector.fit_transform(corpus)

In [17]:
# Feature names, i.e. the words of the learned vocabulary (a method of the
# fitted vectorizer).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# get_feature_names_out() replaces it and returns a NumPy array, so it is
# converted to a plain list to keep the printed form unchanged.
feature_names = vector.get_feature_names_out().tolist()
print(feature_names)

# Pair every word with its column index in the count matrix.
features_list = list(enumerate(feature_names))
print(features_list)

['and', 'apple', 'car', 'china', 'come', 'in', 'is', 'love', 'papers', 'polupar', 'science', 'some', 'tea', 'the', 'this', 'to', 'travel', 'work', 'write']
[(0, 'and'), (1, 'apple'), (2, 'car'), (3, 'china'), (4, 'come'), (5, 'in'), (6, 'is'), (7, 'love'), (8, 'papers'), (9, 'polupar'), (10, 'science'), (11, 'some'), (12, 'tea'), (13, 'the'), (14, 'this'), (15, 'to'), (16, 'travel'), (17, 'work'), (18, 'write')]


In [15]:
# `result` is a SciPy CSR sparse matrix; each printed entry reads
#   (document_id, word_id)  count
# The default token_pattern requires at least two characters, so the
# single-letter tokens "I" and "a" are ignored.
print(result)

  (0, 16)	1
  (0, 3)	1
  (0, 15)	2
  (0, 4)	1
  (1, 5)	1
  (1, 9)	1
  (1, 2)	1
  (1, 6)	1
  (1, 14)	1
  (1, 3)	1
  (2, 1)	1
  (2, 0)	1
  (2, 12)	1
  (2, 7)	1
  (3, 10)	1
  (3, 8)	1
  (3, 11)	1
  (3, 18)	1
  (3, 17)	1
  (3, 13)	1
  (3, 5)	1
  (3, 6)	1
  (3, 15)	1


In [18]:
# Dense matrix form of the sparse counts: one row per document, one column
# per vocabulary word.
print(result.toarray())

[[0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 2 1 0 0]
 [0 0 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0]
 [1 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 1 1 0 1 0 1 1 0 1 0 1 0 1 1]]


In [43]:
# ngram_range controls which n-gram sizes are counted as features.
# (1, 2) keeps the single words and also adds every pair of consecutive words.
vector1 = CountVectorizer(analyzer='word', ngram_range=(1, 2))
result1 = vector1.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() and convert the returned array to a plain list so
# the printed (index, n-gram) pairs keep their original form.
features_list1 = list(enumerate(vector1.get_feature_names_out().tolist()))
print(features_list1)

[(0, 'and'), (1, 'and apple'), (2, 'apple'), (3, 'car'), (4, 'car polupar'), (5, 'china'), (6, 'china to'), (7, 'come'), (8, 'come to'), (9, 'in'), (10, 'in china'), (11, 'in science'), (12, 'is'), (13, 'is car'), (14, 'is to'), (15, 'love'), (16, 'love tea'), (17, 'papers'), (18, 'papers in'), (19, 'polupar'), (20, 'polupar in'), (21, 'science'), (22, 'some'), (23, 'some papers'), (24, 'tea'), (25, 'tea and'), (26, 'the'), (27, 'the work'), (28, 'this'), (29, 'this is'), (30, 'to'), (31, 'to china'), (32, 'to travel'), (33, 'to write'), (34, 'travel'), (35, 'work'), (36, 'work is'), (37, 'write'), (38, 'write some')]


In [36]:
# Dense count matrix over the unigram + bigram vocabulary of `vector1`.
result1.toarray()

array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1,
        1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1]], dtype=int64)

In [20]:
# Effect of max_features: only the 10 most frequent terms across the corpus
# are kept, which truncates the dimension of the resulting vectors.
vector2 = CountVectorizer(max_features=10)
result2 = vector2.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() and convert the returned array to a plain list so
# the printed (index, word) pairs keep their original form.
features_list = list(enumerate(vector2.get_feature_names_out().tolist()))
print(features_list)

[(0, 'and'), (1, 'china'), (2, 'in'), (3, 'is'), (4, 'some'), (5, 'tea'), (6, 'the'), (7, 'this'), (8, 'to'), (9, 'travel')]


In [21]:
# Sparse entries, (document_id, word_id)  count, over the vocabulary that was
# limited with max_features=10.
print(result2)

  (0, 9)	1
  (0, 1)	1
  (0, 8)	2
  (1, 2)	1
  (1, 3)	1
  (1, 7)	1
  (1, 1)	1
  (2, 0)	1
  (2, 5)	1
  (3, 4)	1
  (3, 6)	1
  (3, 2)	1
  (3, 3)	1
  (3, 8)	1


In [22]:
# The resulting array has only 10 dimensions (columns) per document.
print(result2.toarray())

[[0 1 0 0 0 0 0 0 2 1]
 [0 1 1 1 0 0 0 1 0 0]
 [1 0 0 0 0 1 0 0 0 0]
 [0 0 1 1 1 0 1 0 1 0]]


#### 中文文本的处理

中文相较于英文的一个显著区别在于：英文单词之间以空格分隔，默认的token_pattern=r"(?u)\b\w\w+\b"即可直接切分出单词；而中文的词与词之间没有分隔符，所以在使用sklearn进行向量化之前要先进行中文分词（这里使用jieba），并用空格把分词结果连接起来

In [23]:
import jieba
from sklearn.feature_extraction.text import CountVectorizer

In [24]:
# Small Chinese corpus: one raw, not yet segmented sentence per entry.
corpus_cn = ["今天天气不错",
    "最近工作非常忙",
    "中文自然语言处理非常男难",
    "今天是星期二"]

def word_segment(sent):
    """Segment ``sent`` with jieba and return the tokens joined by single spaces."""
    tokens = jieba.cut(sent)
    return " ".join(tokens)

# Segment every sentence so that the words end up separated by spaces.
corpus_cn_seg = [word_segment(sentence) for sentence in corpus_cn]
print(corpus_cn_seg)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/rk/nnl9yhm55kb6325ckffkn6hw0000gn/T/jieba.cache
Loading model cost 0.680 seconds.
Prefix dict has been built succesfully.


['今天天气 不错', '最近 工作 非常 忙', '中文 自然语言 处理 非常 男难', '今天 是 星期二']


In [25]:
# Vectorize the space-separated, pre-segmented Chinese sentences.
vector_cn = CountVectorizer(token_pattern=r"(?u)\b\w+\b")   # override token_pattern so that single-character tokens also match
result_cn = vector_cn.fit_transform(corpus_cn_seg)

In [26]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it and returns a NumPy array, so convert it to a plain list to keep
# the printed form unchanged.
print(vector_cn.get_feature_names_out().tolist())

['不错', '中文', '今天', '今天天气', '处理', '工作', '忙', '星期二', '是', '最近', '男难', '自然语言', '非常']


In [27]:
# Sparse entries, (document_id, word_id)  count, for the Chinese corpus.
print(result_cn)

  (0, 0)	1
  (0, 3)	1
  (1, 6)	1
  (1, 12)	1
  (1, 5)	1
  (1, 9)	1
  (2, 10)	1
  (2, 4)	1
  (2, 11)	1
  (2, 1)	1
  (2, 12)	1
  (3, 7)	1
  (3, 8)	1
  (3, 2)	1


In [28]:
# Dense matrix form: one row per sentence, one column per vocabulary word.
print(result_cn.toarray())

[[1 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 1 0 0 1 0 0 1]
 [0 1 0 0 1 0 0 0 0 0 1 1 1]
 [0 0 1 0 0 0 0 1 1 0 0 0 0]]


**利用NLTK的FreqDist统计下词频**

In [31]:
import nltk

# Flatten the English corpus into a single list of whitespace-separated
# tokens. The list is built directly with a nested comprehension instead of
# calling extend() inside a comprehension for its side effect, which would
# also allocate a throwaway list of None values.
corpus_list = [word for sent in corpus for word in sent.split()]
print(corpus_list)

['I', 'come', 'to', 'China', 'to', 'travel', 'This', 'is', 'a', 'car', 'polupar', 'in', 'China', 'I', 'love', 'tea', 'and', 'Apple', 'The', 'work', 'is', 'to', 'write', 'some', 'papers', 'in', 'science']


In [34]:
# Count every token, then list the (word, count) pairs from most to least
# frequent; words with equal counts keep their first-seen order.
word_freq = nltk.FreqDist(corpus_list)
sorted(word_freq.items(), key=lambda pair: pair[1], reverse=True)

[('to', 3),
 ('I', 2),
 ('China', 2),
 ('is', 2),
 ('in', 2),
 ('come', 1),
 ('travel', 1),
 ('This', 1),
 ('a', 1),
 ('car', 1),
 ('polupar', 1),
 ('love', 1),
 ('tea', 1),
 ('and', 1),
 ('Apple', 1),
 ('The', 1),
 ('work', 1),
 ('write', 1),
 ('some', 1),
 ('papers', 1),
 ('science', 1)]