In [7]:
from sklearn.feature_extraction.text import CountVectorizer

### n元语法基本介绍(基于马尔科夫假设)

假设语句分词后为:['我', '司', '可', '办理', '正规发票', '保真', '增值税', '发票', '点数', '优惠']

#### 一元语法(词语之间条件独立)

$$
\begin{aligned}
& \quad P("我", "司", "可", "办理", "正规发票", "保真", "增值税", "发票", "点数", "优惠"|S)\\
&=P("我"|S) \times P("司"|S) \times P("可"|S) \times P("办理"|S) \times P("正规发票"|S) \times P("保真"|S) \times P("增值税"|S) \times P("发票"|S) \times P("点数"|S) \times P("优惠"|S)
\end{aligned}
$$

#### 二元语法bigram(一个词语依赖上一个词语)
$$
\begin{aligned}
& \quad P("我", "司", "可", "办理", "正规发票", "保真", "增值税", "发票", "点数", "优惠"|S)\\
&=P("我"|S) \times P("司"|S,"我") \times P("可"|S,"司") \times P("办理"|S,"可") \cdots \times P("优惠"|S,"点数")
\end{aligned}
$$

#### 三元语法trigram(一个词语依赖上两个词语)
$$
\begin{aligned}
& \quad P("我", "司", "可", "办理", "正规发票", "保真", "增值税", "发票", "点数", "优惠"|S)\\
&=P("我"|S) \times P("司"|S,"我") \times P("可"|S,"我","司") \times P("办理"|S,"司","可") \cdots \times P("优惠"|S,"发票","点数")
\end{aligned}
$$

若$n$过大,则会造成：
1. 参数空间过大
2. 数据稀疏严重

&emsp;&emsp;由于数据稀疏,有一些词或词组在语料中没有出现过,但这并不代表它们不可能出现.
平滑操作就是为那些没有出现过的词或词组也赋予一个较小的概率,常见的平滑操作有拉普拉斯平滑(详情见:概率图模型/朴素贝叶斯/基本方法_help.ipynb)

In [8]:
# Load the corpus: each line of data.txt (trailing newline kept) becomes one
# element of the list.
with open('data.txt', 'r', encoding='utf-8') as f:
    a = list(f)

# ngram_range=(min_n, max_n) selects which n-gram sizes are extracted:
#   ngram_range=(1, 1) --> unigrams only
#   ngram_range=(1, 2) --> unigrams and bigrams
#   ngram_range=(2, 3) --> bigrams and trigrams
# (1, 3) below extracts unigrams, bigrams and trigrams.
vectorizer = CountVectorizer(ngram_range=(1, 3))

In [9]:
# Learn the n-gram vocabulary from the corpus and build the count matrix
# (one row per element of `a`, one column per n-gram).
X = vectorizer.fit_transform(a)
type(X)  # sparse matrix

scipy.sparse.csr.csr_matrix

In [10]:
# Read the shape directly from the sparse matrix: (n_documents, n_features).
# Calling .toarray() first would materialise every zero entry as a dense
# array only to throw it away.
X.shape

(14, 825)

In [11]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases; the
# result is converted to a plain list so the display matches the old method.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
feature_names

['advance',
 'advance of',
 'advance of the',
 'after',
 'after just',
 'after just rained',
 'again',
 'again stood',
 'again stood on',
 'age',
 'age has',
 'age has reached',
 'age will',
 'age will be',
 'air',
 'air is',
 'air is fresh',
 'already',
 'already can',
 'already can remember',
 'also',
 'also once',
 'also once seemed',
 'alumni',
 'alumni in',
 'alumni in peace',
 'always',
 'always feel',
 'always feel oneself',
 'always special',
 'always special to',
 'an',
 'an arm',
 'an arm injection',
 'and',
 'and more',
 'and more found',
 'and new',
 'and new clothes',
 'appearance',
 'appearance of',
 'appearance of the',
 'are',
 'are always',
 'are always special',
 'are lot',
 'are lot of',
 'are necessary',
 'are necessary for',
 'are optional',
 'arm',
 'arm injection',
 'arm injection or',
 'as',
 'as if',
 'as if nothing',
 'as the',
 'as the advance',
 'at',
 'at his',
 'at his thrown',
 'at last',
 'at last dislike',
 'at the',
 'at the beginning',
 'at the moment

In [12]:
# Learned vocabulary: a dict mapping each n-gram to its feature (column) index.
vectorizer.vocabulary_




{'age': 9,
 'has': 259,
 'reached': 552,
 'the': 650,
 'end': 195,
 'of': 454,
 'beginning': 93,
 'word': 798,
 'age has': 10,
 'has reached': 264,
 'reached the': 553,
 'the end': 665,
 'end of': 198,
 'of the': 472,
 'the beginning': 659,
 'beginning of': 94,
 'of word': 478,
 'age has reached': 11,
 'has reached the': 265,
 'reached the end': 554,
 'the end of': 666,
 'end of the': 199,
 'of the beginning': 474,
 'the beginning of': 660,
 'beginning of word': 96,
 'may': 404,
 'be': 75,
 'guilty': 256,
 'in': 311,
 'his': 291,
 'seems': 580,
 'to': 733,
 'passing': 521,
 'lot': 393,
 'different': 167,
 'life': 377,
 'became': 86,
 'appearance': 39,
 'same': 568,
 'day': 150,
 'may be': 405,
 'be guilty': 80,
 'guilty in': 257,
 'in his': 314,
 'his seems': 295,
 'seems to': 583,
 'to passing': 748,
 'passing lot': 522,
 'lot of': 394,
 'of different': 461,
 'different life': 168,
 'life became': 378,
 'became the': 87,
 'the appearance': 657,
 'appearance of': 40,
 'the same': 682,
