# 第五章 词性（词类）和词性标注

In [1]:
%matplotlib inline
import nltk
from nltk import word_tokenizeize

## 词性和词性标注

使用词性标注器处理词的序列，为每个词附加一个词性标记

In [2]:
# Unambiguous case: tokenize the sentence, then tag every token with its part of speech.
text = word_tokenize("And now for something completely different")
nltk.pos_tag(text)

[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ')]

In [3]:
# Ambiguous case: "refuse" and "permit" each occur twice in this sentence,
# once used as a verb and once as a noun.
text = word_tokenize("They refuse to permit us to obtain the refuse permit")
nltk.pos_tag(text)

[('They', 'PRP'),
 ('refuse', 'VBP'),
 ('to', 'TO'),
 ('permit', 'VB'),
 ('us', 'PRP'),
 ('to', 'TO'),
 ('obtain', 'VB'),
 ('the', 'DT'),
 ('refuse', 'NN'),
 ('permit', 'NN')]

### 词性标注器的作用

表征词语的用法，为结构分析服务

In [6]:
# Wrap the lower-cased words of the Brown corpus in an nltk.Text, then look up
# words that are used in contexts similar to 'woman'.
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
text.similar('woman')
# Searching for 'woman' finds nouns, 'bought' finds verbs, 'over' finds
# prepositions, and 'the' finds determiners.

man time day year car moment world house family child country boy
state job place way war girl work word


## 表示已标注的标识符

已标注的标识符使用由标识符和标记组成的元组来表示

可以使用函数str2tuple从表示已标注的标识符的标准字符串创建这样的元组

In [8]:
# Parse a 'token/TAG' string into a (token, tag) tuple.
tagged_token = nltk.tag.str2tuple('fly/NN')
tagged_token

('fly', 'NN')

In [9]:
# Index 0 of the tuple holds the token; index 1 holds its tag.
tagged_token[0]

'fly'

从一个字符串构造一个已经标注标识符的链表

In [11]:
# Split the string on whitespace and convert each 'token/TAG' piece with
# str2tuple, giving a list of (token, tag) tuples.
sent = "This/AT is/IN a/AT wonderful/JJ day/NN"
[nltk.tag.str2tuple(t) for t in sent.split()]

[('This', 'AT'), ('is', 'IN'), ('a', 'AT'), ('wonderful', 'JJ'), ('day', 'NN')]

## 读取标注语料库

In [12]:
# Brown corpus viewed as a sequence of (word, tag) pairs.
nltk.corpus.brown.tagged_words()

[('The', 'AT'), ('Fulton', 'NP-TL'), ...]

In [13]:
# NPS Chat corpus viewed as a sequence of (word, tag) pairs.
print(nltk.corpus.nps_chat.tagged_words())

[('now', 'RB'), ('im', 'PRP'), ('left', 'VBD'), ...]


In [14]:
# Treebank corpus sample as (word, tag) pairs, using the default tagset.
nltk.corpus.treebank.tagged_words()

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ...]

In [15]:
# Same corpus with tagset='universal': tags are reported in the coarser
# universal form (e.g. NOUN).
nltk.corpus.treebank.tagged_words(tagset='universal')

[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ...]

### 常见词性

In [18]:
# Count how often each tag occurs in the "news" category of the Brown corpus
# and list the tags from most to least frequent.
from nltk.corpus import brown

brown_news_tagged = brown.tagged_words(categories='news')
tag_fd = nltk.FreqDist(tag for _, tag in brown_news_tagged)
tag_fd.most_common()

[('NN', 13162),
 ('IN', 10616),
 ('AT', 8893),
 ('NP', 6866),
 (',', 5133),
 ('NNS', 5066),
 ('.', 4452),
 ('JJ', 4392),
 ('CC', 2664),
 ('VBD', 2524),
 ('NN-TL', 2486),
 ('VB', 2440),
 ('VBN', 2269),
 ('RB', 2166),
 ('CD', 2020),
 ('CS', 1509),
 ('VBG', 1398),
 ('TO', 1237),
 ('PPS', 1056),
 ('PP$', 1051),
 ('MD', 1031),
 ('AP', 923),
 ('NP-TL', 741),
 ('``', 732),
 ('BEZ', 730),
 ('BEDZ', 716),
 ("''", 702),
 ('JJ-TL', 689),
 ('PPSS', 602),
 ('DT', 589),
 ('BE', 525),
 ('VBZ', 519),
 ('NR', 495),
 ('RP', 482),
 ('QL', 468),
 ('PPO', 412),
 ('WPS', 395),
 ('NNS-TL', 344),
 ('WDT', 343),
 ('BER', 328),
 ('WRB', 328),
 ('OD', 309),
 ('HVZ', 301),
 ('--', 300),
 ('NP$', 279),
 ('HV', 265),
 ('HVD', 262),
 ('*', 256),
 ('BED', 252),
 ('NPS', 215),
 ('BEN', 212),
 ('NN$', 210),
 ('DTI', 205),
 ('NP-HL', 186),
 ('ABN', 183),
 ('NN-HL', 171),
 ('IN-TL', 164),
 ('EX', 161),
 (')', 151),
 ('(', 148),
 ('JJR', 145),
 (':', 137),
 ('DTS', 136),
 ('JJT', 100),
 ('CD-TL', 96),
 ('NNS-HL', 92),
 ('

In [19]:
# Plot
# NOTE(review): this cell contains no code, and its recorded output is a
# dict_keys listing of tags rather than a figure; the original statement
# appears to be missing and should be restored.

dict_keys(['AT', 'NP-TL', 'NN-TL', 'JJ-TL', 'VBD', 'NR', 'NN', 'IN', 'NP$', 'JJ', '``', "''", 'CS', 'DTI', 'NNS', '.', 'RBR', ',', 'WDT', 'HVD', 'VBZ', 'CC', 'IN-TL', 'BEDZ', 'VBN', 'NP', 'BEN', 'TO', 'VB', 'RB', 'DT', 'PPS', 'DOD', 'AP', 'BER', 'HV', 'DTS', 'VBG', 'PPO', 'QL', 'JJT', 'ABX', 'NN-HL', 'VBN-HL', 'WRB', 'CD', 'MD', 'BE', 'JJR', 'VBG-TL', 'BEZ', 'NN$-TL', 'HVZ', 'ABN', 'PN', 'PPSS', 'PP$', 'DO', 'NN$', 'NNS-HL', 'WPS', '*', 'EX', 'VB-HL', ':', '(', ')', 'NNS-TL', 'NPS', 'JJS', 'RP', '--', 'BED', 'OD', 'BEG', 'AT-HL', 'VBG-HL', 'AT-TL', 'PPL', 'DOZ', 'NP-HL', 'NR$', 'DOD*', 'BEDZ*', ',-HL', 'CC-TL', 'MD*', 'NNS$', 'PPSS+BER', "'", 'PPSS+BEM', 'CD-TL', 'RBT', '(-HL', ')-HL', 'MD-HL', 'VBZ-HL', 'IN-HL', 'JJ-HL', 'PPLS', 'CD-HL', 'WPO', 'JJS-TL', 'ABL', 'BER-HL', 'PPS+HVZ', 'VBD-HL', 'RP-HL', 'MD*-HL', 'AP-HL', 'CS-HL', 'DT$', 'HVN', 'FW-IN', 'FW-DT', 'VBN-TL', 'NR-TL', 'NNS$-TL', 'FW-NN', 'HVG', 'DTX', 'OD-TL', 'BEM', 'RB-HL', 'PPSS+MD', 'NPS-HL', 'NPS$', 'WP$', 'NN-TL-HL', '

经常出现在名词之前的词性

新闻文本中最常见的动词

### 关于词性的条件频率表

给定词，按频率降序输出词性列表

In [None]:
# Given a word, list its part-of-speech tags from most to least frequent.
# `wsj` is defined here so the cell runs on a fresh kernel; it holds the
# Treebank sample as (word, tag) pairs using the universal tagset.
wsj = nltk.corpus.treebank.tagged_words(tagset='universal')
# Each (word, tag) pair is read as (condition, sample): the word is the
# condition, and its tags are counted under it.
cfdl = nltk.ConditionalFreqDist(wsj)
# most_common() orders the tags by descending count.
cfdl['yield'].most_common()

给定词性，按频率降序输出词列表