# 词性标注器

In [1]:
import nltk
text = nltk.word_tokenize("And now for something completely different")
print(nltk.pos_tag(text))
nltk.help.upenn_tagset('RB')

[('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'), ('completely', 'RB'), ('different', 'JJ')]
RB: adverb
    occasionally unabatingly maddeningly adventurously professedly
    stirringly prominently technologically magisterially predominately
    swiftly fiscally pitilessly ...


# 标注语料库-表示已标注的标识符

In [2]:
tagged_token = nltk.tag.util.str2tuple('fly/NN')#由词性标注字符串构建元组
print(tagged_token[1])
sent = '''The/AT grand/JJ jury/NN commented/VBD on/IN'''
print([nltk.tag.util.str2tuple(t) for t in sent.split()])

NN
[('The', 'AT'), ('grand', 'JJ'), ('jury', 'NN'), ('commented', 'VBD'), ('on', 'IN')]


# 标注语料库-读取已标注的语料库

ADJ 形容词 new, good, high, special, big, local

ADV 动词 really, already, still, early, now

CNJ 连词 and, or, but, if, while, although

DET 限定词 the, a, some, most, every, no

EX 存在量词 there, there's

FW 外来词 dolce, ersatz, esprit, quo, maitre

MOD 情态动词 will, can, would, may, must, should

N 名词 year, home, costs, time, education

NP 专有名词 Alison, Africa, April, Washington

NUM 数词 twenty-four, fourth, 1991, 14:24

PRO 代词 he, their, her, its, my, I, us

P 介词 on, of, at, with, by, into, under

TO 词 to to

UH 感叹词 ah, bang, ha, whee, hmpf, oops

V 动词 is, has, get, do, make, see, run

VD 过去式 said, took, told, made, asked

VG 现在分词 making, going, playing, working

VN 过去分词 given, taken, begun, sung

WH Wh 限定词 who, which, when, what, where, how

In [3]:
from nltk.corpus import brown
from matplotlib import pyplot as plt
print(brown.tagged_words(tagset='universal'))
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
print(tag_fd.keys())
tag_fd.plot(cumulative=True)

[('The', 'DET'), ('Fulton', 'NOUN'), ...]
dict_keys(['DET', 'NOUN', 'ADJ', 'VERB', 'ADP', '.', 'ADV', 'CONJ', 'PRT', 'PRON', 'NUM', 'X'])


<Figure size 640x480 with 1 Axes>

# 标注语料库-名词
名词出现在限定词和形容词之后，包括数字形容词(数词，标注为 N U M )。

In [4]:
word_tag_pairs = nltk.bigrams(brown_news_tagged)
list(nltk.FreqDist(a[1] for (a, b) in word_tag_pairs if b[1] == 'NOUN'))

['DET',
 'NOUN',
 'ADJ',
 'VERB',
 'ADP',
 'CONJ',
 '.',
 'NUM',
 'ADV',
 'PRT',
 'X',
 'PRON']

# 标注语料库-动词

按频率排序所有动词

In [12]:
wsj = brown.tagged_words(categories='news',tagset='universal')
word_tag_fd=nltk.FreqDist(wsj)
vword=[word + "/" + tag for (word, tag) in word_tag_fd if tag.startswith('V')]
print (vword[:10])
cfd1=nltk.ConditionalFreqDist(wsj)
print (cfd1['yield'].keys())

['said/VERB', 'produced/VERB', 'took/VERB', 'had/VERB', 'deserves/VERB', 'was/VERB', 'conducted/VERB', 'been/VERB', 'charged/VERB', 'investigate/VERB']
dict_keys(['VERB', 'NOUN'])


# 标注语料库-未简化的标记

找出最频繁的名词标记的程序

In [30]:
def findtags(tag_prefix, tagged_text):
    cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in tagged_text \
                                   if tag.startswith(tag_prefix))
    return dict((tag,list(cfd[tag].keys())[:5]) for tag in cfd.conditions())

tagdict = findtags('N', brown.tagged_words(categories='news'))
n = 0
for tag in sorted(tagdict):
    print(tag,tagdict[tag])
    n=n+1
    if n==10:
        break

NN ['investigation', 'primary', 'election', 'evidence', 'place']
NN$ ["ordinary's", "court's", "mayor's", "wife's", "governor's"]
NN$-HL ["Golf's", "Navy's"]
NN$-TL ["Department's", "Commissioner's", "President's", "Party's", "Mayor's"]
NN-HL ['Merger', 'jail', 'Construction', 'fund', 'sp.']
NN-NC ['ova', 'eva', 'aya']
NN-TL ['County', 'Jury', 'City', 'Committee', 'Court']
NN-TL-HL ['Mayor', 'Commissioner', 'City', 'Oak', 'Grove']
NNS ['irregularities', 'presentments', 'thanks', 'reports', 'voters']
NNS$ ["taxpayers'", "children's", "members'", "women's", "years'"]


# 标注语料库-已标注的语料库

假设我们正在研究词 often，想看看它是如何在文本中使用的。我们可以试着看看跟在 often 后面的词汇

请注意 often 后面最高频率的词性是动词。名词从来没有在这个位置出现

In [36]:
brown_learned_text = brown.tagged_words(categories='learned',tagset='universal')
tags=[b[1] for (a, b) in nltk.bigrams(brown_learned_text) if a[0] == 'often']
fd = nltk.FreqDist(tags)
fd.tabulate()

VERB  ADV  ADP  ADJ    .  PRT 
  37    8    7    6    4    2 


In [40]:
#使用POS标记寻找三词短语
def process(sentence):
    for (w1,t1), (w2,t2), (w3,t3) in nltk.trigrams(sentence):
        if (t1.startswith('V') and t2 == 'TO' and t3.startswith('V')):
            print (w1, w2, w3)

n = 0
for tagged_sent in brown.tagged_sents():
    process(tagged_sent)
    n=n+1
    if n==100:
        break

combined to achieve
continue to place
serve to protect
wanted to wait
allowed to place
expected to become
expected to approve
expected to make
intends to make
seek to set
like to see
