In [94]:
from torchtext.vocab import vocab, build_vocab_from_iterator
from collections import Counter, OrderedDict
import spacy

In [95]:
# Load the spaCy English pipeline; in this cell it is used only to tokenize.
nlp_en = spacy.load('en_core_web_sm')

# Sample passage to tokenize (punctuation is already space-separated).
# NOTE(review): the passage opens with "ust" - presumably a truncated "Just";
# left untouched so the text matches what was originally tokenized.
txt = "ust as you need air to breathe , you need opportunity to succeed . It takes more than just breathing in the fresh air of opportunity , however . You must make use of that opportunity . That's not up to the opportunity . That's up to you . It doesn't matter what \" floor \" the opportunity is on . What matters is what you do with it ."

# Keep just the surface string of every token spaCy produces.
split_lst = [token.text for token in nlp_en(txt)]
split_lst

['ust',
 'as',
 'you',
 'need',
 'air',
 'to',
 'breathe',
 ',',
 'you',
 'need',
 'opportunity',
 'to',
 'succeed',
 '.',
 'It',
 'takes',
 'more',
 'than',
 'just',
 'breathing',
 'in',
 'the',
 'fresh',
 'air',
 'of',
 'opportunity',
 ',',
 'however',
 '.',
 'You',
 'must',
 'make',
 'use',
 'of',
 'that',
 'opportunity',
 '.',
 'That',
 "'s",
 'not',
 'up',
 'to',
 'the',
 'opportunity',
 '.',
 'That',
 "'s",
 'up',
 'to',
 'you',
 '.',
 'It',
 'does',
 "n't",
 'matter',
 'what',
 '"',
 'floor',
 '"',
 'the',
 'opportunity',
 'is',
 'on',
 '.',
 'What',
 'matters',
 'is',
 'what',
 'you',
 'do',
 'with',
 'it',
 '.']

In [96]:
# Tally how often each token occurs; keys stay in first-seen order.
counter = Counter(split_lst)
# Copy the (token, count) pairs into an OrderedDict, preserving that order,
# because an ordered mapping is what vocab() is given further down.
ordered_dict = OrderedDict(counter.items())
ordered_dict

OrderedDict([('ust', 1),
             ('as', 1),
             ('you', 4),
             ('need', 2),
             ('air', 2),
             ('to', 4),
             ('breathe', 1),
             (',', 2),
             ('opportunity', 5),
             ('succeed', 1),
             ('.', 7),
             ('It', 2),
             ('takes', 1),
             ('more', 1),
             ('than', 1),
             ('just', 1),
             ('breathing', 1),
             ('in', 1),
             ('the', 3),
             ('fresh', 1),
             ('of', 2),
             ('however', 1),
             ('You', 1),
             ('must', 1),
             ('make', 1),
             ('use', 1),
             ('that', 1),
             ('That', 2),
             ("'s", 2),
             ('not', 1),
             ('up', 2),
             ('does', 1),
             ("n't", 1),
             ('matter', 1),
             ('what', 2),
             ('"', 2),
             ('floor', 1),
             ('is', 2),
             ('on', 1),
             ('What', 1),
             ('matters', 1),
             ('do', 1),
             ('with', 1),
             ('it', 1)])

In [97]:
# Factory method for creating a vocab object which maps tokens to indices.
# min_freq – The minimum frequency needed to include a token in the vocabulary.
v1 = vocab(ordered_dict, min_freq=2)  # keep only tokens seen at least twice (vocab()'s own default is min_freq=1)
v1

Vocab()

In [98]:
# The length of the vocab, i.e. how many tokens passed the min_freq=2 filter.
len(v1)

16

In [99]:
# Dictionary mapping tokens to indices ("stoi" = string-to-index).
v1.get_stoi()

{'to': 3,
 'you': 0,
 'need': 1,
 'air': 2,
 ',': 4,
 'opportunity': 5,
 '.': 6,
 'It': 7,
 'the': 8,
 'of': 9,
 'That': 10,
 "'s": 11,
 'up': 12,
 'what': 13,
 '"': 14,
 'is': 15}

In [100]:
# List mapping indices to tokens ("itos" = index-to-string).
v1.get_itos()  # a token's position in this list equals its value in v1.get_stoi()

['you',
 'need',
 'air',
 'to',
 ',',
 'opportunity',
 '.',
 'It',
 'the',
 'of',
 'That',
 "'s",
 'up',
 'what',
 '"',
 'is']

In [102]:
# Indexing with [] looks up exactly one token.
print(v1['is'])
# Calling the vocab looks up a whole list of tokens at once.
print(v1(['is', 'what']))
# This token is not in the vocabulary and no default index has been set yet,
# so the lookup raises RuntimeError. Catch it and print the message so the
# cell still demonstrates the failure without aborting "Restart & Run All".
try:
    print(v1(['out of vocal']))
except RuntimeError as err:
    print(f"RuntimeError: {err}")

15
[15, 13]


RuntimeError: Token out of vocal not found and default index is not set

In [103]:
unk_token = '<unk>'  # marker for low-frequency words or words missing from the vocabulary
pad_token = '<pad>'  # marker used for padding
begin_token = 'SOS'  # start-of-sentence marker (defined here, not inserted in this cell)
end_token = 'EOS'  # end-of-sentence marker (defined here, not inserted in this cell)

# insert_token(token, index):
#   token – The token used to lookup the corresponding index.
#   index – The index corresponding to the associated token.
# Put <unk> at index 0 and <pad> at index 1. insert_token raises if the token
# is already in the vocab, so each call is guarded to keep this cell safe to
# re-run; on a first run the effect is identical to two unconditional inserts.
for special_index, special_token in enumerate((unk_token, pad_token)):
    if special_token not in v1:
        v1.insert_token(special_token, special_index)
v1.get_stoi()

{'<unk>': 0,
 '<pad>': 1,
 'to': 5,
 'you': 2,
 'need': 3,
 'air': 4,
 ',': 6,
 'opportunity': 7,
 '.': 8,
 'It': 9,
 'the': 10,
 'of': 11,
 'That': 12,
 "'s": 13,
 'up': 14,
 'what': 15,
 '"': 16,
 'is': 17}

In [104]:
# -1 is not the index of any real token (indices start at 0), so an
# out-of-vocabulary result is easy to tell apart from a genuine hit.
# NOTE(review): if these indices are later fed to an embedding lookup,
# presumably v1[unk_token] is the better default — confirm with the author.
default_index = -1
# index – Value of default index. This index will be returned when OOV token is queried.
v1.set_default_index(index=default_index)

In [105]:
# With a default index set, an unknown token no longer raises; it maps to default_index.
print(v1(['out of vocab']))

[-1]


In [106]:
# Build a Vocab from an iterator.
# Token-frequency counting is done internally, so no separate Counter is needed.
# The iterator has to yield sequences of tokens, which is why split_lst is
# wrapped in a one-element list.
v2 = build_vocab_from_iterator(iterator=[split_lst], min_freq=2)
v2.get_itos()

['.',
 'opportunity',
 'to',
 'you',
 'the',
 '"',
 "'s",
 ',',
 'It',
 'That',
 'air',
 'is',
 'need',
 'of',
 'up',
 'what']

In [107]:
# Look up the indices of several tokens in a single call.
v2(['of', 'up'])

[13, 14]