In [38]:
import collections
import spacy


class Vocab:
    """
    Vocabulary mapping between tokens and integer indices (modelled on `torchtext.vocab`).

    Index 0 is always the unknown token ``'<unk>'``. The reserved tokens follow in
    the order given, and the corpus tokens come last, ordered by descending frequency.
    """

    def __init__(self, tokens=None, min_freq=2, reserved_tokens=None):
        """
        Build the vocabulary.

        Args:
            tokens: Iterable of tokens. Elements that are themselves lists (for
                example one list of tokens per sentence) are flattened by one level.
                ``None`` is treated as an empty corpus.
            min_freq: The minimum frequency needed to include a token in the vocabulary.
            reserved_tokens: User-defined special tokens (e.g. ``'<pad>'``) that are
                always included right after ``'<unk>'``. Any iterable is accepted and
                a single string counts as one token. Duplicates, including a repeated
                ``'<unk>'``, are ignored so every token keeps exactly one index.
        """
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        elif isinstance(reserved_tokens, str):
            # A bare string would otherwise be iterated character by character.
            reserved_tokens = [reserved_tokens]

        counter = collections.Counter(self._flatten(tokens))
        # Sort according to frequencies (stable, so ties keep first-seen order).
        self._token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)

        self.idx_to_token = []
        self.token_to_idx = {}
        # The index for the unknown token is 0, so it must be registered first.
        self._add('<unk>')
        for token in reserved_tokens:
            self._add(token)
        for token, freq in self._token_freqs:
            # The list is sorted by descending frequency, so everything after the
            # first too-rare token is too rare as well.
            if freq < min_freq:
                break
            self._add(token)

    @staticmethod
    def _flatten(tokens):
        """Yield the tokens one by one, expanding nested lists by one level."""
        for item in tokens:
            # Only `list` is expanded: tuples are hashable and may be real tokens.
            if isinstance(item, list):
                yield from item
            else:
                yield item

    def _add(self, token):
        """Give `token` the next free index unless it is already in the vocabulary."""
        if token not in self.token_to_idx:
            self.token_to_idx[token] = len(self.idx_to_token)
            self.idx_to_token.append(token)

    def __len__(self):
        """Return the number of entries, including ``'<unk>'`` and the reserved tokens."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """
        Map a token, or a (possibly nested) list/tuple of tokens, to indices.

        Tokens that are not in the vocabulary map to the unknown index.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """
        Map an index, or a (possibly nested) list/tuple of indices, to tokens.

        Raises:
            IndexError: If an index is outside the vocabulary.
        """
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        # Recurse so nested containers are handled the same way as in __getitem__.
        return [self.to_tokens(index) for index in indices]

    @property
    def unk(self):
        """Index for the unknown token"""
        return 0

    @property
    def token_freqs(self):
        """List of ``(token, frequency)`` pairs sorted by descending frequency."""
        return self._token_freqs

In [39]:
# Sample passage; punctuation is already separated by spaces.
# NOTE(review): the leading "ust" looks like a truncated "Just" - left unchanged
# so that the recorded outputs below remain valid. Confirm against the source text.
txt = (
    "ust as you need air to breathe , you need opportunity to succeed . "
    "It takes more than just breathing in the fresh air of opportunity , however . "
    "You must make use of that opportunity . "
    "That's not up to the opportunity . "
    "That's up to you . "
    "It doesn't matter what \" floor \" the opportunity is on . "
    "What matters is what you do with it ."
)

# Run the text through spaCy's small English pipeline and keep each token's raw text.
nlp_en = spacy.load('en_core_web_sm')
split_lst = list(map(lambda token: token.text, nlp_en(txt)))
split_lst

['ust',
 'as',
 'you',
 'need',
 'air',
 'to',
 'breathe',
 ',',
 'you',
 'need',
 'opportunity',
 'to',
 'succeed',
 '.',
 'It',
 'takes',
 'more',
 'than',
 'just',
 'breathing',
 'in',
 'the',
 'fresh',
 'air',
 'of',
 'opportunity',
 ',',
 'however',
 '.',
 'You',
 'must',
 'make',
 'use',
 'of',
 'that',
 'opportunity',
 '.',
 'That',
 "'s",
 'not',
 'up',
 'to',
 'the',
 'opportunity',
 '.',
 'That',
 "'s",
 'up',
 'to',
 'you',
 '.',
 'It',
 'does',
 "n't",
 'matter',
 'what',
 '"',
 'floor',
 '"',
 'the',
 'opportunity',
 'is',
 'on',
 '.',
 'What',
 'matters',
 'is',
 'what',
 'you',
 'do',
 'with',
 'it',
 '.']

In [40]:
# '<unk>': unknown token
# '<pad>': padding token
# '<sos>': start-of-sequence token
# '<eos>': end-of-sequence token
# min_freq=0 keeps every token from the text, including those seen only once.
src_vocab = Vocab(split_lst, min_freq=0, reserved_tokens=['<pad>', '<sos>', '<eos>'])
len(src_vocab)

48

In [41]:
# (token, count) pairs for every token in the text, sorted by descending count.
src_vocab.token_freqs

[('.', 7),
 ('opportunity', 5),
 ('you', 4),
 ('to', 4),
 ('the', 3),
 ('need', 2),
 ('air', 2),
 (',', 2),
 ('It', 2),
 ('of', 2),
 ('That', 2),
 ("'s", 2),
 ('up', 2),
 ('what', 2),
 ('"', 2),
 ('is', 2),
 ('ust', 1),
 ('as', 1),
 ('breathe', 1),
 ('succeed', 1),
 ('takes', 1),
 ('more', 1),
 ('than', 1),
 ('just', 1),
 ('breathing', 1),
 ('in', 1),
 ('fresh', 1),
 ('however', 1),
 ('You', 1),
 ('must', 1),
 ('make', 1),
 ('use', 1),
 ('that', 1),
 ('not', 1),
 ('does', 1),
 ("n't", 1),
 ('matter', 1),
 ('floor', 1),
 ('on', 1),
 ('What', 1),
 ('matters', 1),
 ('do', 1),
 ('with', 1),
 ('it', 1)]

In [42]:
# Token -> index mapping: '<unk>' is 0, then the reserved tokens, then the corpus
# tokens in descending order of frequency.
src_vocab.token_to_idx

{'<unk>': 0,
 '<pad>': 1,
 '<sos>': 2,
 '<eos>': 3,
 '.': 4,
 'opportunity': 5,
 'you': 6,
 'to': 7,
 'the': 8,
 'need': 9,
 'air': 10,
 ',': 11,
 'It': 12,
 'of': 13,
 'That': 14,
 "'s": 15,
 'up': 16,
 'what': 17,
 '"': 18,
 'is': 19,
 'ust': 20,
 'as': 21,
 'breathe': 22,
 'succeed': 23,
 'takes': 24,
 'more': 25,
 'than': 26,
 'just': 27,
 'breathing': 28,
 'in': 29,
 'fresh': 30,
 'however': 31,
 'You': 32,
 'must': 33,
 'make': 34,
 'use': 35,
 'that': 36,
 'not': 37,
 'does': 38,
 "n't": 39,
 'matter': 40,
 'floor': 41,
 'on': 42,
 'What': 43,
 'matters': 44,
 'do': 45,
 'with': 46,
 'it': 47}

In [43]:
# Index -> token: look up the token stored at index 5.
src_vocab.to_tokens(5)

'opportunity'

In [44]:
# Token -> index: reserved tokens sit directly after '<unk>' (index 0).
src_vocab['<pad>']

1

In [45]:
# Token -> index for an ordinary corpus token.
src_vocab['with']

46