In [8]:
def read_data(path='./the_time_machine.txt', encoding='utf-8'):
    """Read a text file and return its non-empty lines, cleaned and lower-cased.

    Each line is normalised by replacing every run of characters that are not
    ASCII letters (digits, punctuation, whitespace, accented letters, ...)
    with a single space, stripping the ends and lower-casing the result.
    Lines that are empty after this normalisation are dropped.

    Args:
        path: Location of the text file. Defaults to the corpus file this
            notebook was written against, so ``read_data()`` keeps working.
        encoding: Text encoding used to decode the file. It is passed
            explicitly so the result does not depend on the platform's
            default locale encoding.

    Returns:
        list[str]: The cleaned, non-empty lines in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    import re  # kept local so the cell stays self-contained

    # Compile once instead of re-resolving the pattern for every line.
    non_letters = re.compile('[^A-Za-z]+')
    with open(path, 'r', encoding=encoding) as txt:
        # Iterate the file lazily; only the cleaned lines are materialised.
        cleaned = (non_letters.sub(' ', line).strip().lower() for line in txt)
        # A stripped string is falsy exactly when it is empty.
        return [line for line in cleaned if line]

In [9]:
# Load the cleaned corpus; read_data() returns the lower-cased, non-empty lines.
lines = read_data()

In [10]:
# Number of lines that survived cleaning (empty lines were dropped by read_data).
len(lines)

3093

In [13]:
def tokenize(lines, token='word'):
    """Split every line into a list of tokens.

    Args:
        lines: Iterable of strings to tokenize.
        token: ``'char'`` splits each line into its individual characters
            (spaces included); any other value splits on whitespace.

    Returns:
        list[list[str]]: One token list per input line, in input order.
    """
    if token == 'char':
        # Character level: every character becomes its own token.
        return [list(text) for text in lines]
    # Word level (also the fallback for any other `token` value).
    return [text.split() for text in lines]


In [15]:
class Vocab:
    """Bidirectional mapping between tokens and integer indices.

    Index 0 is always the unknown token ``'<unk>'``. The reserved tokens
    follow, and then the corpus tokens ordered by descending frequency.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        """Build the vocabulary from a corpus.

        Args:
            tokens: Flat sequence of tokens, or a sequence of token lists
                (one list per line). ``None`` or an empty sequence yields a
                vocabulary holding only ``'<unk>'`` and the reserved tokens.
            min_freq: Corpus tokens occurring fewer than this many times are
                left out and therefore map to the unknown index.
            reserved_tokens: Extra tokens (e.g. padding markers) placed right
                after ``'<unk>'``. Any iterable is accepted.
        """
        # None defaults avoid sharing one mutable list between calls.
        tokens = [] if tokens is None else tokens
        reserved_tokens = [] if reserved_tokens is None else list(reserved_tokens)

        counter = Vocab.count_corpus(tokens)
        # Sort tokens by frequency, most frequent first. sorted() is stable,
        # so equally frequent tokens keep their first-seen order.
        self.__token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)

        # index -> token list and token -> index dict, kept in lockstep.
        self.index_to_token = []
        self.token_to_index = {}
        # Skip repeats (including a reserved '<unk>') so that both mappings
        # stay consistent and '<unk>' keeps index 0.
        for token in ['<unk>'] + reserved_tokens:
            if token not in self.token_to_index:
                self.token_to_index[token] = len(self.index_to_token)
                self.index_to_token.append(token)

        for token, freq in self.__token_freqs:
            if freq >= min_freq and token not in self.token_to_index:
                self.token_to_index[token] = len(self.index_to_token)
                self.index_to_token.append(token)

    def __len__(self):
        """Return the number of distinct tokens, '<unk>' included."""
        return len(self.index_to_token)

    def __getitem__(self, tokens):
        """Look up the index of a token, or the indices of several tokens.

        A list or tuple is resolved element by element (recursively) and a
        list of indices is returned. Anything else is treated as a single
        token; tokens not in the vocabulary map to ``self.unk``.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_index.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    @property
    def unk(self):
        """Index of the unknown token."""
        return 0

    @property
    def token_freqs(self):
        """List of ``(token, count)`` pairs sorted by descending count."""
        return self.__token_freqs

    @staticmethod
    def count_corpus(tokens):
        """Count token occurrences in a flat or line-nested token sequence.

        Args:
            tokens: Flat sequence of tokens, or a sequence of token lists.
                Nesting is detected from the first element only, and only
                ``list`` counts as nesting so that tuple tokens (such as
                n-grams) can still be counted as flat tokens.

        Returns:
            collections.Counter: Mapping from token to occurrence count;
            empty when ``tokens`` is empty.
        """
        from collections import Counter

        # Guard the peek at tokens[0]: an empty corpus must not raise.
        if len(tokens) > 0 and isinstance(tokens[0], list):
            tokens = [token for line in tokens for token in line]
        return Counter(tokens)

In [18]:
# Split every line into word tokens (tokenize defaults to token='word'),
# then build the token <-> index vocabulary over the whole corpus.
tokens = tokenize(lines)
vocab = Vocab(tokens)

In [19]:
# (token, count) pairs for the whole corpus, most frequent first.
vocab.token_freqs

[('the', 2477),
 ('and', 1312),
 ('of', 1286),
 ('i', 1268),
 ('a', 877),
 ('to', 766),
 ('in', 606),
 ('was', 554),
 ('that', 458),
 ('it', 452),
 ('my', 441),
 ('had', 354),
 ('as', 281),
 ('me', 281),
 ('with', 264),
 ('at', 257),
 ('for', 247),
 ('you', 212),
 ('time', 211),
 ('but', 209),
 ('this', 199),
 ('or', 162),
 ('were', 158),
 ('on', 148),
 ('not', 142),
 ('from', 137),
 ('all', 136),
 ('then', 134),
 ('is', 129),
 ('have', 129),
 ('his', 129),
 ('there', 128),
 ('by', 126),
 ('he', 126),
 ('they', 124),
 ('one', 120),
 ('upon', 115),
 ('so', 114),
 ('into', 114),
 ('little', 114),
 ('be', 112),
 ('came', 107),
 ('no', 102),
 ('gutenberg', 98),
 ('some', 95),
 ('machine', 93),
 ('could', 93),
 ('an', 92),
 ('which', 92),
 ('we', 91),
 ('their', 91),
 ('said', 89),
 ('project', 88),
 ('saw', 88),
 ('down', 87),
 ('s', 86),
 ('very', 86),
 ('them', 86),
 ('now', 79),
 ('what', 78),
 ('these', 77),
 ('about', 77),
 ('any', 75),
 ('been', 75),
 ('her', 75),
 ('up', 74),
 ('out

In [22]:
# Show one tokenized line next to its index encoding; indexing the vocab
# with a list returns the list of corresponding indices.
tokens[666], vocab[tokens[666]]

(['that',
  'i',
  'noticed',
  'for',
  'the',
  'first',
  'time',
  'how',
  'warm',
  'the',
  'air',
  'was'],
 [9, 4, 518, 17, 1, 98, 19, 104, 698, 1, 199, 8])