In [3]:
import collections
import re
from d2l import torch as d2l


In [4]:
# Register the 'time_machine' dataset in d2l.DATA_HUB as a (URL, hash string) tuple.
# NOTE(review): the 40-hex-digit string is presumably the SHA-1 checksum that
# d2l.download verifies -- confirm against the d2l implementation.
#@save
d2l.DATA_HUB['time_machine'] = (d2l.DATA_URL + 'timemachine.txt',
                                '090b5e7e70c295757f55df93cb0a180b9691891a')

def read_time_machine():  #@save
    """Read the time machine text and return it as a list of cleaned lines.

    Each line has every run of non-letter characters replaced by a single
    space, is stripped of leading/trailing whitespace, and is lowercased.

    Returns:
        list[str]: one cleaned string per line of the downloaded file
        (lines without any letters become empty strings).
    """
    non_letters = re.compile('[^A-Za-z]+')
    with open(d2l.download('time_machine'), 'r') as f:
        # Iterating the file handle yields the same lines as readlines().
        return [non_letters.sub(' ', raw).strip().lower() for raw in f]

# Load the cleaned text lines, then show how many there are and two samples.
lines = read_time_machine()
print(f'# text lines: {len(lines)}')
print(lines[0])
print(lines[10])


# text lines: 3221
the time machine by h g wells
twinkled and his usually pale face was flushed and animated the


In [5]:
def tokenize(lines, token='word'):
    """Split every text line into tokens.

    Args:
        lines: iterable of strings, one per text line.
        token: 'word' to split each line on whitespace, 'char' to split each
            line into its individual characters.

    Returns:
        A list with one token list per input line. For any other value of
        ``token`` an error message is printed and ``None`` is returned.
    """
    # Guard clause: reject unsupported token types before doing any work.
    if token not in ('word', 'char'):
        print('错误，未知词元类型:' + token)
        return None
    if token == 'word':
        return [line.split() for line in lines]
    return [list(line) for line in lines]
# Word-level tokenization of every line; print the token lists of the first ten lines.
tokens = tokenize( lines , token='word' )
for i in range(10):
    print( tokens[i])

['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
[]
[]
[]
[]
['i']
[]
[]
['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him']
['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and']


In [6]:
len( tokens)# one token list per text line, so this matches the 3221 lines reported above

3221

In [7]:
# Counter tallies how many times each element occurs and returns a dict-like
# Counter object; the generator feeds it every token of every line in turn.
counters = collections.Counter(token for line in tokens for token in line)
counters

Counter({'the': 2261,
         'time': 200,
         'machine': 85,
         'by': 103,
         'h': 1,
         'g': 1,
         'wells': 9,
         'i': 1267,
         'traveller': 61,
         'for': 221,
         'so': 112,
         'it': 437,
         'will': 37,
         'be': 93,
         'convenient': 5,
         'to': 695,
         'speak': 6,
         'of': 1155,
         'him': 40,
         'was': 552,
         'expounding': 2,
         'a': 816,
         'recondite': 1,
         'matter': 6,
         'us': 35,
         'his': 129,
         'grey': 11,
         'eyes': 35,
         'shone': 8,
         'and': 1245,
         'twinkled': 1,
         'usually': 3,
         'pale': 10,
         'face': 38,
         'flushed': 2,
         'animated': 3,
         'fire': 30,
         'burned': 6,
         'brightly': 4,
         'soft': 16,
         'radiance': 1,
         'incandescent': 1,
         'lights': 1,
         'in': 541,
         'lilies': 1,
         'silver': 6,
  

In [8]:
sorted(counters.items()  , key= lambda x:x[1] ) # sorts the (token, count) pairs by count in ascending order

[('h', 1),
 ('g', 1),
 ('recondite', 1),
 ('twinkled', 1),
 ('radiance', 1),
 ('incandescent', 1),
 ('lights', 1),
 ('lilies', 1),
 ('bubbles', 1),
 ('glasses', 1),
 ('patents', 1),
 ('embraced', 1),
 ('submitted', 1),
 ('luxurious', 1),
 ('roams', 1),
 ('gracefully', 1),
 ('trammels', 1),
 ('precision', 1),
 ('lazily', 1),
 ('admired', 1),
 ('earnestness', 1),
 ('fecundity', 1),
 ('controvert', 1),
 ('universally', 1),
 ('school', 1),
 ('founded', 1),
 ('misconception', 1),
 ('argumentative', 1),
 ('accept', 1),
 ('nil', 1),
 ('plane', 1),
 ('abstractions', 1),
 ('instantaneous', 1),
 ('pensive', 1),
 ('extension', 1),
 ('duration', 1),
 ('infirmity', 1),
 ('draw', 1),
 ('distinction', 1),
 ('happens', 1),
 ('intermittently', 1),
 ('relight', 1),
 ('extensively', 1),
 ('accession', 1),
 ('cheerfulness', 1),
 ('mathematicians', 1),
 ('definable', 1),
 ('asking', 1),
 ('construct', 1),
 ('professor', 1),
 ('simon', 1),
 ('newcomb', 1),
 ('york', 1),
 ('similarly', 1),
 ('models', 1),
 (

In [9]:
# List every (token, count) pair, most frequent first.
counters.most_common()

[('the', 2261),
 ('i', 1267),
 ('and', 1245),
 ('of', 1155),
 ('a', 816),
 ('to', 695),
 ('was', 552),
 ('in', 541),
 ('that', 443),
 ('my', 440),
 ('it', 437),
 ('had', 354),
 ('me', 281),
 ('as', 270),
 ('at', 243),
 ('for', 221),
 ('with', 216),
 ('but', 204),
 ('time', 200),
 ('were', 158),
 ('this', 152),
 ('you', 137),
 ('on', 137),
 ('then', 134),
 ('his', 129),
 ('there', 127),
 ('he', 123),
 ('have', 122),
 ('they', 122),
 ('from', 122),
 ('one', 120),
 ('all', 118),
 ('not', 114),
 ('into', 114),
 ('upon', 113),
 ('little', 113),
 ('so', 112),
 ('is', 106),
 ('came', 105),
 ('by', 103),
 ('some', 94),
 ('be', 93),
 ('no', 92),
 ('could', 92),
 ('their', 91),
 ('said', 89),
 ('saw', 88),
 ('down', 87),
 ('them', 86),
 ('machine', 85),
 ('which', 85),
 ('very', 85),
 ('or', 84),
 ('an', 84),
 ('we', 82),
 ('now', 79),
 ('what', 77),
 ('been', 75),
 ('these', 74),
 ('like', 74),
 ('her', 74),
 ('out', 73),
 ('seemed', 72),
 ('up', 71),
 ('man', 70),
 ('about', 70),
 ('s', 70),
 

In [10]:
def count_corpus(tokens):
    """Count how often each token occurs in a corpus.

    Args:
        tokens: either a flat list of tokens or a list of token lists
            (one inner list per text line).

    Returns:
        collections.Counter: dict-like mapping of token -> frequency.
    """
    # An empty input or a list whose first element is a list is treated as
    # nested and flattened; anything else is counted as-is.
    is_nested = len(tokens) == 0 or isinstance(tokens[0], list)
    if not is_nested:
        return collections.Counter(tokens)
    flat = [tok for row in tokens for tok in row]
    return collections.Counter(flat)

class Vocab:
    """Vocabulary mapping tokens to integer indices that start at 0.

    Index 0 always belongs to the ``'<unknown>'`` token. The reserved tokens
    follow, then the corpus tokens in order of decreasing frequency.

    Attributes:
        token_freqs: list of ``(token, frequency)`` pairs, most frequent first.
        unknown: index returned for tokens that are not in the vocabulary (0).
        idx_to_token: list whose position ``i`` holds the token with index ``i``.
        token_to_idx: dict mapping each token to its index.
    """

    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        """Build the vocabulary from a tokenized corpus.

        Args:
            tokens: flat list of tokens or list of token lists; ``None`` is
                treated as an empty corpus.
            min_freq: corpus tokens with a frequency below this are left out.
            reserved_tokens: extra tokens placed right after ``'<unknown>'``;
                any iterable is accepted, ``None`` means no reserved tokens.
        """
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        # count_corpus returns a dict-like mapping of token -> frequency.
        counter = count_corpus(tokens)
        # Sort the (token, frequency) pairs by frequency, highest first.
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)

        # Index 0 is the catch-all for tokens missing from the vocabulary.
        self.unknown = 0
        uniq_tokens = ['<unknown>'] + list(reserved_tokens)
        # A set makes the "already present" check O(1) per token instead of
        # rescanning the growing list for every corpus token. Corpus tokens
        # are already unique (they are mapping keys), so only the unknown and
        # reserved tokens need to be excluded.
        taken = set(uniq_tokens)
        uniq_tokens += [token for token, freq in self.token_freqs
                        if freq >= min_freq and token not in taken]

        # The list gives index -> token; the dict gives token -> index.
        self.idx_to_token = uniq_tokens
        self.token_to_idx = {token: idx for idx, token in enumerate(uniq_tokens)}

    def __len__(self):
        """Return the number of entries in the vocabulary."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token, or a (nested) list/tuple of tokens, to indices.

        A single token yields one index (``self.unknown`` when the token is
        not in the vocabulary); a list or tuple yields a list of results.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unknown)
        # Recurse so that every element of the sequence is converted.
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index, or a (nested) list/tuple of indices, back to tokens.

        Raises:
            IndexError: if an index is outside the vocabulary.
        """
        # Must inspect ``indices`` (the argument); the previous code looked at
        # an unrelated name ``tokens`` and therefore never worked for scalars.
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.to_tokens(index) for index in indices]




In [11]:
# Build the word-level vocabulary and peek at its first ten entries.
vocab = Vocab(tokens)
print( vocab.idx_to_token[:10])
print(list(vocab.token_to_idx.items())[:10])# dict items/values/keys return view objects, so convert with list() before slicing
print( list( vocab.token_to_idx.values() )[:10])

['<unknown>', 'the', 'i', 'and', 'of', 'a', 'to', 'was', 'in', 'that']
[('<unknown>', 0), ('the', 1), ('i', 2), ('and', 3), ('of', 4), ('a', 5), ('to', 6), ('was', 7), ('in', 8), ('that', 9)]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [12]:
vocab = Vocab( tokens )# build the vocabulary
# Convert text lines 0 and 10 from tokens to their vocabulary indices
# (the 'word:' label is followed by the list of indices).
for i in [ 0 , 10 ]:
    print( 'word:', vocab[tokens[i]] )


word: [1, 19, 50, 40, 2183, 2184, 400]
word: [2186, 3, 25, 1044, 362, 113, 7, 1421, 3, 1045, 1]


## 所有功能全部整合：
1. 加载全部文本
2. 将文本进行词元化，生成二维列表
3. 处理好的词元列表放入Vocab里，内置有token_freqs( 降序的词元频率对)，idx_to_token(词元列表)，token_to_idx(词元索引字典)等属性
4. 将文本每一行的词元转换成索引

In [13]:
def load_corpus_time_machine(max_tokens=-1):
    """Return the character-level index corpus and vocabulary of the text.

    Args:
        max_tokens: when greater than 0, only the first ``max_tokens``
            indices of the corpus are returned; otherwise the whole corpus.

    Returns:
        tuple: ``(corpus, vocab)`` where ``corpus`` is a flat list of token
        indices and ``vocab`` is the ``Vocab`` built from the character tokens.
    """
    char_lines = tokenize(read_time_machine(), token='char')
    vocab = Vocab(char_lines)

    # Flatten line by line: every character is looked up individually.
    corpus = []
    for chars in char_lines:
        corpus.extend(vocab[ch] for ch in chars)

    if max_tokens > 0:
        return corpus[:max_tokens], vocab
    return corpus, vocab
    

In [24]:
corpus , vocab = load_corpus_time_machine( max_tokens= 10 )
corpus , list(vocab.token_to_idx.items())[:14] , tokenize( read_time_machine() , token= 'char' )[0][:10]
# The displayed result shows the mapping is correct: the second element is the first 14 entries of the token-to-index dict, and the third is the first ten character tokens of the first text line.

([3, 9, 2, 1, 3, 5, 13, 2, 1, 13],
 [('<unknown>', 0),
  (' ', 1),
  ('e', 2),
  ('t', 3),
  ('a', 4),
  ('i', 5),
  ('n', 6),
  ('o', 7),
  ('s', 8),
  ('h', 9),
  ('r', 10),
  ('d', 11),
  ('l', 12),
  ('m', 13)],
 ['t', 'h', 'e', ' ', 't', 'i', 'm', 'e', ' ', 'm'])