<a href="https://colab.research.google.com/github/forMwish/MyDeepLearn/blob/master/d2l_8_2_TextPreprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. 准备

In [1]:
# Mount Google Drive and switch into the working directory for this notebook
from google.colab import drive
import os

# Mount point for Google Drive inside the Colab VM.
gdrive_path = '/gdrive'
drive.mount(gdrive_path, force_remount=True)

os.chdir("%s/MyDrive"%gdrive_path)

# Create the per-notebook working directory if needed, then enter it.
# exist_ok=True tolerates an already-existing directory without a bare
# `except:` that would also hide unrelated failures (e.g. permissions).
name = "d2l_8.2"
os.makedirs(f"./{name}", exist_ok=True)
os.chdir(f"./{name}")

# Install d2l
os.system("pip install d2l==0.17.5")

# Work around matplotlib issues by pinning an older release.
# `-y` is required: os.system runs pip non-interactively, so the
# confirmation prompt of a plain `pip uninstall` cannot be answered.
os.system("pip uninstall -y matplotlib")
os.system("pip install matplotlib==3.1.3")

Mounted at /gdrive


0

In [2]:
# 其它配置
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random

# Notebook tab-completion setting: turn off the jedi-based completer
%config Completer.use_jedi = False

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("use device:", device)

# Use the default pyplot style (the dark style is kept below, commented out)
plt.style.use("default")
# plt.style.use("dark_background")

# Fix the PyTorch random seed (and the seed of every CUDA device, if any)
torch.manual_seed(0)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(0)

# Fix the NumPy random seed
np.random.seed(0)

# Fix the seed of Python's built-in random module
random.seed(0)

use device: cpu


  if __name__ == '__main__':


# 2. 开始

In [3]:
import collections
import re
from d2l import torch as d2l

# Register the Time Machine text file under the 'time_machine' key so that
# d2l.download('time_machine') can locate it.
# NOTE(review): the second tuple element is presumably the file's checksum
# used by d2l to validate the download — confirm against d2l.download.
d2l.DATA_HUB['time_machine'] = (d2l.DATA_URL + 'timemachine.txt',
                                '090b5e7e70c295757f55df93cb0a180b9691891a')

# 读取数据集
def read_time_machine():
    """Load the Time Machine dataset as lists of text lines.

    Returns:
        A ``(cleaned, raw)`` tuple. ``raw`` holds the lines exactly as read
        from the downloaded file. ``cleaned`` holds the same lines with each
        run of non-letter characters replaced by a single space, then
        stripped of surrounding whitespace and lower-cased.
    """
    path = d2l.download('time_machine')
    with open(path, 'r') as f:
        raw_lines = f.readlines()

    non_letters = re.compile('[^A-Za-z]+')
    cleaned = []
    for raw in raw_lines:
        cleaned.append(non_letters.sub(' ', raw).strip().lower())
    return cleaned, raw_lines

lines, lines_orig = read_time_machine()
print(f'# 文本总行数: {len(lines)}')
# Show the cleaned line next to its raw counterpart for two sample lines.
for sample_idx in (0, 10):
    print(lines[sample_idx],"\n", lines_orig[sample_idx])

# Scratch experiments with the substitution regex, kept for reference:
# print(lines_orig[0])
# print(re.sub('[^A-Za-z]+', '1', lines_orig[0]))
# print(re.sub('[^A-Za-z]+', '1', lines_orig[0]).strip("1T"))

# 文本总行数: 3221
the time machine by h g wells 
 The Time Machine, by H. G. Wells [1898]

twinkled and his usually pale face was flushed and animated the 
 twinkled, and his usually pale face was flushed and animated. The



In [11]:
# 词元化（tokenize）
def tokenize(lines, token='word'):
    """Split text lines into word or character tokens.

    Args:
        lines: Iterable of strings, one per text line.
        token: ``'word'`` to split each line on whitespace, or ``'char'``
            to split each line into its individual characters.

    Returns:
        A list with one token list per input line.

    Raises:
        ValueError: If ``token`` is neither ``'word'`` nor ``'char'``.
    """
    if token == 'word':
        return [line.split() for line in lines]
    if token == 'char':
        return [list(line) for line in lines]
    # Fail loudly: silently returning None only surfaces later as an
    # unrelated error in whatever consumes the tokens.
    raise ValueError(
        f"unknown token type: {token!r} (expected 'word' or 'char')")

tokens = tokenize(lines)
# Preview the first 11 tokenized lines; slicing (rather than indexing
# 0..10) also works when the corpus has fewer than 11 lines.
for line_tokens in tokens[:11]:
    print(line_tokens)

['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
[]
[]
[]
[]
['i']
[]
[]
['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him']
['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and']
['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']


In [12]:
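# Vocabulary:
#    Maps every unique token in the corpus to an integer index, assigned
#    according to how often the token occurs.
#    Rarely occurring tokens are usually dropped to reduce complexity.
#    <unk>: any token that is absent from the corpus or has been dropped
#    <pad>: padding token, used to pad sequences to a common length
#    <bos>: beginning-of-sequence token
#    <eos>: end-of-sequence token
# Corpus:
#    All documents of the training set merged together; counting its unique
#    tokens yields the token frequencies.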

def count_corpus(tokens):
    """Count how often each token occurs.

    Args:
        tokens: Either a flat list of tokens or a list of token lists
            (one inner list per text line).

    Returns:
        A ``collections.Counter`` mapping each token to its frequency.
    """
    # The input counts as 2D when it is empty or its first element is a
    # list; in that case flatten it into a single list of tokens first.
    if len(tokens) == 0 or isinstance(tokens[0], list):
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)

class Vocab:
    """Text vocabulary mapping between tokens and integer indices.

    Index 0 is always ``'<unk>'``. The reserved tokens follow, and then the
    corpus tokens in order of descending frequency.
    """
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        """Build the vocabulary.

        Args:
            tokens: Flat list of tokens or list of token lists; ``None``
                is treated as an empty corpus.
            min_freq: Tokens occurring fewer than this many times are
                left out of the vocabulary.
            reserved_tokens: Extra tokens placed right after ``'<unk>'``;
                ``None`` means no reserved tokens.
        """
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        # Sort the tokens by descending frequency
        self.counter = count_corpus(tokens)
        self._token_freqs = sorted(self.counter.items(), key=lambda x: x[1], reverse=True)

        # The unknown token gets index 0, followed by the reserved tokens.
        # list(...) lets callers pass any sequence, not only a list.
        self.idx_to_token = ['<unk>'] + list(reserved_tokens)
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}
        for token, freq in self._token_freqs:
            # Frequencies are sorted in descending order, so every
            # remaining token is below the threshold as well.
            if freq < min_freq:
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        """Return the number of tokens in the vocabulary."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token, or a (possibly nested) list/tuple of tokens, to indices.

        A token that is not in the vocabulary maps to ``self.unk``.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        # Recurse so nested sequences of tokens are handled too.
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index, or a list/tuple of indices, back to tokens."""
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property  # decorator: lets the method below be read like an attribute
    def unk(self):
        """Index of the unknown token."""
        return 0

    @property
    def token_freqs(self):
        """List of ``(token, frequency)`` pairs sorted by descending frequency."""
        return self._token_freqs


vocab = Vocab(tokens)
# Print the 12 most frequent entries twice: once from
# Counter.most_common() and once from the vocabulary's own sorted list,
# to show that most_common() already returns the data sorted.
for counted, stored in zip(vocab.counter.most_common()[:12], vocab._token_freqs[:12]):
    print(counted)
    print(stored)

('the', 2261)
('the', 2261)
('i', 1267)
('i', 1267)
('and', 1245)
('and', 1245)
('of', 1155)
('of', 1155)
('a', 816)
('a', 816)
('to', 695)
('to', 695)
('was', 552)
('was', 552)
('in', 541)
('in', 541)
('that', 443)
('that', 443)
('my', 440)
('my', 440)
('it', 437)
('it', 437)
('had', 354)
('had', 354)


In [17]:
# 整合功能

def load_corpus_time_machine(max_tokens=-1):
    """Return the token-index corpus and the vocabulary of the Time Machine text.

    The text is tokenized at character level. Along the way the function
    prints a preview of the tokens, of the most frequent vocabulary
    entries, and of the resulting index list.

    Args:
        max_tokens: When positive, the corpus is truncated to this many
            entries; otherwise the whole corpus is returned.

    Returns:
        A ``(corpus, vocab)`` tuple: the flat list of token indices and
        the ``Vocab`` built from the text.
    """
    text_lines, _ = read_time_machine()
    tokens = tokenize(text_lines, "char")
    print(f"tokens:{tokens[:100]}")

    vocab = Vocab(tokens)
    print(vocab.token_freqs[:10])

    # A text line is not necessarily a sentence or a paragraph, so all
    # lines are flattened into one long list of indices.
    corpus = []
    for line in tokens:
        for symbol in line:
            corpus.append(vocab[symbol])
    print(corpus[:100])

    if max_tokens > 0:
        return corpus[:max_tokens], vocab
    return corpus, vocab

corpus, vocab = load_corpus_time_machine()
# A bare expression is only displayed when it is the last statement of a
# notebook cell, so the sizes are printed explicitly here.
print(len(corpus), len(vocab))
print(vocab.idx_to_token[:10])

tokens:[['t', 'h', 'e', ' ', 't', 'i', 'm', 'e', ' ', 'm', 'a', 'c', 'h', 'i', 'n', 'e', ' ', 'b', 'y', ' ', 'h', ' ', 'g', ' ', 'w', 'e', 'l', 'l', 's'], [], [], [], [], ['i'], [], [], ['t', 'h', 'e', ' ', 't', 'i', 'm', 'e', ' ', 't', 'r', 'a', 'v', 'e', 'l', 'l', 'e', 'r', ' ', 'f', 'o', 'r', ' ', 's', 'o', ' ', 'i', 't', ' ', 'w', 'i', 'l', 'l', ' ', 'b', 'e', ' ', 'c', 'o', 'n', 'v', 'e', 'n', 'i', 'e', 'n', 't', ' ', 't', 'o', ' ', 's', 'p', 'e', 'a', 'k', ' ', 'o', 'f', ' ', 'h', 'i', 'm'], ['w', 'a', 's', ' ', 'e', 'x', 'p', 'o', 'u', 'n', 'd', 'i', 'n', 'g', ' ', 'a', ' ', 'r', 'e', 'c', 'o', 'n', 'd', 'i', 't', 'e', ' ', 'm', 'a', 't', 't', 'e', 'r', ' ', 't', 'o', ' ', 'u', 's', ' ', 'h', 'i', 's', ' ', 'g', 'r', 'e', 'y', ' ', 'e', 'y', 'e', 's', ' ', 's', 'h', 'o', 'n', 'e', ' ', 'a', 'n', 'd'], ['t', 'w', 'i', 'n', 'k', 'l', 'e', 'd', ' ', 'a', 'n', 'd', ' ', 'h', 'i', 's', ' ', 'u', 's', 'u', 'a', 'l', 'l', 'y', ' ', 'p', 'a', 'l', 'e', ' ', 'f', 'a', 'c', 'e', ' ', 'w',