https://www.cnblogs.com/wwj99/p/12503545.html
BPE 算法

GPT-2 模型在数据预处理时使用了字节对编码（Byte Pair Encoding，简称 BPE）方法，BPE 是一种能够解决未登录词问题，并减小词典大小的方法。它综合利用了单词层面编码和字符层面编码的优势，举例来说，我们要对下面的字符串编码，

aaabdaaabac
字节对 aa 出现的次数最多，所以我们将它替换成一个没在字符串中被用过的字符 Z ，

ZabdZabac
Z=aa
然后我们重复这个过程，用 Y 替换 ab ，

ZYdZYac
Y=ab
Z=aa
继续，用 X 替换 ZY ，

XdXac
X=ZY
Y=ab
Z=aa

In [0]:
import re
import collections

def get_stats(vocab):
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols)-1):
            pairs[symbols[i], symbols[i+1]] += freq  # 计算字节对出现频率
    return pairs


def merge_vocab(pair, v_in):
    v_out = {}
    bigram = re.escape(' '.join(pair))  # 将字节对中可解释为正则运算符的字符转义
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')  # 将要合并的字节对前后只能为空白字符
    for word in v_in:
        w_out = p.sub(''.join(pair), word)  # 合并符合条件的字节对
        v_out[w_out] = v_in[word]
    return v_out

vocab = {'l o w </w>': 5, 'l o w e r </w>': 2,
         'n e w e s t </w>': 6, 'w i d e s t </w>': 3}
num_merges = 10
for i in range(num_merges):
    pairs = get_stats(vocab)
    best = max(pairs, key=pairs.get)  # 选择频率最大的字节对
    vocab = merge_vocab(best, vocab)
    print(best)
print(vocab)


('e', 's')
('es', 't')
('est', '</w>')
('l', 'o')
('lo', 'w')
('n', 'e')
('ne', 'w')
('new', 'est</w>')
('low', '</w>')
('w', 'i')
{'low</w>': 5, 'low e r </w>': 2, 'newest</w>': 6, 'wi d est</w>': 3}


In [0]:
# top-k
import random

def select_top_k(predictions, k=10):
    predicted_index = random.choice(
        predictions[0, -1, :].sort(descending=True)[1][:10]).item()
    return predicted_index

In [0]:
# 使用在 PyTorch-Transformers 模型库中封装好的 GPT2Tokenizer() 和 GPT2LMHeadModel()
# 安装 PyTorch-Transformers
!pip install pytorch_transformers==1.0
# !pip install pytorch_transformers==1.0 -i  https://pypi.tuna.tsinghua.edu.cn/simple/

Collecting pytorch_transformers==1.0
[?25l  Downloading https://files.pythonhosted.org/packages/40/b5/2d78e74001af0152ee61d5ad4e290aec9a1e43925b21df2dc74ec100f1ab/pytorch_transformers-1.0.0-py3-none-any.whl (137kB)
[K     |██▍                             | 10kB 27.2MB/s eta 0:00:01[K     |████▊                           | 20kB 3.0MB/s eta 0:00:01[K     |███████▏                        | 30kB 4.4MB/s eta 0:00:01[K     |█████████▌                      | 40kB 3.0MB/s eta 0:00:01[K     |████████████                    | 51kB 3.6MB/s eta 0:00:01[K     |██████████████▎                 | 61kB 4.3MB/s eta 0:00:01[K     |████████████████▊               | 71kB 5.0MB/s eta 0:00:01[K     |███████████████████             | 81kB 3.9MB/s eta 0:00:01[K     |█████████████████████▌          | 92kB 4.4MB/s eta 0:00:01[K     |███████████████████████▉        | 102kB 4.8MB/s eta 0:00:01[K     |██████████████████████████▎     | 112kB 4.8MB/s eta 0:00:01[K     |███████████████████████

In [0]:
# https://huggingface.co/transformers/pretrained_models.html?highlight=bert%20base
# bert-base-chinese

In [0]:
import torch
from pytorch_transformers import BertModel, GPT2Model, GPT2Tokenizer

import logging
logging.basicConfig(level=logging.INFO)

# 载入预训练模型的分词器
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# 使用 GPT2Tokenizer 对输入进行编码
# text = "Yesterday, a man named Jack said he saw an alien,"
text = "昨天, 一个名叫杰克的人说他看到了一头狮子,"
indexed_tokens = tokenizer.encode(text)
tokens_tensor = torch.tensor([indexed_tokens])
tokens_tensor.shape

INFO:pytorch_transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json not found in cache, downloading to /tmp/tmpe3ttrjl6
100%|██████████| 1042301/1042301 [00:00<00:00, 2774156.88B/s]
INFO:pytorch_transformers.file_utils:copying /tmp/tmpe3ttrjl6 to cache at /root/.cache/torch/pytorch_transformers/f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
INFO:pytorch_transformers.file_utils:creating metadata file for /root/.cache/torch/pytorch_transformers/f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
INFO:pytorch_transformers.file_utils:removing temp file /tmp/tmpe3ttrjl6
INFO:pytorch_transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt not found in cache, downloading to /tmp/tmppvrrgn42
100%|██████████| 456318/456318 [00:00<00:00, 1764624.48B/s]
INFO:pytorch_

torch.Size([1, 35])

In [0]:
print(indexed_tokens)


[23626, 101, 25465, 11, 220, 31660, 10310, 103, 28938, 235, 20998, 104, 30266, 108, 17739, 233, 21410, 21689, 46237, 112, 20015, 244, 40367, 233, 26344, 108, 12859, 228, 31660, 13783, 112, 45379, 106, 36310, 11]


In [0]:
from pytorch_transformers import GPT2LMHeadModel

# 读取 GPT-2 预训练模型
# model = GPT2LMHeadModel.from_pretrained("./")
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()

total_predicted_text = text
n = 100  # 预测过程的循环次数
for _ in range(n):
    with torch.no_grad():
        outputs = model(tokens_tensor)
        predictions = outputs[0]

    predicted_index = select_top_k(predictions, k=10)
    predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])
    total_predicted_text += tokenizer.decode(predicted_index)

    if '<|endoftext|>' in total_predicted_text:
        # 如果出现文本结束标志，就结束文本生成
        break

    indexed_tokens += [predicted_index]
    tokens_tensor = torch.tensor([indexed_tokens])

print(total_predicted_text)

INFO:pytorch_transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json not found in cache, downloading to /tmp/tmppn7o5v9f
100%|██████████| 224/224 [00:00<00:00, 56312.88B/s]
INFO:pytorch_transformers.file_utils:copying /tmp/tmppn7o5v9f to cache at /root/.cache/torch/pytorch_transformers/4be02c5697d91738003fb1685c9872f284166aa32e061576bbe6aaeb95649fcf.699bbd1c449e9861456f359d6daa51bd523ac085b4b531ab0aad5a55d091e942
INFO:pytorch_transformers.file_utils:creating metadata file for /root/.cache/torch/pytorch_transformers/4be02c5697d91738003fb1685c9872f284166aa32e061576bbe6aaeb95649fcf.699bbd1c449e9861456f359d6daa51bd523ac085b4b531ab0aad5a55d091e942
INFO:pytorch_transformers.file_utils:removing temp file /tmp/tmppn7o5v9f
INFO:pytorch_transformers.modeling_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json from cache at /root/.cache/torch/pytorch_transformers/4be02c5697d91738003fb1685c9872f284166aa32e061576

昨天, 一个名叫杰克的人说他看到了一头狮子,������的��。����不�����的�������������,���������������,不���������的�������,������的������������的人�������,��


微调生成戏剧文本
接下来，我们将使用一些戏剧剧本对 GPT-2 进行微调。由于 OpenAI 团队开源的 GPT-2 模型预训练参数为使用英文数据集预训练后得到的，虽然可以在微调时使用中文数据集，但需要大量数据和时间才会有好的效果，所以这里我们使用了英文数据集进行微调，从而更好地展现 GPT-2 模型的能力。

首先，下载训练数据集，这里使用了莎士比亚的戏剧作品《罗密欧与朱丽叶》作为训练样本。数据集已经提前下载好并放在云盘中，链接：https://pan.baidu.com/s/1LiTgiake1KC8qptjRncJ5w 提取码：km06

In [0]:

from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My\ Drive/Colab Notebooks/Lesson12

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive
/content/drive/My Drive/Colab Notebooks/Lesson12


In [0]:
# with open('./Datas/romeo_and_juliet.txt', 'r') as f:星辰变
with open('./Datas/星辰变.txt', 'r') as f:
    dataset = f.read()

print(len(dataset))

dataset = dataset[:len(dataset)//50]

# 预处理训练集，将训练集编码、分段。
indexed_text = tokenizer.encode(dataset)
del(dataset)

dataset_cut = []
for i in range(len(indexed_text)//512):
    # 将字符串分段成长度为 512
    dataset_cut.append(indexed_text[i*512:i*512+512])
del(indexed_text)

dataset_tensor = torch.tensor(dataset_cut)
dataset_tensor.shape




# 这里使用 PyTorch 提供的 DataLoader() 构建训练集数据集表示，使用 TensorDataset() 构建训练集数据迭代器。
from torch.utils.data import DataLoader, TensorDataset

# 构建数据集和数据迭代器，设定 batch_size 大小为 2
train_set = TensorDataset(dataset_tensor,
                          dataset_tensor)  # 标签与样本数据相同
train_loader = DataLoader(dataset=train_set,
                          batch_size=2,
                          shuffle=False)
print(train_loader)


# 检查是否机器有 GPU，如果有就在 GPU 运行，否则就在 CPU 运行。
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('device: ', device)

# 开始训练
from torch import nn
from torch.autograd import Variable
import time

pre = time.time()

epoch = 300  # 循环学习 30 次

model.to(device)
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)  # 定义优化器

for i in range(epoch):
    total_loss = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = Variable(data).to(device), Variable(
            target).to(device)

        optimizer.zero_grad()

        loss, logits, _ = model(data, labels=target)

        total_loss += loss

        loss.backward()
        optimizer.step()

        if batch_idx == len(train_loader)-1:
            # 在每个 Epoch 的最后输出一下结果
            print(i, 'average loss:', total_loss/len(train_loader))

print('训练时间：', time.time()-pre)



2972704
<torch.utils.data.dataloader.DataLoader object at 0x7fd7ac2919e8>
device:  cuda
0 average loss: tensor(1.5146, device='cuda:0', grad_fn=<DivBackward0>)
1 average loss: tensor(1.4992, device='cuda:0', grad_fn=<DivBackward0>)
2 average loss: tensor(1.4889, device='cuda:0', grad_fn=<DivBackward0>)
3 average loss: tensor(1.4771, device='cuda:0', grad_fn=<DivBackward0>)
4 average loss: tensor(1.4663, device='cuda:0', grad_fn=<DivBackward0>)
5 average loss: tensor(1.4555, device='cuda:0', grad_fn=<DivBackward0>)
6 average loss: tensor(1.4458, device='cuda:0', grad_fn=<DivBackward0>)
7 average loss: tensor(1.4365, device='cuda:0', grad_fn=<DivBackward0>)
8 average loss: tensor(1.4215, device='cuda:0', grad_fn=<DivBackward0>)
9 average loss: tensor(1.4130, device='cuda:0', grad_fn=<DivBackward0>)
10 average loss: tensor(1.4040, device='cuda:0', grad_fn=<DivBackward0>)
11 average loss: tensor(1.3914, device='cuda:0', grad_fn=<DivBackward0>)
12 average loss: tensor(1.3807, device='cuda:0

In [0]:
# 训练结束后，可以使模型生成文本，观察输出。
text = "秦羽从府邸中出来，急速向南飞去"  # 这里也可以输入不同的英文文本

In [0]:
indexed_tokens = tokenizer.encode(text)
tokens_tensor = torch.tensor([indexed_tokens])

model.eval()
total_predicted_text = text

# 使训练后的模型进行 500 次预测
for _ in range(500):
    tokens_tensor = tokens_tensor.to('cuda')

    with torch.no_grad():
        outputs = model(tokens_tensor)
        predictions = outputs[0]

    predicted_index = select_top_k(predictions, k=10)

    predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])
    total_predicted_text += tokenizer.decode(predicted_index)
    if '<|endoftext|>' in total_predicted_text:
        # 如果出现文本结束标志，就结束文本生成
        break

    indexed_tokens += [predicted_index]

    if len(indexed_tokens) > 1023:
        # 模型最长输入长度为1024，如果长度过长则截断
        indexed_tokens = indexed_tokens[-1023:]

    tokens_tensor = torch.tensor([indexed_tokens])

print(total_predicted_text)

秦羽从府邸中出来，急速向南飞去��������������� �������������������������������。������������。大�����������������������。
�����������������的����������������一�����������上���������������������不����……�������一����������生。��������大������一����之���������三�����������天天��������天生��������……��������大�����一�����������������……�������一�����������������������������的�����������������……

------------------------


�������~~~~���������之������~~��������一�����������������������������������������之����人��������~~ST�����������……
��������������一�������的����

��

�� ��������不��

�����一�������
