# 1 字节对编码

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
pwd

'/content'

In [3]:
cd /content/drive/My Drive/my_code/GPT2/GPT2-and-text-generation

/content/drive/My Drive/my_code/GPT2/GPT2-and-text-generation


In [4]:
ls

config.json                     pytorch_model.bin
GPT2-and-text-generation.ipynb  romeo_and_juliet.txt


In [5]:
import re
import collections


def get_stats(vocab):
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols)-1):
            pairs[symbols[i], symbols[i+1]] += freq  # 计算字节对出现频率
    return pairs


def merge_vocab(pair, v_in):
    v_out = {}
    bigram = re.escape(' '.join(pair))  # 将字节对中可解释为正则运算符的字符转义
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')  # 将要合并的字节对前后只能为空白字符
    for word in v_in:
        w_out = p.sub(''.join(pair), word)  # 合并符合条件的字节对
        v_out[w_out] = v_in[word]
    return v_out

In [6]:
vocab = {'l o w </w>': 5, 'l o w e r </w>': 2,
         'n e w e s t </w>': 6, 'w i d e s t </w>': 3}


In [7]:
get_stats(vocab)

defaultdict(int,
            {('d', 'e'): 3,
             ('e', 'r'): 2,
             ('e', 's'): 9,
             ('e', 'w'): 6,
             ('i', 'd'): 3,
             ('l', 'o'): 7,
             ('n', 'e'): 6,
             ('o', 'w'): 7,
             ('r', '</w>'): 2,
             ('s', 't'): 9,
             ('t', '</w>'): 9,
             ('w', '</w>'): 5,
             ('w', 'e'): 8,
             ('w', 'i'): 3})

In [8]:
num_merges = 10
for i in range(num_merges):
    pairs = get_stats(vocab)
    best = max(pairs, key=pairs.get)  # 选择频率最大的字节对
    vocab = merge_vocab(best, vocab)
    print(best)

('e', 's')
('es', 't')
('est', '</w>')
('l', 'o')
('lo', 'w')
('n', 'e')
('ne', 'w')
('new', 'est</w>')
('low', '</w>')
('w', 'i')


In [9]:
vocab

{'low e r </w>': 2, 'low</w>': 5, 'newest</w>': 6, 'wi d est</w>': 3}

# 2 top-k 实现

In [10]:
import random

def select_top_k(predictions, k=10):
    predicted_index = random.choice(
        predictions[0, -1, :].sort(descending=True)[1][:10]).item()
    return predicted_index

# 3 预训练模型生成新闻

In [11]:
import torch
from pytorch_transformers import GPT2Tokenizer

import logging
logging.basicConfig(level=logging.INFO)

In [12]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

INFO:pytorch_transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json not found in cache or force_download set to True, downloading to /tmp/tmprrs3urgt
100%|██████████| 1042301/1042301 [00:00<00:00, 7066834.01B/s]
INFO:pytorch_transformers.file_utils:copying /tmp/tmprrs3urgt to cache at /root/.cache/torch/pytorch_transformers/f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
INFO:pytorch_transformers.file_utils:creating metadata file for /root/.cache/torch/pytorch_transformers/f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
INFO:pytorch_transformers.file_utils:removing temp file /tmp/tmprrs3urgt
INFO:pytorch_transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt not found in cache or force_download set to True, downloading to /tmp/tmpaaib9bsy
100%|████████

In [13]:
text = "guobing, you are so fat,"
indexed_tokens = tokenizer.encode(text)
indexed_tokens

[915, 672, 278, 11, 345, 389, 523, 3735, 11]

In [14]:
tokens_tensor = torch.tensor([indexed_tokens])
tokens_tensor.shape

torch.Size([1, 9])

In [15]:
from pytorch_transformers import GPT2LMHeadModel

# 读取 GPT-2 预训练模型
model = GPT2LMHeadModel.from_pretrained("./")

INFO:pytorch_transformers.modeling_utils:loading configuration file ./config.json
INFO:pytorch_transformers.modeling_utils:Model config {
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "finetuning_task": null,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 1024,
  "num_labels": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "pruned_heads": {},
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torchscript": false,
  "vocab_size": 50257
}

INFO:pytorch_transformers.modeling_utils:loading weights file ./pytorch_model.bin


In [16]:
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [17]:
tokens_tensor.shape

torch.Size([1, 9])

In [18]:
text = "guobing, you are so fat,"

In [19]:
total_predicted_text = text
n = 100  # 预测过程的循环次数
for _ in range(n):
    with torch.no_grad():
        outputs = model(tokens_tensor)
        predictions = outputs[0]

    predicted_index = select_top_k(predictions, k=10)
    predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])
    total_predicted_text += tokenizer.decode(predicted_index)

    if '<|endoftext|>' in total_predicted_text:
        # 如果出现文本结束标志，就结束文本生成
        break

    indexed_tokens += [predicted_index]
    tokens_tensor = torch.tensor([indexed_tokens])

print(total_predicted_text)

guobing, you are so fat, that it takes a bit. So, I am very lucky that this year has had this big and wonderful opportunity, which means so I can take the risk with you." The first few hours on the bike are hard and hard but the second is just too long so it doesn`nt feel like there has even been anything on. "When a big mountain is coming to a place, we need to go. We don `d go out with no bikes at the top, we just want it there


In [20]:
with open('./romeo_and_juliet.txt', 'r') as f:
    dataset = f.read()

len(dataset)

138150

In [21]:
indexed_text = tokenizer.encode(dataset)
del(dataset)

dataset_cut = []
for i in range(len(indexed_text)//512):
    # 将字符串分段成长度为 512
    dataset_cut.append(indexed_text[i*512:i*512+512])
del(indexed_text)

dataset_tensor = torch.tensor(dataset_cut)
dataset_tensor.shape



torch.Size([81, 512])

In [22]:
from torch.utils.data import DataLoader, TensorDataset

# 构建数据集和数据迭代器，设定 batch_size 大小为 2
train_set = TensorDataset(dataset_tensor,
                          dataset_tensor)  # 标签与样本数据相同
train_loader = DataLoader(dataset=train_set,
                          batch_size=2,
                          shuffle=False)
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7fbffb389e10>

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [24]:
from torch import nn
from torch.autograd import Variable
import time

pre = time.time()

epoch = 30  # 循环学习 30 次

model.to(device)
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)  # 定义优化器

for i in range(epoch):
    total_loss = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = Variable(data).to(device), Variable(
            target).to(device)

        optimizer.zero_grad()

        loss, logits, _ = model(data, labels=target)

        total_loss += loss

        loss.backward()
        optimizer.step()

        if batch_idx == len(train_loader)-1:
            # 在每个 Epoch 的最后输出一下结果
            print('average loss:', total_loss/len(train_loader))

print('训练时间：', time.time()-pre)

average loss: tensor(4.0316, device='cuda:0', grad_fn=<DivBackward0>)
average loss: tensor(3.8470, device='cuda:0', grad_fn=<DivBackward0>)
average loss: tensor(3.7315, device='cuda:0', grad_fn=<DivBackward0>)
average loss: tensor(3.6476, device='cuda:0', grad_fn=<DivBackward0>)
average loss: tensor(3.5738, device='cuda:0', grad_fn=<DivBackward0>)
average loss: tensor(3.5079, device='cuda:0', grad_fn=<DivBackward0>)
average loss: tensor(3.4507, device='cuda:0', grad_fn=<DivBackward0>)
average loss: tensor(3.3989, device='cuda:0', grad_fn=<DivBackward0>)
average loss: tensor(3.3539, device='cuda:0', grad_fn=<DivBackward0>)
average loss: tensor(3.3105, device='cuda:0', grad_fn=<DivBackward0>)
average loss: tensor(3.2676, device='cuda:0', grad_fn=<DivBackward0>)
average loss: tensor(3.2294, device='cuda:0', grad_fn=<DivBackward0>)
average loss: tensor(3.1954, device='cuda:0', grad_fn=<DivBackward0>)
average loss: tensor(3.1571, device='cuda:0', grad_fn=<DivBackward0>)
average loss: tensor

In [25]:
text = "From fairest creatures we desire"  # 这里也可以输入不同的英文文本
indexed_tokens = tokenizer.encode(text)
tokens_tensor = torch.tensor([indexed_tokens])

In [26]:
model.eval()
total_predicted_text = text

# 使训练后的模型进行 500 次预测
for _ in range(500):
    tokens_tensor = tokens_tensor.to('cuda')

    with torch.no_grad():
        outputs = model(tokens_tensor)
        predictions = outputs[0]

    predicted_index = select_top_k(predictions, k=10)

    predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])
    total_predicted_text += tokenizer.decode(predicted_index)
    if '<|endoftext|>' in total_predicted_text:
        # 如果出现文本结束标志，就结束文本生成
        break

    indexed_tokens += [predicted_index]

    if len(indexed_tokens) > 1023:
        # 模型最长输入长度为1024，如果长度过长则截断
        indexed_tokens = indexed_tokens[-1023:]

    tokens_tensor = torch.tensor([indexed_tokens])

print(total_predicted_text)

From fairest creatures we desire: let it, and it not stand alone:
And if it please the eyes of mortals, we have but the bare
wood that they can use. Thus I'll set up our
assembly; for this place lies so high above all other encampment in Tyne
That if men do wrong and trespass, it may afford remedy, and death: hence we encamp!
ROMAN'S LA
I'll not leave you here in thy bed: I'll bring thy sword and arrowhead; they're not yet
done, but you may be well, as thou livest, to fight for these murders: hence forth thou art done with thimble
to-dower; the poison I gave my master in this desperate state is to kill thyself; thou shalt have no need for him:
But go along hence; I must bring him back to Tyche. He will die young: for that I am,
you must take it from the morgol. Romeo will have thy blade with her. What? I have not kill thee yet, for that will bring thee
some pleasure. O God I love you--love--love!--you have slain this tyrant and Romeo that thou hast slain with
such fury and murder that 

In [27]:
text = "guobing, you are so fat,"
indexed_tokens = tokenizer.encode(text)
tokens_tensor = torch.tensor([indexed_tokens])
model.eval()
total_predicted_text = text

# 使训练后的模型进行 500 次预测
for _ in range(500):
    tokens_tensor = tokens_tensor.to('cuda')

    with torch.no_grad():
        outputs = model(tokens_tensor)
        predictions = outputs[0]

    predicted_index = select_top_k(predictions, k=10)

    predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])
    total_predicted_text += tokenizer.decode(predicted_index)
    if '<|endoftext|>' in total_predicted_text:
        # 如果出现文本结束标志，就结束文本生成
        break

    indexed_tokens += [predicted_index]

    if len(indexed_tokens) > 1023:
        # 模型最长输入长度为1024，如果长度过长则截断
        indexed_tokens = indexed_tokens[-1023:]

    tokens_tensor = torch.tensor([indexed_tokens])

print(total_predicted_text)

guobing, you are so fat, O me. Come now to my chamber; where is Romeo to lie! O Juliet--Hush; she will give thee her rest: go to Montag:
O Montignac; there lies Romeo. O mad Juliet, what art's her here--
Where art these poor little friars?
'Alas!' quears the friAR! I see you must have an eye, O Romeo; I
see that Juliet bears such fruit, O mad Paris!' Come! what an ill humour this Romeo
wakes you! Come thou now hither: come I now, nurse--
O, my head is heavy--Come now; come hither again:' Come thou again: come, go! '
CAPulete cries out again--CAPULEt
Where have the Montserrel come to die? come now again, and stay: Juliet, I must kill myself
--I'll, kill you all--but, my heart--Go. Juliet rushes in: Juliet cries
And comes, O good nurse. 'Come now! my heart!' and 'O happy day is upon her; go to her. I see--she
will look after thee,--
Herself, my love--'Ah! what is her name again'?--Hail me, Romeo!'
O Juliet,' say ye: I am so ill-wisher--'Al as ill! O my head--I must die!' I am gone--I com