# 准备数据集

我们将《三国演义》的原文作为数据集, 来训练一个字符级别的语言模型. 也就是将原文中的汉字以及标点符号等等映射成整型(`int`).

In [88]:
# 导入需要的包
import pickle
import numpy as np

In [89]:
# Open the text of "Romance of the Three Kingdoms" (`input.txt`) and read all of it.
# The encoding is passed explicitly so that decoding the Chinese text does not
# depend on the platform's default locale encoding.
# NOTE(review): assumes `input.txt` is stored as UTF-8 -- confirm.
with open('input.txt', 'r', encoding='utf-8') as f:
    data = f.read()

# Print the number of characters in the text
print(f"字符数据集的长度: {len(data):,}")

字符数据集的长度: 605,548


In [90]:
# Build the vocabulary: every distinct character that occurs in the text.

# A set removes duplicates; sorting gives the characters a deterministic order
chars = sorted(set(data))
# Vocabulary size = number of distinct characters
vocab_size = len(chars)

print("所有不同的字符:", ''.join(chars))
print(f"不同字符数量: {vocab_size:,}")

所有不同的字符: 
 <>[]—‘’“”…□　、。《》【】一丁七万丈三上下不与丐丑专且丕世丘丙业丛东丝丞丢两严丧个中丰临丸丹为主丽举乂乃久么义之乌乎乏乐乔乖乘乙九乞也习乡书买乱乳乾了予争事二于亏云互五井亘亚些亟亡亢交亥亦产亨亩享京亭亮亲亵亹人什仁仅仆仇今介仍从仓仔仕他仗付仙仞代令以仪们仰仲件价任仿伉伊伍伎伏伐休众优伙会伞伟传伤伦伪伯伴伷伸伺似但位低住佐佑体何佗余佛作佞你佣佥佩佯佳佻使侄侈例侍供依侠侥侧侪侮侯侵便促俄俊俎俗俘保俞俟信俦俨俭修俯俱俸俺俾倅倍倏倒倘候倚借倡倥倦值倾偃假偎偏偕做停健偬偶偷偿傅傍傕储催傲像僚僧僭僮僵僻儁儒儿兀允元兄充兆先光克免兔兖党兜兢入全八公六兮兰共关兴兵其具典兹养兼兽冀内冈册再冒冓冕冗写军农冠冢冤冥冬冯冰冲决况冶冷冻净凄准凉凋凌减凑凛凝几凡凤凭凯凰凳凶凹出击函凿刀刁刃分切刈刎刑划刖列刘则刚创初判利别刮到制刺刻刽剁剂削前剐剑剔剖剜剥剧剩剪副割剽剿劈力劝办功加务劣动助努劫劬劭励劲劳劾势勃勇勉勋勑勒勖勘募勤勺勾勿匄包匆匍匐化北匙匝匠匡匣匪匮匹区医匿十千升午半华协卑卒卓单卖南博卜卞占卢卣卤卦卧卫卯印危即却卵卷卸卿厄厅历厉压厌厔厕厘厚原厢厥厦厨厮去县参又叉及友双反发叔取受变叙叛叟叠口古句另叨叩只叫召叮可台叱史右叵叶号司叹吁吃各合吉吊同名后吏吐向吓吕君吝吞吟吠否含听启吴吸吹吻吼吾呀呆呈告呐呕员呜呦周味呵呻呼命咆和咎咏咐咒咛咥咨咫咬咸咽哀品哂哄哉响哑哙哥哨哩哭哮哲哺哽唆唇唐唤唬唯唱唾唿商啕啖啜啸啼喂喃善喈喉喊喏喘喜喝喟喧喨喷喻嗓嗔嗜嗟嗣嗤嘉嘏嘤嘱嘴嘶嘹噀噎噤器噪噫噬嚎嚷嚼囊囚四回因团囧园困围囷固国图圃圆圈土圣在圭地场坂均坊坌坎坏坐坑块坚坛坞坟坠坡坤坦垂垒垓垕垛垠垢垣垦垫埃埋城域基堂堆堑堕堤堪堰堵塌塑塔塘塞填墀境墉墓墙增墟墨墩墵壁壎壑壕壤士壬壮声壳壶处备复夏夔夕外夙多夜够夤夥大天太夫夭央失头夷夸夹夺奁奂奄奇奈奉奋奎奏契奔奕奖套奚奠奢奥女奴奸好如妃妄妆妇妒妓妖妙妥妨妫妹妻妾姊始姐姑姓委姚姜姬姻姿威娄娇娘娥娩娱娴娶娼婆婉婚婢婴婿媒媚嫁嫂嫉嫌嫔嫡嫩嬉嬖嬴子孑孔孕字存孙孚孝孟季孤孥学孩孰孱孺孽宁宄宅宇守安宋完宏宓宕宗官宙定宛宜宝实宠审客宣室宥宦宪宫宰害宴宵家容宽宾宿寂寄寅密寇富寐寒寓寔寝寞察寡寤寨寮寰寸对寺寻导寿封射将尉尊小少尔尖尘尚尝尤尧尪就尸尹尺尼尽尾局层居屈屋屏屑展属屠屡履屦屯山岁岂岌岐岑岖岗岘岛岩岭岱岳岷岸峙峡峨峪峭峰峻崇崎崔崖崤崦崩嵋嵌嵩嵯嶲嶷巅巍川州巡

In [91]:
# Build the lookup tables between characters and integer ids.

# character -> integer id (the id is the character's index in `chars`)
stoi = {ch: idx for idx, ch in enumerate(chars)}

# integer id -> character
itos = dict(enumerate(chars))

# For example, look up the integer id assigned to the character `鼻`
print(stoi['鼻'])

3934


In [92]:
# Given a string `s`, return the list of integer ids of its characters.
# A character that is not a key of `stoi` raises KeyError.
def encode(s):
    return list(map(stoi.__getitem__, s))

# Given a list of integer ids, return the string formed by the matching characters.
# An id that is not a key of `itos` raises KeyError.
def decode(l):
    return ''.join(itos[idx] for idx in l)

# Quick check of encode / decode.
# NOTE(review): the ids passed to decode() are hard-coded and differ from the
# ids shown in the recorded output of encode() for the same sentence, so the
# decoded text is not the original sentence. Presumably they date from an
# earlier vocabulary -- confirm, and consider decoding encode()'s result instead.
print(encode('滚滚长江东逝水'))
print(decode([2066, 2066, 3623, 1903, 62, 3452, 1893]))

[2044, 2044, 3600, 1881, 40, 3429, 1871]
潼潼阊没之遣沆


In [93]:
# Split the dataset by position: the first 90% of the text is the training
# set and the remaining 10% is the validation set.
n = len(data)
train_data, val_data = data[:int(n*0.9)], data[int(n*0.9):]

In [94]:
# Turn the characters of each split into their integer ids
train_ids, val_ids = encode(train_data), encode(val_data)

# Report how many tokens (characters) each split contains
print(f"训练数据集中有 {len(train_ids):,} 个字符(token)")
print(f"验证数据集中有 {len(val_ids):,} 个字符(token)")

训练数据集中有 544,993 个字符(token)
验证数据集中有 60,555 个字符(token)


In [95]:
# Convert both id lists to uint16 arrays and dump them as raw binary files
train_ids, val_ids = (np.array(ids, dtype=np.uint16) for ids in (train_ids, val_ids))
train_ids.tofile('train.bin')
val_ids.tofile('val.bin')

In [96]:
# Pickle the vocabulary metadata (size plus both lookup tables) so that it can
# be reloaded later for encoding and decoding
meta = dict(vocab_size=vocab_size, itos=itos, stoi=stoi)
with open('meta.pkl', 'wb') as f:
    pickle.dump(meta, f)

我们数据准备的工作就完成了.

# 编写GPT模型

接下来我们开始编写模型代码

In [97]:
# 首先导入需要的一些包
import math
import inspect
from dataclasses import dataclass

import torch
import torch.nn as nn
from torch.nn import functional as F

## GeLU激活函数

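公式如下(tanh 近似形式, 与下面的代码一致):

$$\mathrm{GELU}(x) \approx 0.5\,x\left(1 + \tanh\left(\sqrt{\frac{2}{\pi}}\,\left(x + 0.044715\,x^{3}\right)\right)\right)$$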

In [98]:
# GELU activation in its tanh-approximation form. Paper:
# https://arxiv.org/abs/1606.08415
def new_gelu(x):
    """Apply the tanh-approximated GELU element-wise to the tensor `x`."""
    inner = math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))
    return 0.5 * x * (1.0 + torch.tanh(inner))

## 层归一化模块

In [99]:
# Layer normalisation module whose bias term is optional
class LayerNorm(nn.Module):
    """Layer norm over the trailing dimension with a learnable scale and an optional bias."""

    def __init__(self, ndim, bias):
        super().__init__()
        # scale starts at one, bias (when requested) at zero
        self.weight = nn.Parameter(torch.ones(ndim))
        if bias:
            self.bias = nn.Parameter(torch.zeros(ndim))
        else:
            self.bias = None

    def forward(self, input):
        eps = 1e-5
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, eps)

## 因果自注意力机制模块

In [100]:
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention computed with `is_causal=True`."""

    def __init__(self, config):
        super().__init__()
        # the embedding width must split evenly across the heads
        assert config.n_embd % config.n_head == 0
        # a single linear layer produces query, key and value for all heads at once
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # output projection applied to the re-assembled head outputs
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        # number of heads and embedding width, kept for use in forward()
        self.n_head = config.n_head
        self.n_embd = config.n_embd

    def forward(self, x):
        """
        Map a (B, T, C) tensor to a (B, T, C) tensor: batch size B,
        sequence length T, embedding width C (n_embd).
        """
        B, T, C = x.size()
        head_dim = C // self.n_head

        def split_heads(t):
            # (B, T, C) -> (B, nh, T, hs): the head axis moves next to the batch axis
            return t.view(B, T, self.n_head, head_dim).transpose(1, 2)

        # one projection, cut into three equal slices along the channel axis
        q, k, v = (split_heads(part) for part in self.c_attn(x).split(self.n_embd, dim=2))

        attended = F.scaled_dot_product_attention(q, k, v, attn_mask=None, is_causal=True)

        # put the head outputs back side by side: (B, nh, T, hs) -> (B, T, C)
        merged = attended.transpose(1, 2).contiguous().view(B, T, C)

        return self.c_proj(merged)

## 多层感知机模块

In [101]:
class MLP(nn.Module):
    """Feed-forward network: expand to 4 * n_embd, apply new_gelu, project back to n_embd."""

    def __init__(self, config):
        super().__init__()
        hidden = 4 * config.n_embd
        self.c_fc = nn.Linear(config.n_embd, hidden, bias=config.bias)
        self.c_proj = nn.Linear(hidden, config.n_embd, bias=config.bias)

    def forward(self, x):
        return self.c_proj(new_gelu(self.c_fc(x)))

## Block模块

![](assets/Block模块示意图.svg)

In [102]:
class Block(nn.Module):
    """One Transformer block: attention then MLP, each applied to a layer-normed input and added back residually."""

    def __init__(self, config):
        super().__init__()
        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
        self.mlp = MLP(config)

    def forward(self, x):
        # residual connection around the attention sub-layer
        attn_out = self.attn(self.ln_1(x))
        x = x + attn_out
        # residual connection around the MLP sub-layer
        mlp_out = self.mlp(self.ln_2(x))
        return x + mlp_out

## GPT模型的一些参数配置

In [103]:
@dataclass
class GPTConfig:
    """Hyperparameters read by the GPT model and its sub-modules."""
    block_size: int = 64  # rows of the position-embedding table; GPT.forward asserts sequences are no longer than this
    vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
    n_layer: int = 4  # number of Block modules stacked in the model
    n_head: int = 4  # number of attention heads; CausalSelfAttention asserts it divides n_embd
    n_embd: int = 128  # embedding width
    bias: bool = False # no bias: the value is passed as `bias=` to the Linear and LayerNorm layers

## GPT模型的实现

In [130]:
class GPT(nn.Module):
    """Decoder-only Transformer language model.

    The model sums a token embedding and a learned position embedding, runs the
    result through `config.n_layer` Block modules and a final LayerNorm, and
    maps it to vocabulary logits with `lm_head`. The token-embedding weight and
    the `lm_head` weight are the same tensor (weight tying).
    """

    def __init__(self, config):
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config

        # The Transformer body
        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = LayerNorm(config.n_embd, bias=config.bias),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # weight tying: the token embedding shares its tensor with the output head
        self.transformer.wte.weight = self.lm_head.weight

        # Initialise all weights
        self.apply(self._init_weights)

        # Report the number of parameters
        print("参数数量: %.2fM" % (self.get_num_params() / 1e6,))

    def get_num_params(self):
        """Return the number of parameters, not counting the position embeddings."""
        n_params = sum(p.numel() for p in self.parameters())
        n_params -= self.transformer.wpe.weight.numel()
        return n_params

    def _init_weights(self, module):
        """Initialise one sub-module; called for every sub-module via `self.apply`.

        Linear weights are drawn from N(0, 0.02) and Linear biases are zeroed;
        Embedding weights are drawn from N(0, 0.02).
        """
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        # This branch must be a sibling of the Linear check. It used to be nested
        # under it (paired with the bias test), so it could never match an
        # Embedding and the embeddings kept PyTorch's default initialisation.
        elif isinstance(module, nn.Embedding):
            # NOTE(review): the std was 0.2 here; 0.02 matches the Linear init
            # above and is assumed to be what was intended -- confirm.
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Run the model on a batch of token ids.

        Args:
            idx: integer tensor of token ids with shape (b, t), t <= block_size.
            targets: optional integer tensor shaped like `idx`; positions equal
                to -1 are ignored by the loss.

        Returns:
            `(logits, loss)`. With `targets`, logits cover every position,
            shape (b, t, vocab_size), and loss is the cross-entropy. Without
            `targets`, logits cover only the last position, shape
            (b, 1, vocab_size), and loss is None.
        """
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.block_size, f"无法前馈(向前发送)序列长度: {t}, 因为block size只有{self.config.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)

        tok_emb = self.transformer.wte(idx) # token embeddings, shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings, shape (1, t, n_embd)
        x = tok_emb + pos_emb

        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # `targets` defaults to None, so it must not be dereferenced here.
            # Without targets only the last position is projected; indexing with
            # the list [-1] keeps the time dimension.
            logits = self.lm_head(x[:, [-1], :])
            loss = None

        return logits, loss

    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        """Build the AdamW optimizer with weight decay applied selectively.

        Parameters are separated into two groups: Linear weights, which receive
        `weight_decay`, and biases plus LayerNorm/Embedding weights, which
        receive none.

        Args:
            weight_decay: decay coefficient for the decayed group.
            learning_rate: learning rate passed to AdamW.
            betas: `(beta1, beta2)` tuple passed to AdamW.
            device_type: the fused AdamW kernel is requested only when this
                equals 'cuda' and the installed AdamW accepts `fused`.

        Returns:
            The configured `torch.optim.AdamW` instance.
        """

        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, LayerNorm, torch.nn.Embedding)
        for mn, m in self.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
                # named_modules and named_parameters are both recursive, so the
                # same tensor is seen many times; iterating this way tells us
                # which parent module each tensor belongs to
                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)

        # subtle: 'transformer.wte.weight' and 'lm_head.weight' are tied, so they
        # land in the no_decay and decay sets respectively. named_parameters()
        # does not return duplicates and reports the tensor only under
        # 'transformer.wte.weight', so 'lm_head.weight' is removed from the decay
        # set: the tensor is optimized once, via the embedding, without decay.
        decay.remove('lm_head.weight')

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                    % (str(param_dict.keys() - union_params), )

        # create the pytorch optimizer object
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(decay)], "weight_decay": weight_decay},
            {"params": [param_dict[pn] for pn in sorted(no_decay)], "weight_decay": 0.0},
        ]
        # use the 'fused' AdamW option only when this PyTorch build exposes it and we are on CUDA
        use_fused = (device_type == 'cuda') and ('fused' in inspect.signature(torch.optim.AdamW).parameters)
        print(f"using fused AdamW: {use_fused}")
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)

        return optimizer

# 训练模型

由于配置问题, 我们使用CPU来训练GPT模型.

In [137]:
from contextlib import nullcontext
import pickle
import numpy as np
import torch

# Fix the random seed
torch.manual_seed(1337)
# Device type
device = 'cpu'
block_size = 64
batch_size = 12
# adamw optimizer
learning_rate = 6e-4 # max learning rate
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
# learning rate decay settings
decay_lr = True # whether to decay the learning rate
warmup_iters = 2000 # how many steps to warm up for
lr_decay_iters = 600000 # should be ~= max_iters per Chinchilla
# get_lr() reads `min_lr`, which was not defined in any visible cell.
# NOTE(review): value chosen as learning_rate / 10 -- confirm.
min_lr = 6e-5 # learning rate returned once `it` exceeds lr_decay_iters
# evaluation settings
# estimate_loss() reads `eval_iters`, which was not defined in any visible cell.
# NOTE(review): 200 is a placeholder -- confirm.
eval_iters = 200 # number of batches averaged per split by estimate_loss()
always_save_checkpoint = True # if True, always save a checkpoint after each eval
# Start with an empty (no-op) context
ctx = nullcontext()

# Load the training and validation sets written during data preparation
train_data = np.memmap('train.bin', dtype=np.uint16, mode='r')
val_data = np.memmap('val.bin', dtype=np.uint16, mode='r')

In [138]:
# Sample a random batch of (input, target) sequences from one split
def get_batch(split):
    """Return `(x, y)`: `batch_size` windows of `block_size` ids and the same windows shifted right by one.

    `split` equal to 'train' reads from `train_data`; anything else reads from `val_data`.
    """
    data = train_data if split == 'train' else val_data
    # random start offsets, chosen so that the shifted window still fits
    ix = torch.randint(len(data) - block_size, (batch_size,))

    def window(start):
        # copy one slice out as int64 and wrap it in a tensor
        return torch.from_numpy(data[start:start+block_size].astype(np.int64))

    x = torch.stack([window(i) for i in ix])
    y = torch.stack([window(i + 1) for i in ix])

    return x.to(device), y.to(device)

In [139]:
iter_num = 0 # number of training iterations done so far
best_val_loss = 1e9 # best validation loss so far; starts at a large sentinel value

# Load the vocabulary metadata that the data-preparation step pickled
with open('meta.pkl', 'rb') as f:
    meta = pickle.load(f)

# The vocabulary size is used to configure the model
meta_vocab_size = meta['vocab_size']
print(f"found vocab_size = {meta_vocab_size} (inside 'meta.pkl')")

found vocab_size = 3951 (inside 'meta.pkl')


In [140]:
# Train a new model from scratch
print("从头开始训练模型")
# default configuration, with the vocabulary size taken from the dataset metadata
gptconf = GPTConfig(vocab_size=meta_vocab_size)
print(gptconf)
model = GPT(gptconf)
model.to(device)
# optimizer
optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device)

# fetch the very first batch and show one input row next to its target row
X, Y = get_batch('train')
print(decode(X[1].tolist()))
print(decode(Y[1].tolist()))

从头开始训练模型
GPTConfig(block_size=64, vocab_size=3951, n_layer=4, n_head=4, n_embd=128, bias=False)
参数数量: 1.29M
using fused AdamW: False
可不惧乎？愿君侯裒多益寡，非礼勿履：然后三公可至，青蝇可驱也。”邓飏怒曰：“此老生之常谈耳！”辂曰：“老生者见不生，常谈者见不谈
不惧乎？愿君侯裒多益寡，非礼勿履：然后三公可至，青蝇可驱也。”邓飏怒曰：“此老生之常谈耳！”辂曰：“老生者见不生，常谈者见不谈。


In [141]:
# Estimate the loss
# averages the loss over `eval_iters` random batches for each split
@torch.no_grad()
def estimate_loss():
    """Return `{'train': mean_loss, 'val': mean_loss}`; the model is put in eval mode for the measurement and back in train mode afterwards."""
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        per_batch = torch.zeros(eval_iters)
        for step in range(eval_iters):
            xb, yb = get_batch(split)
            with ctx:
                _, batch_loss = model(xb, yb)
            per_batch[step] = batch_loss.item()
        averages[split] = per_batch.mean()
    model.train()
    return averages

In [142]:
# learning rate schedule: linear warmup, then cosine decay down to min_lr
def get_lr(it):
    """Return the learning rate to use at iteration `it`."""
    # warmup phase: ramp linearly from 0 up to learning_rate
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # past the end of the decay window: stay at the floor
    if it > lr_decay_iters:
        return min_lr
    # in between: cosine interpolation from learning_rate down to min_lr
    progress = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= progress <= 1
    cosine = 0.5 * (1.0 + math.cos(math.pi * progress)) # goes from 1 down to 0
    return min_lr + cosine * (learning_rate - min_lr)

In [None]:
import time  # `time` is not imported in any of the visible cells

# Loop settings. None of these names is defined in the visible cells, so they
# are set here. NOTE(review): the values are placeholders -- confirm them, and
# drop these lines if the names are already defined elsewhere.
eval_interval = 2000 # evaluate (and possibly checkpoint) every this many iterations
log_interval = 10 # print the training loss every this many iterations
max_iters = lr_decay_iters # total iterations; kept equal to the decay horizon
eval_only = False # if True, stop right after the first evaluation
gradient_accumulation_steps = 1 # micro-batches accumulated per optimizer step
grad_clip = 1.0 # clip gradients to this norm; 0.0 disables clipping
ckpt_path = 'ckpt.pt' # file the checkpoint is written to

t0 = time.time()
local_iter_num = 0 # number of iterations in the lifetime of this process

# Training loop
while True:

    # determine and set the learning rate for this iteration
    lr = get_lr(iter_num) if decay_lr else learning_rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # evaluate the loss on train/val sets and write checkpoints
    if iter_num % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        if losses['val'] < best_val_loss or always_save_checkpoint:
            best_val_loss = losses['val']
            if iter_num > 0:
                checkpoint = {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    # `model_args` and `config` were undefined names; they are
                    # rebuilt from the model configuration and the training settings
                    'model_args': dict(vars(model.config)),
                    'iter_num': iter_num,
                    'best_val_loss': best_val_loss,
                    'config': {
                        'device': device,
                        'block_size': block_size,
                        'batch_size': batch_size,
                        'learning_rate': learning_rate,
                        'weight_decay': weight_decay,
                        'beta1': beta1,
                        'beta2': beta2,
                        'decay_lr': decay_lr,
                        'warmup_iters': warmup_iters,
                        'lr_decay_iters': lr_decay_iters,
                        'max_iters': max_iters,
                        'grad_clip': grad_clip,
                        'gradient_accumulation_steps': gradient_accumulation_steps,
                    },
                }
                # the message now names the file that is actually written
                print(f"将模型的检查点文件保存到 {ckpt_path}")
                torch.save(checkpoint, ckpt_path)
    if iter_num == 0 and eval_only:
        break

    # forward backward update, with optional gradient accumulation to simulate larger batch size
    for micro_step in range(gradient_accumulation_steps):
        with ctx:
            logits, loss = model(X, Y)
            # scale so the accumulated gradient is the mean over the micro-batches
            loss = loss / gradient_accumulation_steps
        # fetch the next batch
        X, Y = get_batch('train')
        # `scaler` was an undefined name; this loop calls backward()/step()
        # directly instead of going through a gradient scaler
        loss.backward()
    # clip the gradient
    if grad_clip != 0.0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    optimizer.step()
    # flush the gradients as soon as we can, no need for this memory anymore
    optimizer.zero_grad(set_to_none=True)

    # timing and logging
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % log_interval == 0:
        # undo the accumulation scaling so the printed value is the per-batch loss
        lossf = loss.item() * gradient_accumulation_steps
        # the mfu figure was dropped: `running_mfu` was an undefined name
        print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms")
    iter_num += 1
    local_iter_num += 1

    # termination conditions
    if iter_num > max_iters:
        break