In [None]:
from adapters import *

# 参数

In [None]:
# --- Model architecture (passed positionally to TransformerLM in a later cell) ---
vocab_size = 8000       # also handed to run_train_bpe as the tokenizer vocabulary size
context_length = 256    # sequence length requested from run_get_batch
d_model = 768
d_ff = 3072
theta = 1000            # NOTE(review): presumably the RoPE base; 10000 is the common choice - confirm
num_layers = 6
num_heads = 12

# --- Runtime placement ---
# Use the GPU when one is present, otherwise fall back to CPU so the notebook
# still runs (slowly) on machines without CUDA instead of failing.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dtype = torch.bfloat16

# --- Optimisation ---
batch_size = 32
lr = 3e-4
weight_decay = 0.01
epoch = 1000            # number of training iterations (one sampled batch each), not dataset passes
grad_clip = 1.0         # threshold handed to run_gradient_clipping


# 读取文本并生成 BPE tokenizer

In [None]:
# Load the whole training corpus into memory; a later cell encodes it into token ids.
text_path = 'shakespear.txt'
with open(text_path, mode='r', encoding='utf-8') as corpus_file:
    text_example = corpus_file.read()

# Train the BPE tokenizer directly from the corpus file. The list argument is
# presumably the set of special tokens - confirm against run_train_bpe.
vocab, merge = run_train_bpe(
    text_path,
    vocab_size,
    ['<|endoftext|>'],
)

# 生成训练数据

In [None]:
# Persist the trained vocabulary and merges ('BPE' is the target passed to the writer).
write_vocab_merges(vocab, merge, 'BPE')

# Build a tokenizer from the trained vocab/merges and encode the full corpus once.
tokenizer = get_tokenizer(vocab, merge, ['<|endoftext|>'])
token_ids = tokenizer.encode(text_example)

# Keep the encoded corpus as a NumPy array; the training loops sample batches from it.
np_lst = np.array(token_ids)

# 生成模型和优化器

In [None]:
# Build the language model from the hyperparameters defined in the config cell.
model = TransformerLM(
    vocab_size, d_model, num_layers, num_heads, d_ff,
    context_length, theta, device, dtype,
)

# The optimizer class comes from the project's adapter layer.
AdamW = get_adamw_cls()

# Pass the configured learning rate explicitly; previously `lr` was defined in
# the config cell but never handed to the optimizer, so its default was used.
optimizer = AdamW(params=model.parameters(), lr=lr, weight_decay=weight_decay)

# 模型训练

In [None]:
# faster version: plain training loop with a fixed learning rate.
model.train()
for i in range(epoch):
    # `inputs` rather than `input`, so the Python builtin is not shadowed in the kernel.
    inputs, label = run_get_batch(np_lst, batch_size, context_length, device)
    optimizer.zero_grad()
    output = model(inputs)
    # Flatten to (N, vocab_size) / (N,) before computing the loss.
    loss = run_cross_entropy(output.reshape(-1, vocab_size), label.reshape(-1))
    loss.backward()
    run_gradient_clipping(model.parameters(), grad_clip)
    optimizer.step()
    if (i + 1) % 100 == 0:
        # .item() prints a plain number instead of the full tensor repr.
        print(f'train {i+1} times loss: {loss.item():.4f}')
        # Checkpoint index counts completed blocks of 100 steps (1, 2, ...),
        # matching the scheduled loop; each save overwrites 'state_dict.pt'.
        run_save_checkpoint(model, optimizer, (i + 1) // 100, 'state_dict.pt')



In [None]:
# better version: same training step, with the learning rate driven by
# run_get_lr_cosine_schedule on every iteration.
model.train()
max_learning_rate = 3e-4    # peak learning rate (chosen for the d_model=768 model)
min_learning_rate = 1e-5    # floor, so updates never stall completely
warmup_iters = 1000         # warm-up length in iterations
cosine_cycle_iters = 50000  # iteration at which the cosine cycle ends
grad_clip = 1.0             # gradient clipping threshold
epoch = cosine_cycle_iters  # train for exactly one schedule cycle (50000 iterations)
for i in range(epoch):
    # `inputs` rather than `input`, so the Python builtin is not shadowed in the kernel.
    inputs, label = run_get_batch(np_lst, batch_size, context_length, device)
    current_lr = run_get_lr_cosine_schedule(
        it=i,
        max_learning_rate=max_learning_rate,
        min_learning_rate=min_learning_rate,
        warmup_iters=warmup_iters,
        cosine_cycle_iters=cosine_cycle_iters
    )
    # Apply the scheduled learning rate to every parameter group before stepping.
    for param_group in optimizer.param_groups:
        param_group['lr'] = current_lr
    optimizer.zero_grad()
    output = model(inputs)
    # Flatten to (N, vocab_size) / (N,) before computing the loss.
    loss = run_cross_entropy(output.reshape(-1, vocab_size), label.reshape(-1))
    loss.backward()
    run_gradient_clipping(model.parameters(), grad_clip)
    optimizer.step()
    if (i + 1) % 100 == 0:
        print(f'Iteration {i+1} | Loss: {loss.item():.4f} | Current LR: {current_lr:.6f}')
        # Checkpoint index counts completed blocks of 100 steps; each save
        # overwrites 'state_dict.pt'.
        run_save_checkpoint(model, optimizer, (i+1)//100, 'state_dict.pt')

