In [23]:
import torch
from torch import nn
from GPT import GPT
import tiktoken
from torch.nn import functional as F

In [13]:
# Hyperparameters handed to the project-local GPT class.
GPT_CONFIG_124M = {
    "vocab_size": 50257,     # rows of the token embedding table
    "context_length": 256,   # rows of the positional embedding table
    "emb_dim": 768,          # width of embeddings and linear layers
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,        # p of every Dropout layer
    "qkv_bias": False,       # wq / wk / wv are built without bias
}

# Seed before constructing the model so weight initialisation is repeatable.
torch.manual_seed(123)
model = GPT(GPT_CONFIG_124M)

# Kept as the last expression so the cell also displays the module tree.
model.eval()

GPT(
  (token_embed): Embedding(50257, 768)
  (pos_embed): Embedding(256, 768)
  (dropout): Dropout(p=0.1, inplace=False)
  (block): Sequential(
    (0): TransformBlock(
      (att): MultiHeadAttention(
        (wq): Linear(in_features=768, out_features=768, bias=False)
        (wk): Linear(in_features=768, out_features=768, bias=False)
        (wv): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ffn): Sequential(
        (0): Linear(in_features=768, out_features=3072, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=3072, out_features=768, bias=True)
      )
      (LayerNorm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (LayerNorm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformBlock(
      (att): MultiHeadAttention(

### GPT生成文本

In [15]:
def generate_text_simple(model, idx, max_text, context_size):
    """Greedily append ``max_text`` tokens to ``idx``.

    On every step the model is run on at most the last ``context_size``
    tokens, the logits of the final position are selected, and the id with
    the largest logit is appended to the sequence.

    Args:
        model: Callable mapping a 2-D tensor of token ids to a 3-D tensor of
            logits (it is indexed as ``logits[:, -1, :]``).
        idx: 2-D tensor of token ids holding the prompt(s).
        max_text: Number of tokens to generate.
        context_size: Maximum number of trailing tokens fed to the model.

    Returns:
        The full sequence: every token of the original ``idx`` followed by
        the ``max_text`` generated tokens.
    """
    for _ in range(max_text):
        # Crop only the model *input*. Re-assigning `idx` here would throw
        # away the prompt / earlier tokens from the returned sequence as soon
        # as it grows past `context_size`.
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        last_logits = logits[:, -1, :]
        # Softmax is monotonic, so the argmax of the raw logits already
        # identifies the most probable token.
        idx_next = torch.argmax(last_logits, dim=-1, keepdim=True)
        idx = torch.cat((idx, idx_next), dim=-1)
    return idx

In [19]:
def text_to_id(text, tokenizer):
    """Encode ``text`` into a batch of one sequence of token ids.

    Args:
        text: String to encode.
        tokenizer: Object whose ``encode(text, allowed_special=...)`` returns
            a list of integer token ids.

    Returns:
        An integer (``torch.long``) tensor with a leading batch dimension of
        size 1 added in front of the encoded ids.
    """
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Pin the dtype: torch.tensor([]) would otherwise default to float32 for
    # an empty encoding, which is not a valid index tensor.
    encoded_tensor = torch.tensor(encoded, dtype=torch.long).unsqueeze(0)
    return encoded_tensor

In [17]:
def id_to_text(id, tokenizer):
    """Decode a tensor of token ids back into text.

    Args:
        id: Tensor of token ids; a leading dimension of size 1 is squeezed
            away before decoding.
        tokenizer: Object whose ``decode`` accepts the ids as a Python list.

    Returns:
        Whatever ``tokenizer.decode`` returns for the ids.
    """
    token_list = id.squeeze(0).tolist()
    return tokenizer.decode(token_list)

In [21]:
# Prompt to continue, and the tiktoken "gpt2" encoding used to tokenize it.
start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

# Encode the prompt, then greedily generate 10 more tokens, feeding the model
# at most `context_length` tokens per step.
prompt_ids = text_to_id(start_context, tokenizer)
token_ids = generate_text_simple(
    model=model,
    idx=prompt_ids,
    max_text=10,
    context_size=GPT_CONFIG_124M["context_length"],
)

decoded_text = id_to_text(token_ids, tokenizer)
print("Output text:\n", decoded_text)

Output text:
 Every effort moves you rentingetic wasnم refres RexAngel infieldcigans


#### 计算文本生成损失

#### 使用交叉熵损失来衡量两个概率分布之间的差异

In [None]:
# Minimal runnable cross-entropy example (a bare F.cross_entropy() call
# raises TypeError because `input` and `target` are required).
# Two positions over a 3-token vocabulary: one row of raw logits per position,
# and one target class index per row.
demo_logits = torch.tensor([[2.0, 0.5, 0.1],
                            [0.1, 0.2, 3.0]])
demo_targets = torch.tensor([0, 2])

# With the default reduction the result is the mean negative log-probability
# that softmax(logits) assigns to the target classes.
demo_loss = F.cross_entropy(demo_logits, demo_targets)
demo_loss

### 计算训练集和验证集的损失

#### 单个批次的损失

In [24]:
def calc_loss_batch(inputs, targets, model, device):
    """Compute the cross-entropy loss of ``model`` on a single batch.

    Args:
        inputs: Tensor passed to ``model`` after being moved to ``device``.
        targets: Tensor of target class indices, moved to ``device`` as well.
        model: Callable producing logits for ``inputs``.
        device: Device (or device string) both tensors are moved to.

    Returns:
        The tensor returned by ``F.cross_entropy`` for this batch.
    """
    batch_inputs = inputs.to(device)
    batch_targets = targets.to(device)

    predictions = model(batch_inputs)

    # Merge the first two logit dimensions into one, and flatten the targets
    # completely, so each row of logits lines up with one target index.
    flat_predictions = torch.flatten(predictions, 0, 1)
    flat_targets = torch.flatten(batch_targets)
    return F.cross_entropy(flat_predictions, flat_targets)

#### 数据加载器采样的所有批次的损失

In [25]:
def clac_loss_loader(data_loader, model, device, num_batches=None):
    """Average the per-batch loss over the first batches of ``data_loader``.

    Args:
        data_loader: Sized iterable (``len()`` is called on it) yielding
            ``(inputs, targets)`` pairs.
        model: Forwarded to ``calc_loss_batch``.
        device: Forwarded to ``calc_loss_batch``.
        num_batches: Maximum number of batches to evaluate. ``None`` means
            every batch; larger values are clamped to ``len(data_loader)``.

    Returns:
        The mean of the per-batch losses as a Python float, or ``nan`` when
        there is nothing to average (empty loader, or ``num_batches <= 0``).
    """
    available = len(data_loader)
    if num_batches is None:
        num_batches = available
    else:
        num_batches = min(num_batches, available)

    # Covers the empty loader as well as a zero / negative request, which
    # would otherwise divide by zero or by a negative count below.
    if num_batches <= 0:
        return float("nan")

    total_loss = 0.0
    for i, (inputs, targets) in enumerate(data_loader):
        if i >= num_batches:
            break
        total_loss += calc_loss_batch(inputs, targets, model, device).item()
    return total_loss / num_batches


# Correctly spelled alias; the original (misspelled) name stays available for
# any existing caller.
calc_loss_loader = clac_loss_loader