In [1]:
import os
# Route both HTTP and HTTPS traffic through the local proxy
# (presumably so the model download below can reach the network — adjust or
# remove if no proxy is listening on this port).
os.environ.update({
    'http_proxy': 'http://127.0.0.1:7890',
    'https_proxy': 'http://127.0.0.1:7890',
})

In [2]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pretrained GPT-2 tokenizer and language-model head.
# The two loads are independent of each other; both use the 'gpt2' checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')



In [3]:
# Encode the initial prompt into a tensor of token ids
input_ids = tokenizer.encode("Hello, my name is", return_tensors='pt')

# Display the ids together with their shape: (batch_size, seq_len)
input_ids, input_ids.shape

(tensor([[15496,    11,   616,  1438,   318]]), torch.Size([1, 5]))

#### 第一步生成


KV cache 的显存占用（字节），其中系数 2 对应 key 与 value，$\text{precision}$ 为每个元素所占的字节数：

$$2\times \text{batch\_size}\times \text{seq\_len}\times n_{\text{layers}}\times d_{\text{model}}\times \text{precision}$$

```
past_key_values = (
    (key_layer_1, value_layer_1),
    (key_layer_2, value_layer_2),
    ...
    (key_layer_N, value_layer_N)
)
```

- `{key/value}_layer_i` shape
    - (batch_size, num_heads, seq_length, head_dim)
- d_model = num_heads * head_dim

In [4]:
# First generation step: run the whole prompt through the model once and keep
# the per-layer key/value tensors so later steps do not recompute them.
# no_grad: this is inference only, so skip building the autograd graph
# (otherwise the cached tensors keep the graph and its activations alive).
with torch.no_grad():
    output = model(input_ids, use_cache=True)
next_token_logits = output.logits[:, -1, :]  # logits of the last time step
past_key_values = output.past_key_values    # cached keys and values

In [5]:
# Logits are produced for every prompt position: (batch_size, seq_len, vocab_size)
output.logits.shape

torch.Size([1, 5, 50257])

In [6]:
# Number of cached layers, and the shape of the first layer's key tensor:
# (batch_size, num_heads, seq_length, head_dim)
len(past_key_values), past_key_values[0][0].shape

(12, torch.Size([1, 12, 5, 64]))

In [22]:
# Key GPT-2 hyperparameters (values as shown in the config displayed by this cell):
# "vocab_size": 50257  -> size of the last logits dimension
# "n_layer": 12,       -> number of (key, value) pairs in past_key_values
# "n_head": 12,        -> heads dimension of each cached tensor
# "n_embd": 768,       -> d_model = n_head * head_dim = 12 * 64
model.config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.45.0.dev0",
  "use_cache": true,
  "vocab_size": 50257
}

#### 采样下一个 token

In [8]:
# Keep a handle on the cache produced by the first step so it can be compared
# with the cache after the second step.
# NOTE(review): this stores a reference, not a copy. If the installed
# transformers version returns a cache object that is updated in place, the
# later comparison would compare the object with itself — confirm.
raw_past_key_values = output.past_key_values
raw_past_key_values[0][0].shape

torch.Size([1, 12, 5, 64])

In [9]:
# 采样下一个令牌（例如取最大概率的令牌）
next_token = torch.argmax(next_token_logits, dim=-1).unsqueeze(-1)

# 第二步生成，使用缓存
output = model(next_token, past_key_values=past_key_values, use_cache=True)
next_token_logits = output.logits[:, -1, :]
past_key_values = output.past_key_values

# 重复上述步骤，直到生成结束

In [10]:
# After feeding one more token the cached sequence dimension has grown by one
past_key_values[0][0].shape

torch.Size([1, 12, 6, 64])

In [11]:
# Side by side: first-layer key shape after step 1 vs. after step 2
raw_past_key_values[0][0].shape, past_key_values[0][0].shape

(torch.Size([1, 12, 5, 64]), torch.Size([1, 12, 6, 64]))

In [12]:
# Verify that the second step only appended to the cache: the leading
# positions of the new first-layer keys must equal the keys cached in step 1.
# The prefix length is read from the stashed tensor instead of hard-coding the
# prompt length, so the check still holds for a different prompt.
old_keys = raw_past_key_values[0][0]
new_keys = past_key_values[0][0]
prefix_len = old_keys.shape[2]  # sequence axis of (batch, heads, seq, head_dim)
torch.allclose(new_keys[:, :, :prefix_len, :], old_keys)

True

### 对比

In [13]:
# Reload the GPT-2 checkpoint and its tokenizer for the comparison below
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# .eval() switches the module to evaluation mode and returns the module itself
model = GPT2LMHeadModel.from_pretrained(model_name).eval()

# Use the GPU when one is available, otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Last expression of the cell: moves the model and displays its repr
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [14]:
input_text = "Once upon a time"
# Tokenize first, then move the ids to the same device as the model
input_ids = tokenizer.encode(input_text, return_tensors='pt')
input_ids = input_ids.to(device)  # shape: (1, seq_length)

In [15]:
max_length = 30  # maximum total length of the generated sequence, prompt tokens included

In [16]:
import os
# Request synchronous CUDA kernel launches, which helps localize CUDA errors.
# NOTE(review): this is set after the model was already moved to the device in
# an earlier cell; the variable is normally read when CUDA initializes, so it
# may have no effect here. Consider setting it before any CUDA work — confirm.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [20]:
# 使用 model.generate 生成文本（贪婪搜索）
greedy_output = model.generate(
    input_ids,
    max_length=max_length,
    num_return_sequences=1,
    do_sample=False  # 关闭采样，使用贪婪搜索
)
greedy_text = tokenizer.decode(greedy_output[0], skip_special_tokens=True)
print("\n=== 使用 model.generate（贪婪搜索）生成的文本 ===")
print(greedy_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



=== 使用 model.generate（贪婪搜索）生成的文本 ===
Once upon a time, the world was a place of great beauty and great danger. The world was a place of great danger, and the world was


In [21]:
# 手动逐步生成文本（贪婪搜索）
generated_tokens = input_ids
past_key_values = None

steps = max_length - input_ids.shape[1]

for step in range(steps):
    if step == 0:
        # 第一轮，传递整个输入
        outputs = model(generated_tokens, use_cache=True)
    else:
        # 后续轮次，只传递最后一个 token
        outputs = model(next_token, use_cache=True, past_key_values=past_key_values)
    
    # 更新 past_key_values
    past_key_values = outputs.past_key_values
    
    # 获取 logits 并选择下一个 token
    next_token_logits = outputs.logits[:, -1, :]  # 取最后一个时间步的 logits
    
    # 选择概率最高的 token（贪婪搜索）
    next_token = torch.argmax(next_token_logits, dim=-1).unsqueeze(-1)  # 形状: (batch_size, 1)
    
    # 将新生成的 token 添加到生成的序列中
    generated_tokens = torch.cat((generated_tokens, next_token), dim=1)


# 解码生成的文本
greedy_loop_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
print("\n=== 使用循环和 past_key_values（贪婪搜索）逐步生成的文本 ===")
print(greedy_loop_text)


=== 使用循环和 past_key_values（贪婪搜索）逐步生成的文本 ===
Once upon a time, the world was a place of great beauty and great danger. The world was a place of great danger, and the world was
