In [8]:
from importlib.metadata import version

# Distributions whose installed versions are reported for reproducibility.
pkgs = ["numpy", "torch", "transformers"]

# Look each version up in the installed package metadata and print one line per package.
for p in pkgs:
    installed = version(p)
    print(f"{p} version: {installed}")

numpy version: 1.26.4
torch version: 2.9.0
transformers version: 4.57.1


In [9]:
from transformers import GPT2Model


# Human-readable model label -> Hugging Face Hub repository id.
model_names = {
    "gpt2-small (124M)": "openai-community/gpt2",
    "gpt2-medium (355M)": "openai-community/gpt2-medium",
    "gpt2-large (774M)": "openai-community/gpt2-large",
    "gpt2-xl (1558M)": "openai-community/gpt2-xl",
}

# Must be one of the keys of `model_names`; other cells index per-size settings with it too.
CHOOSE_MODEL = "gpt2-small (124M)"

# Resolve the label to its Hub repository and load the pretrained model,
# using the "checkpoints" directory as the download cache.
hf_repo_id = model_names[CHOOSE_MODEL]
gpt_hf = GPT2Model.from_pretrained(hf_repo_id, cache_dir="checkpoints")

# Switch to inference mode; as the cell's last expression, the module repr is displayed.
gpt_hf.eval()

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D(nf=2304, nx=768)
        (c_proj): Conv1D(nf=768, nx=768)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D(nf=3072, nx=768)
        (c_proj): Conv1D(nf=768, nx=3072)
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [10]:
# Architecture hyperparameters that differ between the GPT-2 sizes.
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Settings shared by every size, followed by the size-specific entries
# for the model selected through CHOOSE_MODEL.
BASE_CONFIG = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length
    "drop_rate": 0.0,        # Dropout rate
    "qkv_bias": True,        # Query-key-value bias
    **model_configs[CHOOSE_MODEL],
}

In [11]:
def assign_check(left, right):
    """Wrap a copy of ``right`` in a new ``torch.nn.Parameter`` after a shape check.

    Args:
        left: Tensor-like object whose ``.shape`` is the expected shape
            (typically the parameter about to be replaced).
        right: Tensor holding the values to load.

    Returns:
        A new ``torch.nn.Parameter`` built from a cloned, detached copy of
        ``right``, so it does not share storage with the source tensor.

    Raises:
        ValueError: If ``left.shape`` and ``right.shape`` differ.
    """
    if left.shape == right.shape:
        return torch.nn.Parameter(right.clone().detach())
    raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")

In [50]:
import numpy as np


def load_weights(gpt, gpt_hf):
    """Copy pretrained GPT-2 weights from a Hugging Face model into ``gpt``.

    Every target parameter is replaced by a new ``torch.nn.Parameter`` created
    by ``assign_check``, which raises ``ValueError`` on any shape mismatch.

    Args:
        gpt: Target model. Must expose ``pos_embed``, ``token_embed``,
            ``block[i].att.{wq, wk, wv, out_proj}``, ``block[i].ffn[0]``,
            ``block[i].ffn[2]``, ``block[i].LayerNorm1``,
            ``block[i].LayerNorm2``, ``norm`` and ``out_head``.
        gpt_hf: Source model; only ``gpt_hf.state_dict()`` is used, with the
            key names of Hugging Face's ``GPT2Model``.

    Notes:
        * The number of transformer blocks is read from the module-level
          ``BASE_CONFIG["n_layers"]``.
        * Projection weights are transposed (``.T``) before assignment; the
          source stores them in the opposite orientation to the target
          layers, and ``assign_check`` verifies the resulting shapes.
        * ``out_head.weight`` is loaded from the token-embedding matrix
          (``wte.weight``), i.e. the output projection reuses those values.
    """
    d = gpt_hf.state_dict()

    # Embedding tables.
    gpt.pos_embed.weight = assign_check(gpt.pos_embed.weight, d["wpe.weight"])
    gpt.token_embed.weight = assign_check(gpt.token_embed.weight, d["wte.weight"])

    for b in range(BASE_CONFIG["n_layers"]):
        block = gpt.block[b]
        att = block.att
        prefix = f"h.{b}"

        # The source fuses query/key/value into one projection; split it into
        # three equal parts along the last dimension.
        q_w, k_w, v_w = d[f"{prefix}.attn.c_attn.weight"].chunk(3, dim=-1)
        att.wq.weight = assign_check(att.wq.weight, q_w.T)
        att.wk.weight = assign_check(att.wk.weight, k_w.T)
        att.wv.weight = assign_check(att.wv.weight, v_w.T)

        q_b, k_b, v_b = d[f"{prefix}.attn.c_attn.bias"].chunk(3, dim=-1)
        att.wq.bias = assign_check(att.wq.bias, q_b)
        att.wk.bias = assign_check(att.wk.bias, k_b)
        att.wv.bias = assign_check(att.wv.bias, v_b)

        # Attention output projection.
        att.out_proj.weight = assign_check(att.out_proj.weight, d[f"{prefix}.attn.c_proj.weight"].T)
        att.out_proj.bias = assign_check(att.out_proj.bias, d[f"{prefix}.attn.c_proj.bias"])

        # Feed-forward layers: index 0 receives c_fc, index 2 receives c_proj.
        block.ffn[0].weight = assign_check(block.ffn[0].weight, d[f"{prefix}.mlp.c_fc.weight"].T)
        block.ffn[0].bias = assign_check(block.ffn[0].bias, d[f"{prefix}.mlp.c_fc.bias"])
        block.ffn[2].weight = assign_check(block.ffn[2].weight, d[f"{prefix}.mlp.c_proj.weight"].T)
        block.ffn[2].bias = assign_check(block.ffn[2].bias, d[f"{prefix}.mlp.c_proj.bias"])

        # Per-block layer norms.
        block.LayerNorm1.weight = assign_check(block.LayerNorm1.weight, d[f"{prefix}.ln_1.weight"])
        block.LayerNorm1.bias = assign_check(block.LayerNorm1.bias, d[f"{prefix}.ln_1.bias"])
        block.LayerNorm2.weight = assign_check(block.LayerNorm2.weight, d[f"{prefix}.ln_2.weight"])
        block.LayerNorm2.bias = assign_check(block.LayerNorm2.bias, d[f"{prefix}.ln_2.bias"])

    # Model-level parameters are loaded once, outside the per-block loop, so
    # the embedding matrix is not cloned again for every block and these
    # weights are still loaded when there are zero blocks.
    gpt.norm.weight = assign_check(gpt.norm.weight, d["ln_f.weight"])
    gpt.norm.bias = assign_check(gpt.norm.bias, d["ln_f.bias"])
    gpt.out_head.weight = assign_check(gpt.out_head.weight, d["wte.weight"])

In [51]:
import torch
from GPT模型架构 import GPT
# NOTE: GPT is imported from the `GPT模型架构` module above, not from the llms_from_scratch package.
# For llms_from_scratch installation instructions, see:
# https://github.com/rasbt/LLMs-from-scratch/tree/main/


# Prefer the GPU when one is available; otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the model from the selected configuration, then overwrite its
# parameters with the pretrained Hugging Face weights.
gpt = GPT(BASE_CONFIG)
load_weights(gpt, gpt_hf)

In [52]:
def generate(model, idx, max_new_tokens, context_size,
             temperature=0.0, top_k=None, eos_id=None):
    """Autoregressively extend ``idx`` by up to ``max_new_tokens`` tokens.

    Args:
        model: Callable mapping a ``(batch, seq)`` tensor of token ids to
            logits of shape ``(batch, seq, vocab)``.
        idx: ``(batch, seq)`` tensor of token ids used as the prompt.
        max_new_tokens: Maximum number of tokens to append.
        context_size: Only the last ``context_size`` tokens are fed to the
            model at each step.
        temperature: If greater than 0, logits are divided by it and the next
            token is sampled from the softmax distribution; otherwise the
            highest-scoring token is chosen (greedy decoding).
        top_k: If given, only the ``top_k`` highest logits of each row are
            kept; all others are set to ``-inf`` before selection.
        eos_id: If given, generation stops (without appending) as soon as
            every sequence in the batch produces this token in the same step.

    Returns:
        Tensor of shape ``(batch, seq + n)`` with ``0 <= n <= max_new_tokens``.
    """
    for _ in range(max_new_tokens):
        # Crop the running sequence to the supported context window.
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        # Only the prediction for the last position is needed.
        logits = logits[:, -1, :]

        if top_k is not None:
            top_logits, _ = torch.topk(logits, top_k)
            # Keep a trailing dimension, giving shape (batch, 1), so each
            # row is compared against its own k-th largest logit.
            min_val = top_logits[:, -1:]
            logits = torch.where(
                logits < min_val,
                torch.tensor(float('-inf')).to(logits.device),
                logits
            )

        # Token selection must run whether or not top-k filtering is active.
        if temperature > 0.0:
            logits = logits / temperature
            probs = torch.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)

        # Reduce with .all() so the check is well-defined for batch sizes > 1.
        if eos_id is not None and bool((idx_next == eos_id).all()):
            break

        idx = torch.cat((idx, idx_next), dim=1)
    return idx

In [53]:
def text_to_id(text, tokenizer):
    """Tokenize ``text`` and return the ids as a tensor with a leading batch axis.

    Args:
        text: String to encode.
        tokenizer: Object with an ``encode(text, allowed_special=...)`` method;
            ``'<|endoftext|>'`` is passed as the only allowed special token.

    Returns:
        Tensor built from the encoded ids with a new dimension inserted at
        position 0.
    """
    token_ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    return torch.unsqueeze(torch.tensor(token_ids), 0)

def id_to_text(id, tokenizer):
    """Decode a tensor of token ids back into a string.

    Args:
        id: Tensor of token ids; dimension 0 is squeezed away when it has
            size 1 (e.g. a single-sequence batch).
        tokenizer: Object with a ``decode(list_of_ids)`` method.

    Returns:
        Whatever ``tokenizer.decode`` returns for the resulting id list.
    """
    token_list = torch.squeeze(id, 0).tolist()
    return tokenizer.decode(token_list)

In [54]:
import tiktoken

# Seed the global RNG so the sampling step inside generate() is repeatable.
torch.manual_seed(123)

tokenizer = tiktoken.get_encoding("gpt2")

# Inference mode for the model being sampled from.
gpt.eval()

# Encode the prompt and place it on the same device as the model.
prompt_ids = text_to_id("Every effort moves", tokenizer).to(device)

# With top_k=1 only the single highest logit survives the filter, so sampling
# at temperature 1.0 always selects that token.
token_ids = generate(
    model=gpt.to(device),
    idx=prompt_ids,
    max_new_tokens=30,
    context_size=BASE_CONFIG["context_length"],
    top_k=1,
    temperature=1.0,
)

output_text = id_to_text(token_ids, tokenizer)
print("Output text:\n", output_text)

Output text:
 Every effort moves the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the
