# Initial Settings:

In [1]:
import torch
from torch import nn

In [24]:
class GPT2Model(nn.Module):
  def __init__(self, cfg_map):
    super().__init__()

    # embedding components
    self.emb_layer = nn.Embedding(cfg_map['vocab_size'], cfg_map['emb_dim'])
    self.pos_emb_layer = nn.Embedding(cfg_map['context_length'], cfg_map['emb_dim'])

    # huh
    self.dropout = nn.Dropout(cfg_map['drop_rate'])

    # transformer
    self.trfm_block = nn.Sequential(*[TransformerBlock(cfg_map) for i in range(cfg_map['n_layers'])])

    self.final_norm = LayerNorm(cfg_map['emb_dim'])

    # convert to logits
    self.out_head = nn.Linear(cfg_map['emb_dim'], cfg_map['vocab_size'], bias = False)

  def forward(self, in_idx):
    batch_size, seq_len = in_idx.shape
    tok_embed = self.emb_layer(in_idx)
    pos_embed = self.pos_emb_layer(torch.arange(seq_len, device = in_idx.device))
    x = tok_embed + pos_embed
    x = self.dropout(x)
    x = self.trfm_block(x)
    x = self.final_norm(x)

    logits = self.out_head(x)

    return logits

class TransformerBlock(nn.Module):
  def __init__(self, cfg_map):
    super().__init__()
    self.norm_1 = LayerNorm(cfg_map['emb_dim'])
    self.multihead_attention = MultiheadAttention(cfg_map['emb_dim'], cfg_map['emb_dim'],
                                                  cfg_map['drop_rate'], cfg_map['context_length'],
                                                  cfg_map['n_heads'], cfg_map['qkv_bias'])
    self.dropout = nn.Dropout(cfg_map['drop_rate'])
    self.norm_2 = LayerNorm(cfg_map['emb_dim'])
    self.ffw = FeedForward(cfg_map)
  def forward(self, x):
    shortcut_x  = x
    x = self.norm_1(x)
    x = self.multihead_attention(x)
    x = self.dropout(x)
    x = shortcut_x + x

    shortcut_x = x
    x = self.norm_2(x)
    x = self.ffw(x)
    x = self.dropout(x)
    x = x + shortcut_x
    return x

class LayerNorm(nn.Module):
  def __init__(self, emb_dim, eps=1e-5):
    super().__init__()
    self.epsilon = eps

    # learnable params to tweak the layer norm
    self.scale = nn.Parameter(torch.ones(emb_dim))
    self.shift = nn.Parameter(torch.zeros(emb_dim))

  def forward(self, x):
    mean = torch.mean(x, dim = -1, keepdim = True)
    var = torch.var(x, dim = -1, keepdim = True, correction = False)
    norm_x = (x - mean) / (var + self.epsilon)**0.5
    return self.scale * norm_x + self.shift

class GELULayer(nn.Module):
  def __init__(self):
    super().__init__()

  def forward(self, x):
    return 0.5 * x * (1 + torch.tanh((2/torch.pi)**0.5 * (x + 0.044715 * x**3)))

class FeedForward(nn.Module):
  def __init__(self, cfg):
    super().__init__()
    self.layers = nn.Sequential(
        nn.Linear(cfg['emb_dim'], 4 * cfg['emb_dim']), # domain expansion
        GELULayer(), # just gelu for non-linear
        nn.Linear(4 * cfg['emb_dim'], cfg['emb_dim']), # domain contraction
    )

  def forward(self, x):
    return self.layers(x)

class MultiheadAttention(nn.Module):
  def __init__(self, d_in, d_out, drop_out_rate, context_length, num_heads, ena_bias = False):
    super().__init__()

    assert (d_out % num_heads == 0), \
      "d_out must be divisible by num_heads"

    self.d_in = d_in
    self.d_out = d_out
    self.num_heads = num_heads
    self.head_dim = self.d_out // self.num_heads

    self.W_Q = nn.Linear(d_in, d_out, bias = ena_bias)
    self.W_K = nn.Linear(d_in, d_out, bias = ena_bias)
    self.W_V = nn.Linear(d_in, d_out, bias = ena_bias)

    # projection?
    self.out_proj = nn.Linear(d_out, d_out)

    self.drop_out_layer = nn.Dropout(drop_out_rate)
    self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal = 1))

  def forward(self, x):

    batch, num_tokens, d_in = x.shape

    queries = self.W_Q(x)
    keys = self.W_K(x)
    values = self.W_V(x)

    queries = queries.view(batch, num_tokens, self.num_heads, self.head_dim)
    keys = keys.view(batch, num_tokens, self.num_heads, self.head_dim)
    values = values.view(batch, num_tokens, self.num_heads, self.head_dim)

    queries = queries.transpose(1,2)
    keys = keys.transpose(1,2)
    values = values.transpose(1,2)

    attention_score = queries @ keys.transpose(2, 3)

    mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
    attention_score.masked_fill_(mask_bool, -torch.inf)
    attention_score = attention_score / self.head_dim**0.5
    attention_weight = torch.softmax(attention_score, dim = -1)
    attention_weight = self.drop_out_layer(attention_weight)

    context_vectors = (attention_weight @ values).transpose(1, 2)

    context_vectors = context_vectors.contiguous().view(batch, num_tokens, self.d_out)

    # combs for learning relationship of head's results
    context_vectors = self.out_proj(context_vectors)

    return context_vectors


In [6]:
GPT_CONFIG_124M = {
  "vocab_size": 50257, # Vocabulary size
  "context_length": 256, # Context length
  "emb_dim": 768, # Embedding dimension
  "n_heads": 12, # Number of multihead_attentionention heads
  "n_layers": 12, # Number of layers
  "drop_rate": 0.1, # Dropout rate
  "qkv_bias": False # Query-Key-Value bias
}

In [3]:
import tiktoken

def text_to_token_ids(text, tokenizer):
  encoded = tokenizer.encode(text,  allowed_special = {'<|endoftext|>'})
  encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dim
  return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
  # tiktoken only accept integer numpy array, never tensor
  if isinstance(token_ids, torch.Tensor):
      token_ids = token_ids.squeeze().tolist()  # [seq_len]
  return tokenizer.decode(token_ids)

In [4]:
def generate(model, idx, context_length, maximum_token, temperature = 0.0, topk = None, eos_id = None):
  for i in range(maximum_token):
    # slice the input for acceptable input size (<= context length)
    idx = idx[:, -context_length:]

    with torch.no_grad():
      logits = model(idx)

    last_vector = logits[:, -1, :]

    if topk is not None:
      top_logits, _ = torch.topk(last_vector, topk)

      min_val = top_logits[:, -1]

      last_vector = torch.where(
          last_vector < min_val,
          torch.tensor(float('-inf')).to(last_vector.device),
          last_vector
      )

    if temperature > 0.0: #3
      last_vector = last_vector / temperature
      probs = torch.softmax(last_vector, dim=-1)
      idx_next = torch.multinomial(probs, num_samples=1)
    else:
      idx_next = torch.argmax(last_vector, dim = -1, keepdim = True)

    if idx_next == eos_id:
      break

    idx = torch.cat((idx, idx_next), dim = 1) # (batch, num_token, vocab_size)

  return idx



# Just downloading stuffws:

In [7]:
pip install tensorflow>=2.15.0 tqdm>=4.66

In [8]:
import urllib.request
url = (
  "https://raw.githubusercontent.com/rasbt/"
  "LLMs-from-scratch/main/ch05/"
  "01_main-chapter-code/gpt_download.py"
)
filename = url.split('/')[-1]
urllib.request.urlretrieve(url, filename)

('gpt_download.py', <http.client.HTTPMessage at 0x7e72d471e480>)

In [9]:
from gpt_download import download_and_load_gpt2
settings, params = download_and_load_gpt2(
model_size="355M", models_dir="gpt2"
)

checkpoint: 100%|██████████| 77.0/77.0 [00:00<00:00, 117kiB/s]
encoder.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 2.29MiB/s]
hparams.json: 100%|██████████| 91.0/91.0 [00:00<00:00, 131kiB/s]
model.ckpt.data-00000-of-00001: 100%|██████████| 1.42G/1.42G [02:05<00:00, 11.3MiB/s]
model.ckpt.index: 100%|██████████| 10.4k/10.4k [00:00<00:00, 20.2MiB/s]
model.ckpt.meta: 100%|██████████| 927k/927k [00:00<00:00, 1.73MiB/s]
vocab.bpe: 100%|██████████| 456k/456k [00:00<00:00, 1.78MiB/s]


In [10]:
print("Settings:", settings)
print("Parameter dictionary keys:", params.keys())

Settings: {'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 1024, 'n_head': 16, 'n_layer': 24}
Parameter dictionary keys: dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])


In [11]:
print(params["wte"])
print("Token embedding weight tensor dimensions:", params["wte"].shape)

[[-0.0115168   0.00311915 -0.00729894 ... -0.05262156 -0.17569277
   0.02565791]
 [-0.00861426  0.06360211 -0.01822355 ... -0.01364703 -0.12153847
   0.05352487]
 [ 0.05854857  0.06891199  0.02622696 ... -0.10057542 -0.19788682
  -0.0039184 ]
 ...
 [ 0.00162342 -0.04411932 -0.0517492  ... -0.10079621 -0.00865952
   0.02637872]
 [-0.14374605 -0.04632217 -0.00650705 ...  0.07464293 -0.04721651
  -0.03829013]
 [ 0.02065966 -0.01334631 -0.02586888 ...  0.03886637 -0.00233481
   0.00107106]]
Token embedding weight tensor dimensions: (50257, 1024)


# Load the setting and params

In [12]:
model_configs = {
  "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
  "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
  "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
  "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

In [13]:
model_name = "gpt2-medium (355M)"
NEW_CONFIG = GPT_CONFIG_124M.copy()
NEW_CONFIG.update(model_configs[model_name])
NEW_CONFIG.update({"context_length": 1024})
NEW_CONFIG.update({"qkv_bias": True})

In [25]:
gpt2 = GPT2Model(NEW_CONFIG)
gpt2.eval()

GPT2Model(
  (emb_layer): Embedding(50257, 1024)
  (pos_emb_layer): Embedding(1024, 1024)
  (dropout): Dropout(p=0.1, inplace=False)
  (trfm_block): Sequential(
    (0): TransformerBlock(
      (norm_1): LayerNorm()
      (multihead_attention): MultiheadAttention(
        (W_Q): Linear(in_features=1024, out_features=1024, bias=True)
        (W_K): Linear(in_features=1024, out_features=1024, bias=True)
        (W_V): Linear(in_features=1024, out_features=1024, bias=True)
        (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (drop_out_layer): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
      (norm_2): LayerNorm()
      (ffw): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=1024, out_features=4096, bias=True)
          (1): GELULayer()
          (2): Linear(in_features=4096, out_features=1024, bias=True)
        )
      )
    )
    (1): TransformerBlock(
      (norm_1): LayerNorm()
      (mu

In [20]:
def assign(left, right):
  a = left.shape
  print(" left ok.")
  b = right.shape
  print(" right ok.")
  if left.shape != right.shape:
      raise ValueError(f"Shape mismatch. Left: {left.shape}, "
                        "Right: {right.shape}"
                        )
  return torch.nn.Parameter(torch.tensor(right))

In [18]:
import numpy as np

def load_weights_into_gpt(gpt, params):
  # position embedding layer
  gpt.pos_emb_layer.weight = assign(gpt.pos_emb_layer.weight, params['wpe'])
  print("Position embedding done!")

  # token embedding layer
  gpt.emb_layer.weight = assign(gpt.emb_layer.weight, params['wte'])
  print("Token embedding done!")

  # transformer block:
  for b in range(len(params['blocks'])):
    # take the weight from params, split it into Q, K, V
    q_w, k_w, v_w = np.split((params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1)

    # load into our instance
    gpt.trfm_block[b].multihead_attention.W_Q.weight = assign(gpt.trfm_block[b].multihead_attention.W_Q.weight, q_w.T)
    gpt.trfm_block[b].multihead_attention.W_K.weight = assign(gpt.trfm_block[b].multihead_attention.W_K.weight, k_w.T)
    gpt.trfm_block[b].multihead_attention.W_V.weight = assign(gpt.trfm_block[b].multihead_attention.W_V.weight, v_w.T)
    print("QKV weight done!")

    # bias for Q, K, V
    q_b, k_b, v_b = np.split((params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1)
    gpt.trfm_block[b].multihead_attention.W_Q.bias = assign(gpt.trfm_block[b].multihead_attention.W_Q.bias, q_b)
    gpt.trfm_block[b].multihead_attention.W_K.bias = assign(gpt.trfm_block[b].multihead_attention.W_K.bias, k_b)
    gpt.trfm_block[b].multihead_attention.W_V.bias = assign(gpt.trfm_block[b].multihead_attention.W_V.bias, v_b)
    print("QKV bias done!")

    # weight for outer projection
    gpt.trfm_block[b].multihead_attention.out_proj.weight = assign(gpt.trfm_block[b].multihead_attention.out_proj.weight, params["blocks"][b]["attn"]["c_proj"]["w"].T)
    gpt.trfm_block[b].multihead_attention.out_proj.bias = assign(gpt.trfm_block[b].multihead_attention.out_proj.bias, params["blocks"][b]["attn"]["c_proj"]["b"])
    print("Outer projection done!")

    # feed forward
    gpt.trfm_block[b].ffw.layers[0].weight = assign(gpt.trfm_block[b].ffw.layers[0].weight, params["blocks"][b]["mlp"]["c_fc"]["w"].T)
    gpt.trfm_block[b].ffw.layers[0].bias = assign(gpt.trfm_block[b].ffw.layers[0].bias, params["blocks"][b]["mlp"]["c_fc"]["b"])
    gpt.trfm_block[b].ffw.layers[2].weight = assign(gpt.trfm_block[b].ffw.layers[2].weight, params["blocks"][b]["mlp"]["c_proj"]["w"].T)
    gpt.trfm_block[b].ffw.layers[2].bias = assign(gpt.trfm_block[b].ffw.layers[2].bias, params["blocks"][b]["mlp"]["c_proj"]["b"])
    print("Feed forward done!")

    # normalize
    gpt.trfm_block[b].norm_1.scale = assign(gpt.trfm_block[b].norm_1.scale, params["blocks"][b]["ln_1"]["g"])
    gpt.trfm_block[b].norm_1.shift = assign(gpt.trfm_block[b].norm_1.shift, params["blocks"][b]["ln_1"]["b"])
    gpt.trfm_block[b].norm_2.scale = assign(gpt.trfm_block[b].norm_2.scale, params["blocks"][b]["ln_2"]["g"])
    gpt.trfm_block[b].norm_2.shift = assign(gpt.trfm_block[b].norm_2.shift, params["blocks"][b]["ln_2"]["b"])
    print("Norm trf done!")

  # final norm
  gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
  gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
  print("Final norm done!")

  # OG GPT2 reuse the token embedding weight
  gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])
  print("Out head done!")

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
load_weights_into_gpt(gpt2, params)
gpt2.to(device)

 left ok.
 right ok.
Position embedding done!
 left ok.
 right ok.
Token embedding done!
 left ok.
 right ok.
 left ok.
 right ok.
 left ok.
 right ok.
QKV weight done!
 left ok.
 right ok.
 left ok.
 right ok.
 left ok.
 right ok.
QKV bias done!
 left ok.
 right ok.
 left ok.
 right ok.
Outer projection done!
 left ok.
 right ok.
 left ok.
 right ok.
 left ok.
 right ok.
 left ok.
 right ok.
Feed forward done!
 left ok.
 right ok.
 left ok.
 right ok.
 left ok.
 right ok.
 left ok.
 right ok.
Norm trf done!
 left ok.
 right ok.
 left ok.
 right ok.
 left ok.
 right ok.
QKV weight done!
 left ok.
 right ok.
 left ok.
 right ok.
 left ok.
 right ok.
QKV bias done!
 left ok.
 right ok.
 left ok.
 right ok.
Outer projection done!
 left ok.
 right ok.
 left ok.
 right ok.
 left ok.
 right ok.
 left ok.
 right ok.
Feed forward done!
 left ok.
 right ok.
 left ok.
 right ok.
 left ok.
 right ok.
 left ok.
 right ok.
Norm trf done!
 left ok.
 right ok.
 left ok.
 right ok.
 left ok.
 right ok

GPT2Model(
  (emb_layer): Embedding(50257, 1024)
  (pos_emb_layer): Embedding(1024, 1024)
  (dropout): Dropout(p=0.1, inplace=False)
  (trfm_block): Sequential(
    (0): TransformerBlock(
      (norm_1): LayerNorm()
      (multihead_attention): MultiheadAttention(
        (W_Q): Linear(in_features=1024, out_features=1024, bias=True)
        (W_K): Linear(in_features=1024, out_features=1024, bias=True)
        (W_V): Linear(in_features=1024, out_features=1024, bias=True)
        (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (drop_out_layer): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
      (norm_2): LayerNorm()
      (ffw): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=1024, out_features=4096, bias=True)
          (1): GELULayer()
          (2): Linear(in_features=4096, out_features=1024, bias=True)
        )
      )
    )
    (1): TransformerBlock(
      (norm_1): LayerNorm()
      (mu

In [30]:
torch.manual_seed(42)
start_context = "Holden Caufield"
tokenizer = tiktoken.get_encoding("gpt2")

token_ids = generate(
  model=gpt2,
  idx=text_to_token_ids(start_context, tokenizer),
  maximum_token=100,
  context_length=GPT_CONFIG_124M["context_length"],
  temperature = 3,
  topk = 5,
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Holden Caufield's wife was a teacher at a private school, but she didn't know her son, David, until the family's daughter was in kindergarten. "I had been in the business of raising kids since I got here and I'd always wanted to have one, and I knew I was going to get it someday," she said of her husband and son, a senior who played basketball. "He's the kind of guy who's going to give you the best chance to do that in life and I
