In [9]:
from gpt import *

In [2]:
 GPT_CONFIG_124M = dict(
     vocab_size=50257,      # vocabulary size
     context_length=1024,   # context length
     emb_dim=768,           # embedding dimension
     n_heads=12,            # number of attention heads
     n_layers=12,           # number of layers
     drop_rate=0.1,         # dropout rate
     qkv_bias=False,        # query-key-value bias
 )

In [3]:
# Build one transformer block from the 124M configuration; the cells below
# count and inspect its parameters.
block = TransformerBlock(GPT_CONFIG_124M)

## Exercise 4.1

Calculate and compare the number of parameters that are contained in the feed forward module and those that are contained in the multi-head attention module. 

In [4]:
# Add up the element count of every parameter tensor registered on the block.
total_params = sum(map(lambda weight: weight.numel(), block.parameters()))
print(f"Total number of parameters in the transformer block: {total_params:,}")

Total number of parameters in the transformer block: 7,085,568


In [5]:
# The printed label says "trainable", so count only tensors that take part in
# gradient updates (requires_grad=True) rather than every registered parameter.
total_params_ff = sum(
    p.numel() for p in block.ff.parameters() if p.requires_grad
)
print(f"Number of trainable parameters in feed forward block: {total_params_ff:,}")

Number of trainable parameters in feed forward block: 4,722,432


In [6]:
# The printed label says "trainable", so count only tensors that take part in
# gradient updates (requires_grad=True) rather than every registered parameter.
total_params_att = sum(
    p.numel() for p in block.att.parameters() if p.requires_grad
)
print(f"Number of trainable parameters in attention block: {total_params_att:,}")


Number of trainable parameters in attention block: 2,360,064


In [7]:
# Print the block's module tree (submodule names and layer dimensions) to see
# which layers contribute to the parameter counts above.
print(block)

TransformerBlock(
  (att): MultiHeadAttention(
    (W_query): Linear(in_features=768, out_features=768, bias=False)
    (W_key): Linear(in_features=768, out_features=768, bias=False)
    (W_value): Linear(in_features=768, out_features=768, bias=False)
    (out_proj): Linear(in_features=768, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (ff): FeedForward(
    (layers): Sequential(
      (0): Linear(in_features=768, out_features=3072, bias=True)
      (1): GELU()
      (2): Linear(in_features=3072, out_features=768, bias=True)
    )
  )
  (norm1): LayerNorm()
  (norm2): LayerNorm()
  (drop_shortcut): Dropout(p=0.1, inplace=False)
)


## Exercise 4.2

We initialized a 124-million-parameter GPT model, which is known as “GPT-2 small.” Without making any code modifications besides updating the configuration file, use the GPTModel class to implement GPT-2 medium (using 1,024-dimensional embeddings, 24 transformer blocks, 16 multi-head attention heads), GPT-2 large (1,280-dimensional embeddings, 36 transformer blocks, 20 multi-head attention heads), and GPT-2 XL (1,600-dimensional embeddings, 48 transformer blocks, 25 multi-head attention heads). As a bonus, calculate the total number of parameters in each GPT model.

In [8]:


# Configuration dictionaries for each GPT-2 variant
# (params according to @exercise-solutions.ipynb).
#
# The four variants differ only in embedding width, depth and head count, so
# they are produced by one builder instead of four hand-copied dictionaries
# that are patched with "context_length" afterwards.


def _make_gpt2_config(emb_dim, n_layers, n_heads):
    """Return a fresh configuration dict for one GPT-2 variant.

    Args:
        emb_dim: Embedding dimension of the variant.
        n_layers: Number of transformer blocks.
        n_heads: Number of attention heads per block.

    Returns:
        A new dict (not shared between calls) with the variant-specific values
        plus the settings that are identical for every variant.
    """
    return {
        "vocab_size": 50257,
        # NOTE(review): "block_size", "ffn_hidden_mult" and "bias" are kept so
        # the dictionaries stay identical to the previous ones; nothing visible
        # in this notebook reads them -- confirm against gpt.py before removing.
        "block_size": 1024,
        "n_layers": n_layers,
        "n_heads": n_heads,
        "emb_dim": emb_dim,
        "ffn_hidden_mult": 4,
        "drop_rate": 0.1,
        "bias": True,
        "qkv_bias": False,
        # Previously added after the fact; set up front so each dict is
        # complete as soon as it is created.
        "context_length": 1024,
    }


GPT2_SMALL_CONFIG = _make_gpt2_config(emb_dim=768, n_layers=12, n_heads=12)
GPT2_MEDIUM_CONFIG = _make_gpt2_config(emb_dim=1024, n_layers=24, n_heads=16)
GPT2_LARGE_CONFIG = _make_gpt2_config(emb_dim=1280, n_layers=36, n_heads=20)
GPT2_XL_CONFIG = _make_gpt2_config(emb_dim=1600, n_layers=48, n_heads=25)


In [9]:
def model_stats(model, bytes_per_param=4):
    """Print the parameter count and estimated storage size of ``model``.

    Args:
        model: Object whose ``parameters()`` yields items exposing ``numel()``
            (for example a ``torch.nn.Module``).
        bytes_per_param: Storage assumed for each parameter, in bytes. The
            default of 4 corresponds to float32; pass 2 for a half-precision
            estimate.

    Returns:
        None. Three lines are written to stdout: the parameter count, the size
        in bytes and the size in megabytes.
    """
    total_params = sum(p.numel() for p in model.parameters())
    # Size estimate: every parameter is assumed to take ``bytes_per_param`` bytes.
    total_size_bytes = total_params * bytes_per_param

    # Convert to megabytes (1 MB = 1024 * 1024 bytes here).
    total_size_mb = total_size_bytes / (1024 * 1024)

    print(f"Total number of parameters: {total_params:,}")
    print(f"Total size in bytes: {total_size_bytes:,}")
    print(f"Total size in megabytes: {total_size_mb:.2f} MB")

In [10]:
# Display name -> configuration, in the order the variants are reported.
gpt_models = dict([
    ("GPT-2 small", GPT2_SMALL_CONFIG),
    ("GPT-2 medium", GPT2_MEDIUM_CONFIG),
    ("GPT-2 large", GPT2_LARGE_CONFIG),
    ("GPT-2 XL", GPT2_XL_CONFIG),
])

# Instantiate each variant in turn and print its statistics under a heading.
for model_name in gpt_models:
    config = gpt_models[model_name]
    print(f"\n{model_name}:")
    model = GPTModel(config)
    model_stats(model)


GPT-2 small:


Total number of parameters: 163,009,536
Total size in bytes: 652,038,144
Total size in megabytes: 621.83 MB

GPT-2 medium:
Total number of parameters: 406,212,608
Total size in bytes: 1,624,850,432
Total size in megabytes: 1549.58 MB

GPT-2 large:
Total number of parameters: 838,220,800
Total size in bytes: 3,352,883,200
Total size in megabytes: 3197.56 MB

GPT-2 XL:
Total number of parameters: 1,637,792,000
Total size in bytes: 6,551,168,000
Total size in megabytes: 6247.68 MB


## Exercise 4.3

At the beginning of this chapter, we defined a global drop_rate setting in the GPT_CONFIG_124M dictionary to set the dropout rate in various places throughout the GPTModel architecture. Change the code to specify a separate dropout value for the various dropout layers throughout the model architecture. (Hint: there are three distinct places where we used dropout layers: the embedding layer, shortcut layer, and multi-head attention module.)

In [6]:
 GPT_CONFIG_MOD = dict(
     vocab_size=50257,          # vocabulary size
     context_length=1024,       # context length
     emb_dim=768,               # embedding dimension
     n_heads=12,                # number of attention heads
     n_layers=12,               # number of layers
     drop_rate_emb=0.1,         # dropout rate for the embedding layer
     drop_rate_attn=0.2,        # dropout rate for the multi-head attention module
     drop_rate_shortcut=0.25,   # dropout rate for the shortcut layers
     qkv_bias=False,            # query-key-value bias
 )

In [10]:
class TransformerBlock_mod(nn.Module):
    """Transformer block whose dropout rates are configured per location.

    The attention module takes ``cfg["drop_rate_attn"]`` and the dropout
    applied before each shortcut addition takes ``cfg["drop_rate_shortcut"]``.
    """

    def __init__(self, cfg):
        """Create the attention, feed-forward, norm and dropout submodules from ``cfg``."""
        super().__init__()
        emb_dim = cfg["emb_dim"]
        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate_attn"],  # attention-specific dropout rate
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        # One dropout module, reused on both shortcut paths in forward().
        self.drop_shortcut = nn.Dropout(cfg["drop_rate_shortcut"])

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers, each with a shortcut."""
        # Attention sub-layer: norm -> attention -> dropout, then add the input back.
        residual = x
        x = self.drop_shortcut(self.att(self.norm1(x)))  # [batch_size, num_tokens, emb_size]
        x = x + residual

        # Feed-forward sub-layer: norm -> feed forward -> dropout, then add its input back.
        residual = x
        x = self.drop_shortcut(self.ff(self.norm2(x)))
        return x + residual


class GPTModel_mod(nn.Module):
    """GPT model variant with a dedicated dropout rate for the embedding layer.

    The embedding dropout takes ``cfg["drop_rate_emb"]``; the transformer
    blocks are ``TransformerBlock_mod`` instances built from the same ``cfg``.
    """

    def __init__(self, cfg):
        """Create embeddings, ``cfg["n_layers"]`` transformer blocks, final norm and output head."""
        super().__init__()
        emb_dim = cfg["emb_dim"]
        self.tok_emb = nn.Embedding(cfg["vocab_size"], emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate_emb"])  # embedding-specific dropout rate

        layers = [TransformerBlock_mod(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*layers)

        self.final_norm = LayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        """Map token indices of shape (batch, seq_len) to output-head logits."""
        # Two-way unpack: only the sequence length is needed, but in_idx must be 2-D.
        _, seq_len = in_idx.shape
        positions = torch.arange(seq_len, device=in_idx.device)
        # Token plus position embeddings: [batch_size, num_tokens, emb_size]
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)

In [11]:
# Instantiate the modified model with the configuration that carries separate
# dropout rates for the embedding, attention and shortcut layers.
model_drop = GPTModel_mod(GPT_CONFIG_MOD)