In [2]:
from gpt import *

In [3]:
 # Hyperparameters of the 124M-parameter GPT configuration used in this notebook.
 GPT_CONFIG_124M = dict(
     vocab_size=50257,      # number of entries in the vocabulary
     context_length=1024,   # length of the context
     emb_dim=768,           # dimension of the embeddings
     n_heads=12,            # attention heads
     n_layers=12,           # layers
     drop_rate=0.1,         # dropout rate
     qkv_bias=False,        # bias for the query/key/value projections
 )

In [17]:
# Build one transformer block from the 124M configuration; the cells below
# count its parameters. TransformerBlock is presumably provided by the
# `from gpt import *` at the top of the notebook -- verify.
block = TransformerBlock(GPT_CONFIG_124M)

## Exercise 4.1

Calculate and compare the number of parameters that are contained in the feed forward module and those that are contained in the multi-head attention module. 

In [18]:
# Add up the element counts of every parameter tensor the block yields.
total_params = sum(
    param.numel() for param in block.parameters()
)
print(f"Total number of parameters in the transformer block: {total_params:,}")

Total number of parameters in the transformer block: 7,085,568


In [19]:
# Count the parameters of the feed forward sub-module (`block.ff`).
# The printed label says "trainable", so only tensors with requires_grad=True
# are counted; for a module with no frozen parameters this equals the count of
# all parameters.
total_params_ff = sum(
    p.numel() for p in block.ff.parameters() if p.requires_grad
)
print(f"Number of trainable parameters in feed forward block: {total_params_ff:,}")

Number of trainable parameters in feed forward block: 4,722,432


In [24]:
# Count the parameters of the multi-head attention sub-module (`block.att`).
# The printed label says "trainable", so only tensors with requires_grad=True
# are counted; for a module with no frozen parameters this equals the count of
# all parameters.
total_params_att = sum(
    p.numel() for p in block.att.parameters() if p.requires_grad
)
print(f"Number of trainable parameters in attention block: {total_params_att:,}")


Number of trainable parameters in attention block: 2,360,064


In [22]:
# Print the block's module tree so the layer shapes behind the parameter
# counts can be read off directly.
print(block)

TransformerBlock(
  (att): MultiHeadAttention(
    (W_query): Linear(in_features=768, out_features=768, bias=False)
    (W_key): Linear(in_features=768, out_features=768, bias=False)
    (W_value): Linear(in_features=768, out_features=768, bias=False)
    (out_proj): Linear(in_features=768, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (ff): FeedForward(
    (layers): Sequential(
      (0): Linear(in_features=768, out_features=3072, bias=True)
      (1): GELU()
      (2): Linear(in_features=3072, out_features=768, bias=True)
    )
  )
  (norm1): LayerNorm()
  (norm2): LayerNorm()
  (drop_shortcut): Dropout(p=0.1, inplace=False)
)


## Exercise 4.2

We initialized a 124-million-parameter GPT model, which is known as “GPT-2 small.” Without making any code modifications besides updating the configuration file, use the GPTModel class to implement GPT-2 medium (using 1,024-dimensional embeddings, 24 transformer blocks, 16 multi-head attention heads), GPT-2 large (1,280-dimensional embeddings, 36 transformer blocks, 20 multi-head attention heads), and GPT-2 XL (1,600-dimensional embeddings, 48 transformer blocks, 25 multi-head attention heads). As a bonus, calculate the total number of parameters in each GPT model.

In [31]:


# Define configuration dictionaries for each GPT-2 variant
# (parameter values taken from exercise-solutions.ipynb).


def _gpt2_config(emb_dim, n_layers, n_heads, context_length=1024):
    """Return a fresh configuration dictionary for one GPT-2 variant.

    Only the embedding width, the number of transformer blocks and the number
    of attention heads differ between the variants; every other entry is the
    same for all of them and is therefore stated once here.

    Args:
        emb_dim: Embedding dimension.
        n_layers: Number of transformer blocks.
        n_heads: Number of attention heads.
        context_length: Context length, stored under both "block_size" and
            "context_length".

    Returns:
        A new dict on every call, so the variants never share state.
    """
    return {
        "vocab_size": 50257,
        # "block_size" duplicates "context_length". GPT_CONFIG_124M in this
        # notebook only uses "context_length"; whether GPTModel reads
        # "block_size", "ffn_hidden_mult" or "bias" is not visible here, so
        # all three keys are kept exactly as before -- TODO confirm and prune.
        "block_size": context_length,
        "n_layers": n_layers,
        "n_heads": n_heads,
        "emb_dim": emb_dim,
        "ffn_hidden_mult": 4,
        "drop_rate": 0.1,
        "bias": True,
        "qkv_bias": False,
        "context_length": context_length,
    }


GPT2_SMALL_CONFIG = _gpt2_config(emb_dim=768, n_layers=12, n_heads=12)
GPT2_MEDIUM_CONFIG = _gpt2_config(emb_dim=1024, n_layers=24, n_heads=16)
GPT2_LARGE_CONFIG = _gpt2_config(emb_dim=1280, n_layers=36, n_heads=20)
GPT2_XL_CONFIG = _gpt2_config(emb_dim=1600, n_layers=48, n_heads=25)


In [25]:
def model_stats(model):
    """Print the parameter count and parameter memory footprint of ``model``.

    The byte size is derived from each parameter's own element size, so the
    figure is correct for any dtype (float32 parameters contribute 4 bytes per
    element, float16 parameters 2, and so on).

    Args:
        model: Object whose ``parameters()`` method yields tensors that
            provide ``numel()`` and ``element_size()`` (e.g. a
            ``torch.nn.Module``).

    Returns:
        None. Three lines are printed: the parameter count, the size in bytes
        and the size in megabytes (computed with 1 MB = 1024 * 1024 bytes).
    """
    total_params = 0
    total_size_bytes = 0
    # Single pass: parameters() may be a generator, so both totals are
    # accumulated together.
    for param in model.parameters():
        n_elements = param.numel()
        total_params += n_elements
        total_size_bytes += n_elements * param.element_size()

    # Convert to megabytes
    total_size_mb = total_size_bytes / (1024 * 1024)

    print(f"Total number of parameters: {total_params:,}")
    print(f"Total size in bytes: {total_size_bytes:,}")
    print(f"Total size in megabytes: {total_size_mb:.2f} MB")

In [32]:
# Display name -> configuration dictionary for every GPT-2 variant to report.
gpt_models = {
    "GPT-2 small": GPT2_SMALL_CONFIG,
    "GPT-2 medium": GPT2_MEDIUM_CONFIG,
    "GPT-2 large": GPT2_LARGE_CONFIG,
    "GPT-2 XL": GPT2_XL_CONFIG,
}

# Instantiate the variants one after another and print each one's statistics
# under its display name. `model` is rebound on every pass, so the previous
# model object is no longer referenced by this loop once the next is built.
for model_name in gpt_models:
    config = gpt_models[model_name]
    print(f"\n{model_name}:")
    model = GPTModel(config)
    model_stats(model)


GPT-2 small:
Total number of parameters: 163,009,536
Total size in bytes: 652,038,144
Total size in megabytes: 621.83 MB

GPT-2 medium:
Total number of parameters: 406,212,608
Total size in bytes: 1,624,850,432
Total size in megabytes: 1549.58 MB

GPT-2 large:
Total number of parameters: 838,220,800
Total size in bytes: 3,352,883,200
Total size in megabytes: 3197.56 MB

GPT-2 XL:
Total number of parameters: 1,637,792,000
Total size in bytes: 6,551,168,000
Total size in megabytes: 6247.68 MB
