<a href="https://colab.research.google.com/github/klei22/nanoGPT/blob/master/llm_size_calculations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
def transformer_count_params(d_model=512, d_ff=2048, encoder=True, approx=False):
    """
    Count the learnable parameters of a single Transformer Encoder or
    Decoder block.

    Per-component counts:
        multi-head attention:
            exact:  4*(d_model^2 + d_model)
            approx: 4*d_model^2
        feed-forward:
            exact:  2*d_model*d_ff + d_model + d_ff
            approx: 2*d_model*d_ff
        layer normalization:
            exact:  2*d_model
            approx: 0

    Block composition:
        Encoder: 1 multi-head attention, 1 feed-forward net, 2 layer norms.
        Decoder: 2 multi-head attentions, 1 feed-forward net, 3 layer norms.

    :param d_model: (int) model dimensionality
    :param d_ff: (int) internal dimensionality of the feed-forward network
    :param encoder: (bool) if True count an Encoder block, otherwise a
        Decoder block
    :param approx: (bool) if True use the approximate formulas above
    :return: (int) number of learnable parameters in the block
    """
    if approx:
        # Approximate mode keeps only the weight matrices.
        attn_count = 4 * d_model ** 2
        ffn_count = 2 * d_model * d_ff
        norm_count = 0
    else:
        attn_count = 4 * (d_model ** 2 + d_model)
        ffn_count = 2 * d_model * d_ff + d_model + d_ff
        norm_count = 2 * d_model

    # How many attention sub-layers and layer norms the chosen block contains.
    if encoder:
        n_attn, n_norm = 1, 2
    else:
        n_attn, n_norm = 2, 3

    return n_attn * attn_count + ffn_count + n_norm * norm_count

In [None]:
# Exact (approx=False) parameter count of one Decoder block with d_model=384, d_ff=2048.
transformer_count_params(d_model=384, d_ff=2048, encoder=False)

2760320

from dataclasses import dataclass

@dataclass
class GPTConfig:
    """Configuration record for a GPT-style model.

    The declaration order below is also the positional-argument order of the
    dataclass-generated ``__init__``, so fields must not be reordered.
    """

    # --- Size hyperparameters ---
    block_size: int = 256
    vocab_size: int = 50304  # Adjust as needed
    n_layer: int = 6
    n_head: int = 6
    n_embd: int = 384
    dropout: float = 0.1

    # --- Softmax variant selectors ---
    softmax_variant_attn: str = "softmax"
    softmax_variant_output: str = "softmax"

    # --- Positional encoding switches ---
    use_abs_pos_embeddings: bool = False
    use_rotary_embeddings: bool = True
    rope_variant: str = "rope"
    shortrope_length: int = 8

    # --- Layer normalization switches ---
    use_post_ln: bool = True
    use_pre_ln: bool = False
    layernorm_variant: str = "rmsnorm"

    # --- Miscellaneous ---
    bias: bool = False
    activation_variant: str = "gelu"

def calculate_parameters(config: "GPTConfig") -> int:
    """
    Estimate the number of learnable parameters for a GPT configuration.

    The total is the sum of:
        token embedding: vocab_size * n_embd
        positional embedding: block_size * n_embd, only when
            use_abs_pos_embeddings is true
        n_layer identical transformer blocks, each made of two layer norms,
            one self-attention sub-layer and one MLP with inner width 4*n_embd

    :param config: configuration object; only ``vocab_size``, ``n_embd``,
        ``block_size``, ``n_layer``, ``use_abs_pos_embeddings`` and ``bias``
        are read. The annotation is quoted so this function can be defined
        before the cell that declares ``GPTConfig`` has run.
    :return: (int) estimated parameter count
    """
    n_embd = config.n_embd
    hidden = 4 * n_embd  # inner width of the MLP

    token_embedding_params = config.vocab_size * n_embd
    position_embedding_params = (
        config.block_size * n_embd if config.use_abs_pos_embeddings else 0
    )

    # Two layer norms per block, each counted with a scale and a bias vector.
    # NOTE(review): this is counted regardless of config.bias and
    # config.layernorm_variant -- confirm against the actual model.
    ln_params = 2 * (2 * n_embd)

    # Q, K and V projections plus the output projection.
    attn_params = n_embd * 3 * n_embd + n_embd * n_embd
    # FC (n_embd -> hidden) and proj (hidden -> n_embd) weight matrices.
    mlp_params = n_embd * hidden + hidden * n_embd

    if config.bias:
        # One bias entry per output feature of each linear layer.
        attn_params += 3 * n_embd + n_embd
        # FC outputs `hidden` features, proj outputs only `n_embd`, so the
        # MLP contributes 5*n_embd bias terms (not 8*n_embd).
        mlp_params += hidden + n_embd

    per_block_params = ln_params + attn_params + mlp_params

    # All blocks are the same size, so multiply instead of looping. The clamp
    # makes a non-positive n_layer contribute nothing, as an empty loop would.
    transformer_params = max(config.n_layer, 0) * per_block_params

    return token_embedding_params + position_embedding_params + transformer_params

# Example usage: 6 layers with embedding width 384 (the GPTConfig defaults),
# spelled out explicitly so the configuration is visible at the call site.
config = GPTConfig(n_layer=6, n_embd=384)  # Example configuration
total_params = calculate_parameters(config)
print(f"Total Parameters: {total_params}")


from dataclasses import dataclass

@dataclass
class GPTConfig:
    """Configuration record for a GPT-style model.

    The declaration order below is also the positional-argument order of the
    dataclass-generated ``__init__``, so fields must not be reordered.
    """

    # --- Size hyperparameters ---
    block_size: int = 1024
    vocab_size: int = 50304  # Vocabulary size
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768
    dropout: float = 0.1

    # --- Softmax variant selectors ---
    softmax_variant_attn: str = "softmax"
    softmax_variant_output: str = "softmax"

    # --- Positional encoding switches ---
    use_abs_pos_embeddings: bool = False
    use_rotary_embeddings: bool = True
    rope_variant: str = "rope"
    shortrope_length: int = 8

    # --- Layer normalization switches ---
    use_post_ln: bool = True
    use_pre_ln: bool = False
    layernorm_variant: str = "rmsnorm"

    # --- Miscellaneous ---
    bias: bool = False
    activation_variant: str = "gelu"

def calculate_parameters(config: "GPTConfig") -> int:
    """
    Estimate the number of learnable parameters for a GPT configuration.

    The total is the sum of:
        token embedding: vocab_size * n_embd
        positional embedding: block_size * n_embd, only when
            use_abs_pos_embeddings is true
        n_layer identical transformer blocks, each made of two layer norms,
            one self-attention sub-layer and one MLP with inner width 4*n_embd

    :param config: configuration object; only ``vocab_size``, ``n_embd``,
        ``block_size``, ``n_layer``, ``use_abs_pos_embeddings`` and ``bias``
        are read. The annotation is quoted so this function can be defined
        before the cell that declares ``GPTConfig`` has run.
    :return: (int) estimated parameter count
    """
    n_embd = config.n_embd
    hidden = 4 * n_embd  # inner width of the MLP

    # Token embedding parameters (one n_embd-sized row per vocabulary entry)
    token_embedding_params = config.vocab_size * n_embd

    # Positional embedding parameters (only for absolute position embeddings)
    position_embedding_params = (
        config.block_size * n_embd if config.use_abs_pos_embeddings else 0
    )

    # Two layer norms per block, each counted with a scale and a bias vector.
    # NOTE(review): this is counted regardless of config.bias and
    # config.layernorm_variant -- confirm against the actual model.
    ln_params = 2 * (2 * n_embd)

    # Q, K and V projections plus the output projection.
    attn_params = n_embd * 3 * n_embd + n_embd * n_embd
    # FC (n_embd -> hidden) and proj (hidden -> n_embd) weight matrices.
    mlp_params = n_embd * hidden + hidden * n_embd

    if config.bias:
        # One bias entry per output feature of each linear layer.
        attn_params += 3 * n_embd + n_embd
        # FC outputs `hidden` features, proj outputs only `n_embd`, so the
        # MLP contributes 5*n_embd bias terms (not 8*n_embd).
        mlp_params += hidden + n_embd

    per_block_params = ln_params + attn_params + mlp_params

    # All blocks are the same size, so multiply instead of looping. The clamp
    # makes a non-positive n_layer contribute nothing, as an empty loop would.
    transformer_params = max(config.n_layer, 0) * per_block_params

    # Total parameters: token embedding + positional embedding + all blocks
    return token_embedding_params + position_embedding_params + transformer_params

# Example usage: 12 layers, embedding width 768 and a 50304-entry vocabulary
# (the GPTConfig defaults), spelled out explicitly at the call site.
config = GPTConfig(n_layer=12, n_embd=768, vocab_size=50304)  # Example configuration with specified vocabulary size
total_params = calculate_parameters(config)
print(f"Total Parameters: {total_params}")


NameError: name 'n_layer_values' is not defined