In [1]:
import transformers
import torch
from transformers import AutoModelForCausalLM

from IPython.display import Image
import os
# Send both HTTP and HTTPS traffic through the proxy at 127.0.0.1:7890.
os.environ.update({
    'http_proxy': 'http://127.0.0.1:7890',
    'https_proxy': 'http://127.0.0.1:7890',
})

## llama meta vs. hf


- llama2-7b
    - https://huggingface.co/meta-llama/Llama-2-7b/tree/main
        - `checkpoint = torch.load(ckpt_path, map_location="cpu")`
        - `model = Transformer(model_args)`
        - `model.load_state_dict(checkpoint, strict=False)`
        - 自动地走 `bfloat16`（如果支持的话）
    - https://huggingface.co/meta-llama/Llama-2-7b-hf/tree/main
        - `pytorch_model.bin.index.json`
- https://github.com/meta-llama/llama3/blob/main/MODEL_CARD.md

## llama2-7b vs. llama3-8b (hf)

In [2]:
# Hugging Face Hub repository ids of the two checkpoints compared in this notebook.
llama2_id = "meta-llama/Llama-2-7b-hf"    # Llama 2, 7B
llama3_id = "meta-llama/Meta-Llama-3-8B"  # Llama 3, 8B

In [3]:
# bfloat16 stores each parameter in 2 bytes, so ~7B parameters need roughly 14 GB.
llama2 = AutoModelForCausalLM.from_pretrained(
    llama2_id,
    device_map='auto',
    torch_dtype=torch.bfloat16,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


In [5]:
# Optional debug aid (disabled): uncomment to print the device on which each
# llama2 parameter resides.
# for name, para in llama2.named_parameters():
#     print(name, para.device)

In [6]:
# At 2 bytes per bfloat16 parameter, ~8B parameters need roughly 16 GB.
llama3 = AutoModelForCausalLM.from_pretrained(
    llama3_id,
    device_map='auto',
    torch_dtype=torch.bfloat16,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


### `device_map='auto'`

- 自动地模型并行；

```
if device_map != "sequential":
    # Compute a `max_memory` dictionary for [`infer_auto_device_map`] that will balance the use of each available GPU.
    max_memory = get_balanced_memory(
        model,
        dtype=target_dtype,
        low_zero=(device_map == "balanced_low_0"),
        max_memory=max_memory,
        **device_map_kwargs,
    )
```

### learnable parameters

In [10]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Iterates over ``model.named_parameters()``, sums ``numel()`` over all
    parameters and over those whose ``requires_grad`` is true, then prints
    both counts together with the trainable percentage.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, e.g. a ``torch.nn.Module``.

    Returns:
        None. The summary line is written to stdout.
    """
    trainable_params = 0
    all_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_params += count
        if param.requires_grad:
            trainable_params += count
    # A model without any parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_params if all_params else 0.0
    print(f"trainable params: {trainable_params} || all params: {all_params} || trainable%: {trainable_pct}")

In [11]:
# Report trainable vs. total parameter counts for the llama2 model.
print_trainable_parameters(llama2)

trainable params: 6738415616 || all params: 6738415616 || trainable%: 100.0


In [12]:
# Report trainable vs. total parameter counts for the llama3 model.
print_trainable_parameters(llama3)

trainable params: 8030261248 || all params: 8030261248 || trainable%: 100.0


In [13]:
# Bare expression: the notebook displays the model's repr (its module tree).
llama2

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head):

In [14]:
# Bare expression: the notebook displays the model's repr (its module tree).
llama3

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head)

In [27]:
# Element count of each model's token-embedding weight, with thousands separators.
tuple(
    f'{model.model.embed_tokens.weight.numel():,}'
    for model in (llama2, llama3)
)

('131,072,000', '525,336,576')

In [29]:
# MLP expansion ratio (intermediate size / hidden size): llama2 vs. llama3.
11008/4096, 14336/4096

(2.6875, 3.5)

- attention mechanism （Transformer 最特色的）
    - $X\in\mathbb R^{\ell\times d}$
    - $W_k\in\mathbb R^{d\times d_k},W_q\in\mathbb R^{d\times d_k},W_v\in\mathbb R^{d\times d_v}$
    - $Q=XW_q\in\mathbb R^{\ell\times d_k}, K=XW_k\in\mathbb R^{\ell\times d_k}, V=XW_v\in\mathbb R^{\ell\times d_v}$

$$
\text{Attention}(Q,K,V)=\text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V
$$

$$
A_{ij}=\frac{\exp(\frac{q^T_ik_j}{\sqrt{d_k}})}{\sum_{j'}\exp(\frac{q^T_ik_{j'}}{\sqrt{d_k}})}
$$


In [6]:
# Figure on quality vs. computation efficiency.
Image(
    url='https://pbs.twimg.com/media/FzjhZk5X0AYAs_-?format=jpg&name=4096x4096',
    width=500,
)

In [30]:
# head_dim = hidden_size / num_heads (4096 / 32 for both models).
(4096/32)

128.0

```
# self attn
self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
# llama2 (MHA, num_key_value_heads=32): 32*(4096/32) = 4096
# llama3 (GQA, num_key_value_heads=8): 8*(4096/32) = 1024
self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias)

# GQA
# llama2: 32/32
# llama3: 32/8 = 4, 4对1
self.num_key_value_groups = self.num_heads // self.num_key_value_heads

# mlp
self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)

# mlp forward
# hf
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
# meta
self.w2(F.silu(self.w1(x)) * self.w3(x))
```

|     **model**    | **heads** | **layers** | **dim** | **head_dim** |
|------------------|-----------|------------|---------|--------------|
| llama2-7b        | 32        | 32         | 4096    | 4096/32      |
| llama2-13b       | 40        | 40         | 5120    | 5120/40      |
| llama2-70b       | 64        | 80         | 8192    | 8192/64      |

|     **model**    | **heads** | **layers** | **dim** | **head_dim** |
|------------------|-----------|------------|---------|--------------|
| llama3-8b        | 32        | 32         | 4096    | 4096/32      |

-  vocab_size (Embedding):
    - llama2: 32000
    - llama3: 128256
- GQA (k_proj, v_proj)
    - head_dim: hidden_size/num_heads
        - llama2: 4096/32 = 128
        - llama3: 4096/32 = 128
    - llama2: **32** × (4096/32) = 4096
    - llama3: 8 × 128 = 1024（可以看到 k_proj、v_proj 的 learnable parameters 降为 llama2 的 1/4）