In [2]:
import transformers
import torch
from transformers import AutoModelForCausalLM

from IPython.display import Image
import os
# Point both the HTTP and HTTPS proxy variables at the local proxy on port 7890.
# NOTE(review): presumably needed so the model downloads below can reach the hub — confirm.
os.environ.update(
    dict.fromkeys(('http_proxy', 'https_proxy'), 'http://127.0.0.1:7890')
)

## huggingface 

- attention mechanism （Transformer 最特色的）
    - $X\in\mathbb R^{\ell\times d}$
    - $W_k\in\mathbb R^{d\times d_k},W_q\in\mathbb R^{d\times d_k},W_v\in\mathbb R^{d\times d_v}$
    - $Q=XW_q\in\mathbb R^{\ell\times d_k}, K=XW_k\in\mathbb R^{\ell\times d_k}, V=XW_v\in\mathbb R^{\ell\times d_v}$

$$
\text{Attention}(Q,K,V)=\text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V
$$

$$
A_{ij}=\frac{\exp(\frac{q^T_ik_j}{\sqrt{d_k}})}{\sum_{j'}\exp(\frac{q^T_ik_{j'}}{\sqrt{d_k}})}
$$


In [6]:
# Remote figure on the trade-off between quality and computation efficiency.
# Kept as the last expression so the notebook renders it as the cell output.
Image(
    url='https://pbs.twimg.com/media/FzjhZk5X0AYAs_-?format=jpg&name=4096x4096',
    width=500,
)

```
# self attn
# q_proj output width = num_heads * head_dim (4096 for both models printed in this notebook)
self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
# k_proj / v_proj output width = num_key_value_heads * head_dim
# llama2-7b: 32*128 = 4096 (num_key_value_heads == num_heads, so no grouping)
# llama3-8b: 8*(4096/32) = 8*128 = 1024 (GQA)
self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias)

# GQA: number of query heads that share one key/value head
# llama2: 32/32 = 1
# llama3: 32/8 = 4, i.e. 4 query heads per key/value head
self.num_key_value_groups = self.num_heads // self.num_key_value_heads

# mlp
self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
```

In [7]:
# Model identifiers handed to `from_pretrained` in the loading cells below.
llama2_id = "meta-llama/Llama-2-7b-hf"    # Llama 2, 7B
llama3_id = "meta-llama/Meta-Llama-3-8B"  # Llama 3, 8B

|     **model**    | **heads** | **layers** | **dim** | **head_dim** |
|------------------|-----------|------------|---------|--------------|
| llama2-7b        | 32        | 32         | 4096    | 4096/32      |
| llama2-13b       | 40        | 40         | 5120    | 5120/40      |
| llama2-70b       | 64        | 80         | 8192    | 8192/64      |

|     **model**    | **heads** | **layers** | **dim** | **head_dim** |
|------------------|-----------|------------|---------|--------------|
| llama3-8b        | 32        | 32         | 4096    | 4096/32      |

-  vocab_size (Embedding):
    - llama2: 32000
    - llama3: 128256
- GQA (k_proj, v_proj)
    - head_dim: hidden_size/num_heads
        - llama2: 4096/32 = 128
        - llama3: 4096/32 = 128
    - llama2: **32** × (4096/32) = 4096 (num_key_value_heads × head_dim)
    - llama3: **8** × 128 = 1024（k_proj、v_proj 的 learnable parameters 降为 llama2-7b 的 1/4）

In [7]:
# Load the llama2 checkpoint in bfloat16 (2 bytes per parameter):
# 7B parameters => roughly 14 GB of weights.
llama2 = AutoModelForCausalLM.from_pretrained(
    llama2_id,
    device_map='auto',
    torch_dtype=torch.bfloat16,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


In [9]:
# Bare expression: the notebook prints the module tree of the loaded llama2 model
# (embedding size, per-layer q/k/v/o projections, MLP widths).
llama2

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head):

In [8]:
# Load the llama3 checkpoint in bfloat16, letting `device_map='auto'` decide placement.
llama3 = AutoModelForCausalLM.from_pretrained(
    llama3_id,
    device_map='auto',
    torch_dtype=torch.bfloat16,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


In [10]:
# Bare expression: the notebook prints the module tree of the loaded llama3 model;
# the k_proj / v_proj out_features in that printout show the effect of GQA.
llama3

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head)