In [1]:
from IPython.display import Image
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [48]:
import os
# Route both HTTP and HTTPS traffic of this kernel through the proxy at 127.0.0.1:7890.
# NOTE(review): the address is machine-specific — adjust or drop it for your own network.
os.environ.update({
    'http_proxy': 'http://127.0.0.1:7890',
    'https_proxy': 'http://127.0.0.1:7890',
})

- metrics
    - requests per second
    - latency
- techniques
    - kv cache：每一个 transformer layer；
        - efficient management of KV cache is crucial for high-throughput LLM serving
- references
    - https://www.youtube.com/watch?v=5ZlavKF_98U

## KV cache

In [2]:
Image(url='../imgs/para_kvcache.png', width=500)

- 13B * 2 = 26GB
    - B: billion
    - 2: fp16
- kv cache

    $$
    2 \times \text{precision} \times n_{\text{layers}} \times d_{\text{model}} \times \text{seqlen} \times \text{batch}
    $$
  - OPT-30B: $2*2*48*7168*1024*128/(1024^3)=168$ (GB)
      - 7168 = 1024*7
  - batch size 对应着可以同时处理的请求的数量，`# requests`

In [12]:
model_name = 'gpt2-xl'  # replace with the name of your model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

inputs = tokenizer("Hello, how are you?", return_tensors="pt")
# Inference only: run the forward pass under no_grad so that no autograd graph is
# recorded for the returned tensors (logits, hidden states, attentions, cached K/V).
# use_cache is requested explicitly because the following cell reads
# outputs.past_key_values.
with torch.no_grad():
    outputs = model(
        **inputs,
        output_hidden_states=True,
        output_attentions=True,
        use_cache=True,
    )

In [13]:
# Measure the KV cache returned by the forward pass: walk every per-layer entry,
# and for each tensor inside it add (number of elements) * (bytes per element).
kv_cache = outputs.past_key_values
total_kv_cache_size = sum(
    cached_tensor.nelement() * cached_tensor.element_size()
    for layer_entry in kv_cache
    for cached_tensor in layer_entry
)

# Report the total in MiB (bytes / 1024**2).
f"KV Cache size: {total_kv_cache_size / (1024 ** 2)} MB"

'KV Cache size: 3.515625 MB'

In [38]:
# Total number of parameters of the loaded model (about 1.5B)
sum(param.numel() for param in model.parameters())

1557611200

In [43]:
# model.config

In [24]:
inputs

{'input_ids': tensor([[15496,    11,   703,   389,   345,    30]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [34]:
# KV-cache size formula (2 * precision * n_layers * d_model * seqlen * batch),
# evaluated for this run and converted from bytes to MiB:
#   2    -> one key tensor and one value tensor
#   4    -> bytes per element (presumably fp32 weights — it matches the measured size)
#   48   -> n_layers
#   1600 -> d_model (25 heads * 64 dims per head, see the tensor shape below)
#   6    -> tokens in the prompt
#   1    -> batch size
2 * 4 * 48 * 1600 * 6 * 1 / (1024 * 1024)

3.515625

In [35]:
# Shape of the first cached tensor of the first layer:
# (bs, n_head, seq_len, head_dim)
kv_cache[0][0].shape

torch.Size([1, 25, 6, 64])

### Memory waste in KV cache

In [53]:
# Context window of 2048 tokens
Image(url='../imgs/kv_cache_internal_frag.png', width=500)

## vLLM: Efficient memory management for LLM inference

- inspired by virtual memory & paging
    - https://arxiv.org/pdf/2309.06180
- Illustration of the PagedAttention algorithm, where the attention key and value vectors are stored as **non-contiguous blocks** in the memory.
- details
    - logical vs. physical
        - logical: adjacent，逻辑或者叫虚拟，非物理的；
        - physical：not adjacent
            - allocated on demand
        - block table
    - serving multiple requests
- 何以体现 Efficient memory
    - minimal internal fragmentation
        - 内部内存碎片只存在于 last block of seq (block allocated on demand, block 按需分配)
        - num wasted tokens / seq < block size
            - sequences: O(100)-O(10000) tokens
            - block size: 16 or 32 tokens

In [54]:
Image(url='../imgs/paging.png', width=400)

In [57]:
Image(url='../imgs/page_attention.gif', width=500)

In [59]:
Image(url='../imgs/page_attention_gen.gif', width=500)

In [60]:
Image(url='../imgs/serving_multiple_req.png', width=400)

### parallel sampling

- dynamic block mapping enables sharing

In [61]:
Image(url='../imgs/multi_outputs.gif', width=500)

In [62]:
# NOTE(review): this is the same GIF (page_attention_gen.gif) that is already displayed
# in the vLLM section before "parallel sampling" — confirm whether a different,
# parallel-sampling-specific figure was intended here.
Image(url='../imgs/page_attention_gen.gif', width=500)