In [13]:
from IPython.display import Image
import numpy as np
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import os
os.environ['http_proxy'] = 'http://127.0.0.1:7890'
os.environ['https_proxy'] = 'http://127.0.0.1:7890'
np.set_printoptions(precision=3)

- attention mechanism （Transformer 最特色的）
    - $X\in\mathbb R^{\ell\times d}$
    - $W_k\in\mathbb R^{d\times d_k},W_q\in\mathbb R^{d\times d_k},W_v\in\mathbb R^{d\times d_v}$
    - $Q=XW_q\in\mathbb R^{\ell\times d_k}, K=XW_k\in\mathbb R^{\ell\times d_k}, V=XW_v\in\mathbb R^{\ell\times d_v}$

$$
\left(\text{Attention}(Q,K,V)=\text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V\right)\in \mathbb R^{\ell\times d_v}
$$


In [3]:
Image(url='https://miro.medium.com/v2/resize:fit:828/format:webp/1*uyuyOW1VBqmF5Gtv225XHQ.gif', width=500)

- KV cache 会显著地提升 inference/generate 的性能，降低时延；
- generate 的 seq 越长，占用的显存增长得也会更多；
    - gpt 8K vs. 32k, input/output prices 是翻倍的关系
- KV-cache Memory Usage

    $$
    2 \times \text{precision} \times n_{\text{layers}} \times d_{\text{model}} \times \text{seqlen} \times \text{batch}
    $$
    
    - 2 = two matrices for K and V
    - precision = bytes per parameter (e.g., 4 for fp32)
    - $n_{\text{layers}}$ = layers in the model
    - $d_{\text{model}}$ = dimension of embeddings
    - seqlen = length of context in tokens
    - batch = batch size
    - OPT-30B: $2*2*48*128*1024*7168$
        - precision：2（fp16 inference）
        - 48 layers，128 batch
        - K/V shape: seqlen 1024, d_model 7168 (7*1024)
            - https://github.com/meta-llama/llama3/blob/main/llama/model.py#L129-L144

In [11]:
# KV-cache: 168GB
# Model: 2*30B=60GB
2*2*48*128*1024*7168/(1024*1024*1024)

168.0

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")
model = AutoModelForCausalLM.from_pretrained("gpt2-xl").to(device)

for use_cache in (True, False):
    times = []
    for _ in range(10):  # measuring 10 generations
        start = time.time()
        model.generate(**tokenizer("What is KV caching?", return_tensors="pt").to(device), 
                       use_cache=use_cache, max_new_tokens=1000)
        times.append(time.time() - start)
    print(f"{'with' if use_cache else 'without'} KV caching: {round(np.mean(times), 3)} +- {round(np.std(times), 3)} seconds")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


with KV caching: 22.736 +- 0.364 seconds


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


without KV caching: 65.567 +- 0.079 seconds


## encoder-decoder vs. decoder only 

In [2]:
Image(url='../imgs/multi-turn-bi-uni.png', width=500)

- Bidirectional vs. Unidirectional
    - BERT：**Bidirectional** Encoder Representations from Transformers），双向注意力
    - GPT：Unidirectional，单向注意力；
- 以多轮对话为例，从计算复杂度的角度探索为什么 decoder-only 更优
- 定义
    - $L$: past sequence length
    - $\ell$: 新的输入的长度
    - $d$：embedding dimension
- decoder only
    - KVcache: $K_{past}, V_{past}$
    - 每次新输入时，计算键值（$K_{new}, V_{new}$），时间复杂度为 $O(\ell\cdot d)$，也需要计算 Query $Q_{new}$
    - 计算注意力，
        - $Q=Q_{new}\in \mathbb R^{\ell \cdot d}$
        - $K=[K_{past}, K_{new}]\in \mathbb R^{(L+\ell)\cdot d}$
        - $V=[V_{past}, V_{new}]\in \mathbb R^{(L+\ell)\cdot d}$
        - $A=QK^T\in \mathbb R^{\ell\cdot(\ell+L)}$
            - $q_i$ 要跟 $L+i$ 的 K 计算 score vector；
        - $\text{softmax}(A)\cdot V\in \mathbb R^{\ell\cdot d}$
- 对于 encoder-decoder
    - At every turn, the new input has to be **encoded again**; for unidirectional attention, only the newly added message needs to be encoded.

## demo tests

In [14]:
L, l, d = 5, 2, 3
K_past = np.random.randn(L, 3)
V_past = np.random.randn(L, 3)
Q_past = np.random.randn(L, 3)

Q_new = np.random.randn(l, 3)
K_new = np.random.randn(l, 3)
V_new = np.random.randn(l, 3)


In [15]:
def create_custom_matrix(n):
    # 创建一个全为负无穷的矩阵
    matrix = np.full((n, n), -np.inf)
    
    # 将下三角部分（包括对角线）设置为0
    lower_triangle_indices = np.tril_indices(n)
    matrix[lower_triangle_indices] = 0
    
    return matrix

In [16]:
M1 = create_custom_matrix(5)
M1

array([[  0., -inf, -inf, -inf, -inf],
       [  0.,   0., -inf, -inf, -inf],
       [  0.,   0.,   0., -inf, -inf],
       [  0.,   0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.,   0.]])

In [17]:
import scipy as sp
sp.special.softmax((Q_past.dot(K_past.T))/np.sqrt(3) + M1, axis=1)

array([[1.   , 0.   , 0.   , 0.   , 0.   ],
       [0.622, 0.378, 0.   , 0.   , 0.   ],
       [0.592, 0.352, 0.056, 0.   , 0.   ],
       [0.629, 0.271, 0.022, 0.079, 0.   ],
       [0.532, 0.147, 0.039, 0.079, 0.203]])

In [18]:
M2 = create_custom_matrix(7)
M2

array([[  0., -inf, -inf, -inf, -inf, -inf, -inf],
       [  0.,   0., -inf, -inf, -inf, -inf, -inf],
       [  0.,   0.,   0., -inf, -inf, -inf, -inf],
       [  0.,   0.,   0.,   0., -inf, -inf, -inf],
       [  0.,   0.,   0.,   0.,   0., -inf, -inf],
       [  0.,   0.,   0.,   0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.]])

In [19]:
Q = np.concatenate([Q_past, Q_new], axis=0)
K = np.concatenate([K_past, K_new], axis=0)
sp.special.softmax((Q.dot(K.T))/np.sqrt(3) + M2, axis=1)

array([[1.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ],
       [0.622, 0.378, 0.   , 0.   , 0.   , 0.   , 0.   ],
       [0.592, 0.352, 0.056, 0.   , 0.   , 0.   , 0.   ],
       [0.629, 0.271, 0.022, 0.079, 0.   , 0.   , 0.   ],
       [0.532, 0.147, 0.039, 0.079, 0.203, 0.   , 0.   ],
       [0.245, 0.136, 0.156, 0.119, 0.122, 0.222, 0.   ],
       [0.162, 0.211, 0.233, 0.112, 0.13 , 0.079, 0.072]])

In [20]:
import scipy as sp
sp.special.softmax((Q_past.dot(K_past.T))/np.sqrt(3), axis=1)

array([[0.353, 0.169, 0.114, 0.157, 0.206],
       [0.218, 0.132, 0.537, 0.067, 0.046],
       [0.287, 0.171, 0.027, 0.141, 0.374],
       [0.443, 0.191, 0.015, 0.055, 0.296],
       [0.532, 0.147, 0.039, 0.079, 0.203]])

In [21]:
sp.special.softmax((Q.dot(K.T))/np.sqrt(3), axis=1)

array([[0.227, 0.109, 0.074, 0.101, 0.132, 0.148, 0.21 ],
       [0.159, 0.097, 0.393, 0.049, 0.033, 0.188, 0.081],
       [0.229, 0.137, 0.022, 0.113, 0.3  , 0.044, 0.156],
       [0.406, 0.175, 0.014, 0.051, 0.272, 0.012, 0.07 ],
       [0.404, 0.112, 0.029, 0.06 , 0.154, 0.063, 0.178],
       [0.201, 0.112, 0.128, 0.097, 0.1  , 0.182, 0.18 ],
       [0.162, 0.211, 0.233, 0.112, 0.13 , 0.079, 0.072]])