In [35]:
from IPython.display import Image
import torch
import math

## bert => gpt

In [2]:
# Remote illustration for this section, rendered 500 px wide.
Image(
    url='https://miro.medium.com/v2/resize:fit:786/format:webp/1*iJqlhZz-g6ZQJ53-rE9VvA.png',
    width=500,
)

- BERT: 加性 (absolute) position encoding （learnable position encoding）

    ```
    # modeling_bert.py
    embeddings = inputs_embeds + token_type_embeddings + self.position_embeddings(position_ids)
    ```

- GPT: 加性 （absolute）position encoding（learnable position encoding）

    ```
    # modeling_gpt2.py
    if inputs_embeds is None:
        inputs_embeds = self.wte(input_ids)
    position_embeds = self.wpe(position_ids)
    hidden_states = inputs_embeds + position_embeds + token_type_embeds
    ```

### sin position encoding

- 无需训练; 依然是绝对位置编码
- Transformer 原始论文（Attention Is All You Need）

$$
\begin{split}
PE(t,2i)&=\sin(\frac{t}{10000^{\frac{2i}{d_{model}}}})\\
PE(t,2i+1)&=\cos(\frac{t}{10000^{\frac{2i}{d_{model}}}})\\
\Downarrow\\
PE(t,i)&=\sin(\frac{t}{10000^{\frac{i}{d_{model}}}}), \quad \text{i is even}\\
PE(t,i)&=\cos(\frac{t}{10000^{\frac{i-1}{d_{model}}}}), \quad \text{i is odd}\\
\end{split}
$$

d_model = 4

- pos = 0, $[\sin(0),\cos(0), \sin(0),\cos(0)]$
- pos = 1, $[\sin\left(\frac{1}{10000^{0/4}}\right),\cos\left(\frac{1}{10000^{0/4}}\right), \sin\left(\frac{1}{10000^{2/4}}\right), \cos\left(\frac{1}{10000^{2/4}}\right)]$
- pos = 2, $[\sin\left(\frac{2}{10000^{0/4}}\right),\cos\left(\frac{2}{10000^{0/4}}\right), \sin\left(\frac{2}{10000^{2/4}}\right), \cos\left(\frac{2}{10000^{2/4}}\right)]$
- pos = 3, $[\sin\left(\frac{3}{10000^{0/4}}\right),\cos\left(\frac{3}{10000^{0/4}}\right), \sin\left(\frac{3}{10000^{2/4}}\right), \cos\left(\frac{3}{10000^{2/4}}\right)]$




## llama RoPE

- 从绝对位置编码到相对位置编码
    - 绝对位置编码，位置 pos_i 的编码仅取决于 pos_i 的值；
    - 相对位置编码，（一般不需要对每个位置进行单独的编码），而是直接对位置之间的相对距离进行编码
        - pos=0 与 pos=1 的相对位置 $f(|0-1|)$
        - pos=1 与 pos=3 的相对位置 $f(|1-3|)$
        - 偏差构成的矩阵，称为 id 矩阵；
- RoPE
    - 旋转位置编码，为相对位置编码，非加性位置编码，直接嵌入到 attention mechanism 的计算中；


$$
\begin{split}
f(q,m)^Tf(k,n)&=(R_mq)^T(R_nk)\\
&=q^T(R^T_mR_n)k\\
&=q^TR_{n-m}k
\end{split}
$$


```
# freqs_cis 是一个全局的旋转矩阵
xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)
    # xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
    # xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
xq, xk, xv
```

In [33]:
# Notation: d is the dimension, m is the position.
Image(
    url='../../imgs/rope_paper.png',
    width=600,
)

In [4]:
def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
    """Precompute unit-magnitude complex rotation factors e^{j * t * freq}.

    Args:
        dim: Dimension being rotated; one frequency is produced per pair,
            i.e. ``dim // 2`` frequencies.
        end: Number of positions; positions ``0 .. end - 1`` are covered.
        theta: Base of the geometric frequency schedule.

    Returns:
        Complex tensor of shape ``(end, dim // 2)`` whose entry ``[t, i]`` has
        magnitude 1 and angle ``t / theta ** (2 * i / dim)``.
    """
    half = dim // 2
    # Exponents 0/dim, 2/dim, 4/dim, ... (one per rotated pair).
    exponents = torch.arange(0, dim, 2)[:half].float() / dim
    inv_freq = 1.0 / (theta ** exponents)
    positions = torch.arange(end)  # type: ignore
    # angles[t, i] = t * inv_freq[i]
    angles = torch.outer(positions, inv_freq).float()  # type: ignore
    unit_magnitude = torch.ones_like(angles)
    # Polar form (magnitude 1, given angle) -> complex64 tensor.
    return torch.polar(unit_magnitude, angles)

In [13]:
# Rotation factors for dim=4 at positions 0, 1 and 2.
cis = precompute_freqs_cis(4, 3)
cis

tensor([[ 1.0000+0.0000j,  1.0000+0.0000j],
        [ 0.5403+0.8415j,  0.9999+0.0100j],
        [-0.4161+0.9093j,  0.9998+0.0200j]])

$$
\begin{split}
&\text{freqs}=[1,\frac{1}{\theta^{2/4}}]=[1., 0.01]\\
&t=[0,1,2]\\
&\text{freqs}\leftarrow t\otimes\text{freqs}=\begin{bmatrix}
0 & 0\\
1 & 0.01\\
2 & 0.02
\end{bmatrix}\\
&\text{freqs\_cis}=e^{j\cdot\text{freqs}}=\begin{bmatrix}
1 & 1\\
e^j & e^{j\cdot0.01}\\
e^{j\cdot2} & e^{j\cdot0.02}
\end{bmatrix}
\end{split}
$$

In [21]:
# Unit magnitude at an angle of 2 rad, i.e. e^{2j}; an angle of 0 would give 1+0j.
torch.polar(abs=torch.tensor([1.]), angle=torch.tensor([2.]))

tensor([-0.4161+0.9093j])

In [10]:
# Phase angles in radians, one per matrix entry.
theta_matrix = torch.tensor([[0.0, 0.0], [1.0, 0.01], [2.0, 0.02]])

# Magnitude matrix: all ones, so every entry has modulus 1.
r_matrix = torch.ones_like(theta_matrix)

# Element-wise e^{j * theta_matrix}, built from polar form (magnitude, angle).
e_j_theta_matrix = torch.polar(r_matrix, theta_matrix)
e_j_theta_matrix

tensor([[ 1.0000+0.0000j,  1.0000+0.0000j],
        [ 0.5403+0.8415j,  0.9999+0.0100j],
        [-0.4161+0.9093j,  0.9998+0.0200j]])

### https://spaces.ac.cn/archives/8265

In [36]:
def get_rope_matrix(d, theta):
    """Construct the RoPE rotation matrix in 2x2 block layout.

    The result is the ``d x d`` matrix::

        [[diag(cos(theta)), -diag(sin(theta))],
         [diag(sin(theta)),  diag(cos(theta))]]

    so coordinate ``i`` is rotated together with coordinate ``i + d // 2``
    by the angle ``theta[i]``.

    Args:
        d: Size of the square matrix; must be even.
        theta: 1-D tensor with ``d // 2`` rotation angles in radians.

    Returns:
        A ``(d, d)`` tensor (allocated with ``torch.zeros`` defaults) holding
        the four blocks above.

    Raises:
        ValueError: If ``d`` is odd, or ``theta`` is not 1-D with exactly
            ``d // 2`` entries.
    """
    if d % 2 != 0:
        raise ValueError(f"d must be even, got {d}")
    half = d // 2
    # Without this check a length-1 or 2-D theta is silently broadcast into
    # the blocks below, producing a matrix that is not a rotation.
    if theta.dim() != 1 or theta.shape[0] != half:
        raise ValueError(
            f"theta must be a 1-D tensor with {half} entries, "
            f"got shape {tuple(theta.shape)}"
        )

    cos_block = torch.diag(torch.cos(theta))
    sin_block = torch.diag(torch.sin(theta))

    mat = torch.zeros(d, d)
    mat[0:half, 0:half] = cos_block
    mat[half:, half:] = cos_block
    mat[0:half, half:] = -sin_block
    mat[half:, 0:half] = sin_block
    return mat

In [37]:
# Dimension of the example vectors (must be even).
d = 4
# Angle vectors in radians: 0, 1, ... degrees for theta_m and 1, 2, ... degrees for theta_n.
theta_m = torch.tensor([deg * math.pi / 180 for deg in range(d // 2)])
theta_n = torch.tensor([deg * math.pi / 180 for deg in range(1, d // 2 + 1)])

In [38]:
# Build the rotation matrices R_m and R_n from their angle vectors.
R_m, R_n = (get_rope_matrix(d, angles) for angles in (theta_m, theta_n))

In [42]:
# Transpose of R_m, and the rotation built directly from the angle difference theta_n - theta_m.
R_m_T = R_m.t()
R_n_minus_m = get_rope_matrix(d, torch.sub(theta_n, theta_m))
R_n_minus_m

tensor([[ 0.9998,  0.0000, -0.0175, -0.0000],
        [ 0.0000,  0.9998, -0.0000, -0.0175],
        [ 0.0175,  0.0000,  0.9998,  0.0000],
        [ 0.0000,  0.0175,  0.0000,  0.9998]])

In [41]:
# R_m^T R_n, to be compared against R_n_minus_m.
product = R_m_T @ R_n
product

tensor([[ 0.9998,  0.0000, -0.0175,  0.0000],
        [ 0.0000,  0.9998,  0.0000, -0.0175],
        [ 0.0175,  0.0000,  0.9998,  0.0000],
        [ 0.0000,  0.0175,  0.0000,  0.9998]])

In [43]:
# Test vectors q and k.
q = torch.tensor([1.0, 2.0, 3.0, 4.0])
k = torch.tensor([4.0, 3.0, 2.0, 1.0])

# Rotate each vector on its own: R_m q and R_n k, the two factors of (R_m q)^T (R_n k).
R_m_q = R_m @ q
R_n_k = R_n @ k
# Left disabled in the notebook: result_1 = torch.dot(R_m_q, R_n_k)

In [46]:
# Evaluate q^T (R_m^T R_n) k.
result_2 = q.dot(product.mv(k))
result_2

tensor(20.3460)

In [47]:
# Evaluate q^T R_{n-m} k.
result_3 = q.dot(R_n_minus_m.mv(k))
result_3

tensor(20.3460)

## cope