In [1]:
import numpy as np

In [2]:
# Toy input sequence: 4 positions, each carrying a 3-dimensional feature vector.
Feature_all = np.array(
    (
        (1, 0, 1),  # position 1
        (0, 1, 1),  # position 2
        (1, 1, 0),  # position 3
        (0, 0, 1),  # position 4
    )
)  # shape: [4, 3]

Feature_all: [4, 3]    # 4个位置,每个位置3维特征 \
W_q: [3, 2]            # 3维映射到2维 \
Query = Feature_all @ W_q    # [4, 3] @ [3, 2] = [4, 2]

In [5]:
# Input and output dimensions of each projection matrix.
n_in = 3   # input feature dimension
n_out = 2  # projected (query/key/value) dimension

# Seed NumPy's global RNG so that "Restart Kernel -> Run All" draws the same
# weights on every run; without it no output of this notebook is reproducible.
SEED = 42
np.random.seed(SEED)

# Xavier (Glorot) normal initialisation: std = sqrt(2 / (n_in + n_out)).
std = np.sqrt(2.0 / (n_in + n_out))

# Draw the three projection matrices, each of shape (n_in, n_out).
W_q = np.random.normal(0, std, (n_in, n_out))
W_k = np.random.normal(0, std, (n_in, n_out))
W_v = np.random.normal(0, std, (n_in, n_out))

In [9]:
# Show the three projection matrices, one per line group, then their shape.
print(f"W_q: {W_q}", f"W_k: {W_k}", f"W_v: {W_v}", f"Shape = {W_q.shape}", sep="\n")

W_q: [[ 0.1936747   0.78729825]
 [ 0.54926615 -0.04591318]
 [ 1.62722824  0.39686957]]
W_k: [[ 0.83663886 -0.79360409]
 [-0.42752367 -0.30505612]
 [-0.68283334 -0.00779054]]
W_v: [[-0.51232982 -0.86301211]
 [ 0.74448008 -0.28517045]
 [ 0.27987747  0.29156925]]
Shape = (3, 2)


In [10]:
# Project every position's features into query, key and value space.
# Each product is [4, 3] @ [3, 2] -> [4, 2], i.e. one row per position.
Query, Key, Value = (Feature_all @ W for W in (W_q, W_k, W_v))

# Each row of Query is the query vector of one position.
print("Query shape:", Query.shape)
print("Query:\n", Query)

Query shape: (4, 2)
Query:
 [[1.82090294 1.18416782]
 [2.17649439 0.3509564 ]
 [0.74294086 0.74138508]
 [1.62722824 0.39686957]]


In [11]:
# Raw dot-product attention scores between every pair of positions
# (no 1/sqrt(d_k) scaling is applied here).
# scores[i, j] is the score of position i attending to position j.
scores = np.matmul(Query, Key.T)  # [4, 4]
print("Attention scores:\n", scores)

# Normalisation
def softmax(x, axis=1):
    """Return the softmax of ``x`` along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating. This does
    not change the mathematical result, but it keeps ``np.exp`` from
    overflowing to ``inf`` (which would turn the division into ``nan``) when
    the scores are large.

    Args:
        x: Array-like of scores.
        axis: Axis along which the result sums to 1. Defaults to 1, i.e. each
            row of a 2-D score matrix is normalised.

    Returns:
        Array of the same shape as ``x`` whose entries along ``axis`` are
        non-negative and sum to 1.
    """
    x = np.asarray(x)
    if x.size:
        # Skipped for empty input: max() over a zero-length axis would raise.
        x = x - x.max(axis=axis, keepdims=True)
    exp_x = np.exp(x)
    return exp_x / exp_x.sum(axis=axis, keepdims=True)

# Turn each row of scores into position i's attention distribution over all
# positions; every row of the result sums to 1.
weights = softmax(x=scores)
print("Attention weights:\n", weights)

Attention scores:
 [[-0.66892083 -2.39231529 -0.55603902 -1.25259854]
 [ 0.05350227 -2.52648133  0.50485508 -1.48891708]
 [-0.47987362 -1.05686943 -0.5105819  -0.51308058]
 [-0.06777247 -1.93096359  0.22969898 -1.11421752]]
Attention weights:
 [[0.35016434 0.0624901  0.39200884 0.19533672]
 [0.34964135 0.02649416 0.54908911 0.07477537]
 [0.28582211 0.16051282 0.2771784  0.27648668]
 [0.35053151 0.05439431 0.47197313 0.12310105]]


In [12]:
# Final output: each row is one position's new feature, the attention-weighted
# sum of all positions' value vectors, so it mixes information from every position.
output = np.matmul(weights, Value)  # [4, 2]
print("Output:\n", output)

Output:
 [[ 0.12829098 -0.59284257]
 [ 0.09426366 -0.80828286]
 [ 0.23971192 -0.3999403 ]
 [ 0.11825924 -0.7059795 ]]


In [13]:
# Attention distribution of position 1: how strongly it attends to each position.
print("Position 1 attention weights:", weights[0, :])

# Compare position 1's input feature with its attended output feature to see
# how the new feature blends in information from the whole sequence.
print("Position 1 original feature:", Feature_all[0, :])
print("Position 1 new feature:", output[0, :])

Position 1 attention weights: [0.35016434 0.0624901  0.39200884 0.19533672]
Position 1 original feature: [1 0 1]
Position 1 new feature: [ 0.12829098 -0.59284257]


## Multi-Head Self-Attention

In [18]:
# Hyper-parameters of the multi-head layer.
n_heads = 4    # number of attention heads
d_model = 3    # input feature dimension
d_k = d_v = 2  # per-head query/key and value dimensions

# Xavier normal standard deviation for a (d_model, d_k) projection.
std = np.sqrt(2.0 / (d_model + d_k))

# Every head owns its own W_q, W_k and W_v. The per-head matrices are drawn
# one after another and stacked along a new leading "head" axis.
W_q_heads = np.stack([np.random.normal(0, std, (d_model, d_k)) for _ in range(n_heads)])
W_k_heads = np.stack([np.random.normal(0, std, (d_model, d_k)) for _ in range(n_heads)])
W_v_heads = np.stack([np.random.normal(0, std, (d_model, d_v)) for _ in range(n_heads)])

In [19]:
# Show the stacked per-head query matrices followed by the array's shape.
print(W_q_heads, W_q_heads.shape, sep="\n")

[[[ 0.2561395   0.17922212]
  [ 0.34622551 -0.72346501]
  [-0.35703322 -0.42980255]]

 [[-0.43398198 -0.1584815 ]
  [-0.83777088 -0.64821551]
  [ 0.26073526  0.07988022]]

 [[-0.00870527  0.69960601]
  [ 0.42938427 -0.06887067]
  [ 0.30984964  0.30917478]]

 [[ 0.50756209 -0.71378229]
  [ 0.11222289 -0.0797183 ]
  [ 0.02702348 -0.33147692]]]
(4, 3, 2)


In [22]:
def attention(Q, K, V):
    """Compute dot-product attention ``softmax(Q @ K.T) @ V``.

    The softmax is taken over axis 1 of the score matrix. Each row's maximum
    is subtracted before exponentiating, which leaves the weights unchanged
    mathematically but prevents ``np.exp`` from overflowing to ``inf`` (and
    the output from becoming ``nan``) when scores are large. No
    ``1/sqrt(d_k)`` scaling is applied.

    Args:
        Q: Query matrix, one row per position.
        K: Key matrix, one row per position (must expose ``.T``).
        V: Value matrix, one row per key position.

    Returns:
        ``weights @ V`` where ``weights`` are the normalised attention scores.
    """
    scores = Q @ K.T
    if scores.size:
        # Skipped for empty input: max() over a zero-length axis would raise.
        scores = scores - scores.max(axis=1, keepdims=True)
    exp_scores = np.exp(scores)
    weights = exp_scores / exp_scores.sum(axis=1, keepdims=True)
    return weights @ V

# Run every head independently on the same input and collect the results.
head_outputs = []
for h in range(n_heads):
    # Project the input with this head's own query/key/value matrices: each is [4, 2].
    Q, K, V = (
        np.matmul(Feature_all, W[h])
        for W in (W_q_heads, W_k_heads, W_v_heads)
    )
    head_output = attention(Q, K, V)  # [4, 2]
    head_outputs.append(head_output)

# Stack the per-head matrices along a new leading "head" axis, then show the
# stacked array and its shape.
head_outputs = np.stack(head_outputs)
print(head_outputs, head_outputs.shape, sep="\n")

[[[ 0.46834058  0.46934938]
  [ 0.31235121  0.56094363]
  [ 0.28177941  0.48250133]
  [ 0.53172276  0.48693824]]

 [[ 0.0458458  -1.08443782]
  [ 0.03578854 -1.05753493]
  [ 0.01530534 -1.06665533]
  [ 0.04823435 -1.11904312]]

 [[ 0.92165626 -0.01353054]
  [ 0.80494898  0.11203423]
  [ 0.87002499  0.02854181]
  [ 0.82970115  0.02133007]]

 [[ 0.90416997 -0.12297903]
  [ 1.06334586 -0.22649126]
  [ 0.94761556 -0.15466373]
  [ 1.08919686 -0.24193256]]]
(4, 4, 2)


In [23]:
# Concatenate the per-head outputs along the feature axis. Iterating the
# stacked array yields one [4, d_v] matrix per head.
multi_head_output = np.concatenate(head_outputs, axis=1)  # [4, 8]

# Output projection back to the model dimension. Xavier normal initialisation
# must use this matrix's own fan-in/fan-out (n_heads * d_v inputs, d_model
# outputs) instead of reusing the std computed for the per-head projections.
std_o = np.sqrt(2.0 / (n_heads * d_v + d_model))
W_o = np.random.normal(0, std_o, (n_heads * d_v, d_model))
final_output = multi_head_output @ W_o  # [4, 3]

print("每个头的输出形状:", [h.shape for h in head_outputs])
print("多头拼接后的形状:", multi_head_output.shape)
print("最终输出形状:", final_output.shape)

每个头的输出形状: [(4, 2), (4, 2), (4, 2), (4, 2)]
多头拼接后的形状: (4, 8)
最终输出形状: (4, 3)


In [24]:
# Inspect how position 1 distributes its attention in each head.
# Note: this loop rebinds the notebook-level names Q, K, scores and weights.
print("\n位置1在各个头的attention weights:")
for h in range(n_heads):
    Q = np.matmul(Feature_all, W_q_heads[h])
    K = np.matmul(Feature_all, W_k_heads[h])
    scores = np.matmul(Q, K.T)
    weights = softmax(scores)  # row-wise softmax via the shared helper
    print(f"Head {h+1}:", weights[0])  # row 0 holds position 1's weights


位置1在各个头的attention weights:
Head 1: [0.26914389 0.29401485 0.19324804 0.24359321]
Head 2: [0.22621923 0.28882462 0.21266388 0.27229227]
Head 3: [0.31554975 0.14753084 0.26249605 0.27442336]
Head 4: [0.12891289 0.2111176  0.54042794 0.11954157]
