# Self-Attention

In [1]:
import torch

# Input sequence X: one row per token, one column per input feature.
x = torch.tensor(
    [
        [1.0, 0.0, 1.0, 0.0],
        [0.0, 2.0, 0.0, 2.0],
        [1.0, 1.0, 1.0, 1.0],
    ]
)

# Query projection weights (input features x projected features).
w_query = torch.tensor(
    [
        [1.0, 0.0, 1.0],
        [1.0, 0.0, 0.0],
        [0.0, 0.0, 1.0],
        [0.0, 1.0, 1.0],
    ]
)

# Key projection weights.
w_key = torch.tensor(
    [
        [0.0, 0.0, 1.0],
        [1.0, 1.0, 0.0],
        [0.0, 1.0, 0.0],
        [1.0, 1.0, 0.0],
    ]
)

# Value projection weights.
w_value = torch.tensor(
    [
        [0.0, 2.0, 0.0],
        [0.0, 3.0, 0.0],
        [1.0, 0.0, 3.0],
        [1.0, 1.0, 0.0],
    ]
)

In [None]:
# In real models a Pre-LayerNorm is applied to the input sequence first:
# x = layer_norm(x)
# The original paper used a Post-LayerNorm layout instead:
# Attention(Q, K, V) + X -> LayerNorm

# Project the input into query, key and value spaces.
querys = x @ w_query
keys = x @ w_key
values = x @ w_value

querys, keys, values

(tensor([[1., 0., 2.],
         [2., 2., 2.],
         [2., 1., 3.]]),
 tensor([[0., 1., 1.],
         [4., 4., 0.],
         [2., 3., 1.]]),
 tensor([[1., 2., 3.],
         [2., 8., 0.],
         [2., 6., 3.]]))

In [4]:
# Raw attention scores: dot product of every query with every key.
attn_scores = torch.mm(querys, keys.t())

attn_scores

tensor([[ 2.,  4.,  4.],
        [ 4., 16., 12.],
        [ 4., 12., 10.]])

In [12]:
from torch.nn.functional import softmax

# sqrt(d_k) scaling factor for the attention scores. keys.shape[-1] is a Python
# int, so the tensor is created as float32 explicitly rather than relying on
# torch.sqrt to promote an integer tensor.
key_dim_sqrt = torch.sqrt(torch.tensor(keys.shape[-1], dtype=torch.float32))
key_dim_sqrt

tensor(1.7321)

In [14]:
# Scale the scores, then normalise each query's row into attention weights.
attn_scores_softmax = (attn_scores / key_dim_sqrt).softmax(dim=-1)
attn_scores_softmax

tensor([[1.3613e-01, 4.3194e-01, 4.3194e-01],
        [8.9045e-04, 9.0884e-01, 9.0267e-02],
        [7.4449e-03, 7.5471e-01, 2.3785e-01]])

In [15]:
# Attention output: each row is the attention-weighted sum of the value rows.
weighted_values = attn_scores_softmax @ values
weighted_values

tensor([[1.8639, 6.3194, 1.7042],
        [1.9991, 7.8141, 0.2735],
        [1.9926, 7.4796, 0.7359]])

---
---
# Multi-Head Attention

In [None]:
import torch
from torch.nn.functional import softmax

torch.set_printoptions(precision=4, sci_mode=False)

# Input: T=3 (sequence length), d_model=4 (embedding dimension).
x = torch.tensor(
    [
        [1.0, 0.0, 1.0, 0.0],
        [0.0, 2.0, 0.0, 2.0],
        [1.0, 1.0, 1.0, 1.0],
    ]
)

# Multi-head configuration: d_model must split evenly across the heads.
T, d_model = x.shape
num_heads = 2
assert d_model % num_heads == 0
head_dim = d_model // num_heads  # 2

# (Example only) Q/K/V weights, each of shape (d_model, d_model).
# Real implementations often build these as three nn.Linear(d_model, d_model) layers:
# self.q_proj = nn.Linear(d_model, d_model)
# self.k_proj = nn.Linear(d_model, d_model)
# self.v_proj = nn.Linear(d_model, d_model)

# The per-head Q, K, V projections are fused into a single matrix each
# (2 heads x 2 dims per head = d_model, the token embedding dimension).
W_Q = torch.tensor(
    [
        [1.0, 0.0, 1.0, 0.0],
        [1.0, 0.0, 0.0, 1.0],
        [0.0, 1.0, 1.0, 0.0],
        [0.0, 1.0, 0.0, 1.0],
    ]
)

W_K = torch.tensor(
    [
        [0.0, 1.0, 0.0, 1.0],
        [1.0, 0.0, 1.0, 0.0],
        [0.0, 1.0, 1.0, 0.0],
        [1.0, 0.0, 0.0, 1.0],
    ]
)

W_V = torch.tensor(
    [
        [1.0, 0.0, 2.0, 0.0],
        [0.0, 1.0, 0.0, 2.0],
        [1.0, 0.0, 0.0, 1.0],
        [0.0, 1.0, 1.0, 0.0],
    ]
)

# Output projection W_O: (d_model, d_model).
# Arbitrary values are used so the mixing across heads is visible
# (torch.eye(4) would make this an identity projection instead).
W_O = torch.tensor(
    [
        [1.0, 0.0, 0.5, 0.0],
        [0.0, 1.0, 0.0, 0.5],
        [0.5, 0.0, 1.0, 0.0],
        [0.0, 0.5, 0.0, 1.0],
    ]
)

In [35]:
# 1) Build Q, K, V with the fused projections: each is (T, d_model).
Q = torch.matmul(x, W_Q)
K = torch.matmul(x, W_K)
V = torch.matmul(x, W_V)

Q, K, V

(tensor([[1., 1., 2., 0.],
         [2., 2., 0., 4.],
         [2., 2., 2., 2.]]),
 tensor([[0., 2., 1., 1.],
         [4., 0., 2., 2.],
         [2., 2., 2., 2.]]),
 tensor([[2., 0., 2., 1.],
         [0., 4., 2., 4.],
         [2., 2., 3., 3.]]))

In [39]:
# 2) Split the feature axis into heads: (T, d_model) -> (T, num_heads, head_dim).
Qh, Kh, Vh = (t.view(T, num_heads, head_dim) for t in (Q, K, V))

print(f'Qh: {Qh}, \n\n Kh: {Kh}, \n\n Vh: {Vh}')

Qh: tensor([[[1., 1.],
         [2., 0.]],

        [[2., 2.],
         [0., 4.]],

        [[2., 2.],
         [2., 2.]]]), 

 Kh: tensor([[[0., 2.],
         [1., 1.]],

        [[4., 0.],
         [2., 2.]],

        [[2., 2.],
         [2., 2.]]]), 

 Vh: tensor([[[2., 0.],
         [2., 1.]],

        [[0., 4.],
         [2., 4.]],

        [[2., 2.],
         [3., 3.]]])


In [50]:
# Inspect head index 1 on its own: its queries, its transposed keys, and their
# matrix product, which is that head's query-by-key score matrix.
Qh[:, 1, :], Kh[:, 1, :].T, Qh[:, 1, :] @ Kh[:, 1, :].T

(tensor([[2., 0.],
         [0., 4.],
         [2., 2.]]),
 tensor([[1., 2., 2.],
         [1., 2., 2.]]),
 tensor([[2., 4., 4.],
         [4., 8., 8.],
         [4., 8., 8.]]))

In [None]:
# 3) Per-head attention scores: (num_heads, T, T)

# For each head h this is Qh[:, h, :] @ Kh[:, h, :].T
# scores = torch.zeros(num_heads, T, T)
# for h in range(num_heads):
#     scores[h] = Qh[:, h, :] @ Kh[:, h, :].T # (3, 2) @ (2, 3)

# The einsum below is equivalent to the loop above: for every head h it
# contracts the shared head_dim axis d over each (query t, key s) pair.
scores = torch.einsum("thd,shd->hts", Qh, Kh)
scores

tensor([[[2., 4., 4.],
         [4., 8., 8.],
         [4., 8., 8.]],

        [[2., 4., 4.],
         [4., 8., 8.],
         [4., 8., 8.]]])

In [41]:
# Scaling: divide the scores by sqrt(head_dim), then softmax over the key axis.
scale = torch.tensor(head_dim, dtype=torch.float32).sqrt()
attn = (scores / scale).softmax(dim=-1)  # (h, T, T)
attn

tensor([[[0.1084, 0.4458, 0.4458],
         [0.0287, 0.4856, 0.4856],
         [0.0287, 0.4856, 0.4856]],

        [[0.1084, 0.4458, 0.4458],
         [0.0287, 0.4856, 0.4856],
         [0.0287, 0.4856, 0.4856]]])

In [42]:
# 4) Per-head weighted sum of the values. The einsum output spec is "thd", so
# the result is laid out as (T, num_heads, head_dim), not (num_heads, T, head_dim).
out_heads = torch.einsum("hts,shd->thd", attn, Vh)  # (T, h, d_head)
out_heads

tensor([[[1.1084, 2.6748],
         [2.4458, 3.2290]],

        [[1.0287, 2.9139],
         [2.4856, 3.4282]],

        [[1.0287, 2.9139],
         [2.4856, 3.4282]]])

In [44]:
# 5) Concatenate the heads of each token: (T, num_heads, head_dim) -> (T, d_model).
out_concat = out_heads.reshape(T, -1)
out_concat

tensor([[1.1084, 2.6748, 2.4458, 3.2290],
        [1.0287, 2.9139, 2.4856, 3.4282],
        [1.0287, 2.9139, 2.4856, 3.4282]])

In [45]:
# 6) Output projection mixing the concatenated heads: (T, d_model).
out = torch.matmul(out_concat, W_O)
out

tensor([[2.3313, 4.2894, 3.0000, 4.5665],
        [2.2715, 4.6280, 3.0000, 4.8852],
        [2.2715, 4.6280, 3.0000, 4.8852]])

---
---
# Masked Multi-Head Attention

In [51]:
import torch
from torch.nn.functional import softmax

torch.set_printoptions(precision=4, sci_mode=False)

# Input: T=3 tokens, d_model=4 features per token.
x = torch.tensor(
    [
        [1.0, 0.0, 1.0, 0.0],
        [0.0, 2.0, 0.0, 2.0],
        [1.0, 1.0, 1.0, 1.0],
    ]
)

# Multi-head configuration: d_model must split evenly across the heads.
T, d_model = x.shape
num_heads = 2
assert d_model % num_heads == 0
head_dim = d_model // num_heads

# (Example only) Q/K/V weights, each of shape (d_model, d_model).
W_Q = torch.tensor(
    [
        [1.0, 0.0, 1.0, 0.0],
        [1.0, 0.0, 0.0, 1.0],
        [0.0, 1.0, 1.0, 0.0],
        [0.0, 1.0, 0.0, 1.0],
    ]
)

W_K = torch.tensor(
    [
        [0.0, 1.0, 0.0, 1.0],
        [1.0, 0.0, 1.0, 0.0],
        [0.0, 1.0, 1.0, 0.0],
        [1.0, 0.0, 0.0, 1.0],
    ]
)

W_V = torch.tensor(
    [
        [1.0, 0.0, 2.0, 0.0],
        [0.0, 1.0, 0.0, 2.0],
        [1.0, 0.0, 0.0, 1.0],
        [0.0, 1.0, 1.0, 0.0],
    ]
)

# Output projection W_O: (d_model, d_model).
W_O = torch.tensor(
    [
        [1.0, 0.0, 0.5, 0.0],
        [0.0, 1.0, 0.0, 0.5],
        [0.5, 0.0, 1.0, 0.0],
        [0.0, 0.5, 0.0, 1.0],
    ]
)

In [52]:
# 1) Build Q, K, V with the fused projections: each is (T, d_model).
Q = torch.matmul(x, W_Q)
K = torch.matmul(x, W_K)
V = torch.matmul(x, W_V)

Q, K, V

(tensor([[1., 1., 2., 0.],
         [2., 2., 0., 4.],
         [2., 2., 2., 2.]]),
 tensor([[0., 2., 1., 1.],
         [4., 0., 2., 2.],
         [2., 2., 2., 2.]]),
 tensor([[2., 0., 2., 1.],
         [0., 4., 2., 4.],
         [2., 2., 3., 3.]]))

In [53]:
# 2) Split the feature axis into heads: (T, d_model) -> (T, h, d_head).
Qh, Kh, Vh = (t.view(T, num_heads, head_dim) for t in (Q, K, V))

print(f'Qh: {Qh}, \n\n Kh: {Kh}, \n\n Vh: {Vh}')

Qh: tensor([[[1., 1.],
         [2., 0.]],

        [[2., 2.],
         [0., 4.]],

        [[2., 2.],
         [2., 2.]]]), 

 Kh: tensor([[[0., 2.],
         [1., 1.]],

        [[4., 0.],
         [2., 2.]],

        [[2., 2.],
         [2., 2.]]]), 

 Vh: tensor([[[2., 0.],
         [2., 1.]],

        [[0., 4.],
         [2., 4.]],

        [[2., 2.],
         [3., 3.]]])


In [54]:
# 3) Per-head attention scores: (h, T, T).
# For every head h, contract the head_dim axis d over each (query t, key s) pair.
scores = torch.einsum("thd,shd->hts", Qh, Kh)
scores

tensor([[[2., 4., 4.],
         [4., 8., 8.],
         [4., 8., 8.]],

        [[2., 4., 4.],
         [4., 8., 8.],
         [4., 8., 8.]]])

In [61]:
# 4) Build the causal mask.
# mask[t, s] is True when the entry must be hidden, i.e. key s lies in the
# future of query t (s > t). Rows index t, columns index s.
causal_mask = torch.arange(T).unsqueeze(0) > torch.arange(T).unsqueeze(1)  # (T, T)
causal_mask

tensor([[False,  True,  True],
        [False, False,  True],
        [False, False, False]])

In [62]:
# Add a leading axis so the (T, T) mask broadcasts over the heads of the
# (h, T, T) scores; masked positions become -inf so softmax gives them weight 0.
scores_masked = scores.masked_fill(causal_mask[None, :, :], float("-inf"))
scores_masked

tensor([[[2., -inf, -inf],
         [4., 8., -inf],
         [4., 8., 8.]],

        [[2., -inf, -inf],
         [4., 8., -inf],
         [4., 8., 8.]]])

In [63]:
# 5) softmax + weighted sum: scale by sqrt(head_dim), then softmax over keys.
scale = torch.tensor(head_dim, dtype=torch.float32).sqrt()
attn = (scores_masked / scale).softmax(dim=-1)  # (h, T, T)
attn

tensor([[[1.0000, 0.0000, 0.0000],
         [0.0558, 0.9442, 0.0000],
         [0.0287, 0.4856, 0.4856]],

        [[1.0000, 0.0000, 0.0000],
         [0.0558, 0.9442, 0.0000],
         [0.0287, 0.4856, 0.4856]]])

In [64]:
# Per-head weighted sum of the values using the masked attention weights; the
# einsum output spec "thd" lays the result out token-first.
out_heads = torch.einsum("hts,shd->thd", attn, Vh)  # (T, h, d_head)
out_heads

tensor([[[2.0000, 0.0000],
         [2.0000, 1.0000]],

        [[0.1116, 3.7768],
         [2.0000, 3.8326]],

        [[1.0287, 2.9139],
         [2.4856, 3.4282]]])

In [66]:
# 6) concat + output projection: first flatten the heads of each token,
# (T, h, d_head) -> (T, d_model).
out_concat = out_heads.reshape(T, -1)
out_concat

tensor([[2.0000, 0.0000, 2.0000, 1.0000],
        [0.1116, 3.7768, 2.0000, 3.8326],
        [1.0287, 2.9139, 2.4856, 3.4282]])

In [None]:
# Output projection mixing the concatenated heads: (T, d_model).
out = torch.matmul(out_concat, W_O)
out

tensor([[3.0000, 0.5000, 3.0000, 1.0000],
        [1.1116, 5.6931, 2.0558, 5.7210],
        [2.2715, 4.6280, 3.0000, 4.8852]])

---
---
<br>

| 정규화                    | 평균·분산을 내는 축              |
| ---------------------- | ------------------------ |
| **LayerNorm (층 정규화)**  | **feature 축** (`dim=-1`) |
| **BatchNorm (배치 정규화)** | **batch 축** (`dim=0`)    |


# LayerNorm

입력 벡터 $x \in \mathbb{R}^{d}$ (한 샘플의 feature 벡터)에 대해

### 평균
$$
\mu = \frac{1}{d}\sum_{i=1}^{d} x_i
$$

### 분산
$$
\sigma^2 = \frac{1}{d}\sum_{i=1}^{d} (x_i - \mu)^2
$$

### 정규화
$$
\hat{x}_i = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}}
$$

### Affine 변환
$$
y_i = \gamma_i \hat{x}_i + \beta_i
$$

In [16]:
import torch
# Two samples with three features each; the second row is constant.
# NOTE: `input` shadows the builtin, but later cells refer to it by this name.
input = torch.tensor(
    [
        [1.0, 2.0, 3.0],
        [1.0, 1.0, 1.0],
    ]
)
# Normalise over the last (feature) axis of each sample.
m = torch.nn.LayerNorm(normalized_shape=input.size(-1))
output = m(input)
output

tensor([[-1.2247,  0.0000,  1.2247],
        [ 0.0000,  0.0000,  0.0000]], grad_fn=<NativeLayerNormBackward0>)

In [None]:
m.weight, m.bias # weight is gamma (γ), bias is beta (β) of the affine transform

(Parameter containing:
 tensor([1., 1., 1.], requires_grad=True),
 Parameter containing:
 tensor([0., 0., 0.], requires_grad=True))

In [None]:
# Per-sample statistics over the feature axis.
# unbiased=True would give the sample estimate (divide by N-1); LayerNorm and
# BatchNorm normalise with unbiased=False (divide by N), because the tensor at
# hand is normalised as-is rather than used to estimate a population.
input.mean(dim=-1), input.std(dim=-1, unbiased=False)

(tensor([2., 1.]), tensor([0.8165, 0.0000]))

In [24]:
# Manual LayerNorm following x_hat = (x - mu) / sqrt(sigma^2 + eps).
# eps is added to the variance inside the square root and taken from the module
# (m.eps), so this reproduces m(input) instead of dividing by std + an ad-hoc
# constant. keepdim=True keeps a trailing axis of size 1 so the per-sample
# statistics broadcast across the features.
(input - input.mean(dim=-1, keepdim=True)) / torch.sqrt(
    input.var(dim=-1, unbiased=False, keepdim=True) + m.eps
)

tensor([[-1.2247,  0.0000,  1.2247],
        [ 0.0000,  0.0000,  0.0000]])

# BatchNorm

입력 행렬 ($x \in \mathbb{R}^{N \times C}$)  
($N$: batch size, $C$: feature/channel)

### 평균 (채널별, 배치 기준)
$$
\mu_c = \frac{1}{N}\sum_{n=1}^{N} x_{n,c}
$$

### 분산 (채널별, 배치 기준)
$$
\sigma_c^2 = \frac{1}{N}\sum_{n=1}^{N} (x_{n,c} - \mu_c)^2
$$

### 정규화
$$
\hat{x}_{n,c} = \frac{x_{n,c} - \mu_c}{\sqrt{\sigma_c^2 + \epsilon}}
$$

### Affine 변환
$$
y_{n,c} = \gamma_c \hat{x}_{n,c} + \beta_c
$$

In [27]:
import torch
# A batch of two samples with three features (channels) each.
# NOTE: `input` shadows the builtin, but later cells refer to it by this name.
input = torch.tensor(
    [
        [1.0, 2.0, 3.0],
        [1.0, 1.0, 1.0],
    ]
)
# One set of statistics per feature column, computed across the batch axis.
bn = torch.nn.BatchNorm1d(num_features=input.size(-1))
output = bn(input)
output

tensor([[ 0.0000,  1.0000,  1.0000],
        [ 0.0000, -1.0000, -1.0000]], grad_fn=<NativeBatchNormBackward0>)

In [None]:
bn.weight, bn.bias # weight is gamma (γ), bias is beta (β) of the affine transform

(Parameter containing:
 tensor([1., 1., 1.], requires_grad=True),
 Parameter containing:
 tensor([0., 0., 0.], requires_grad=True))

In [30]:
# Per-feature statistics over the batch axis (dim=0), dividing by N.
input.mean(dim=0), input.std(dim=0, unbiased=False)

(tensor([1.0000, 1.5000, 2.0000]), tensor([0.0000, 0.5000, 1.0000]))

In [31]:
# Manual BatchNorm following x_hat = (x - mu_c) / sqrt(sigma_c^2 + eps).
# eps is added to the variance inside the square root and taken from the module
# (bn.eps), so this reproduces bn(input) instead of dividing by std + an ad-hoc
# constant. keepdim=True keeps a leading axis of size 1 so the per-feature
# statistics broadcast across the batch.
(input - input.mean(dim=0, keepdim=True)) / torch.sqrt(
    input.var(dim=0, unbiased=False, keepdim=True) + bn.eps
)

tensor([[ 0.0000,  1.0000,  1.0000],
        [ 0.0000, -1.0000, -1.0000]])