## 4.3.2 셀프어텐션
* 책의 코드가 동작하도록 원서의 내용에서 일부 코드를 추가했습니다.

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Single-head scaled dot-product self-attention on random example inputs:
#     output = softmax(Q @ K^T / sqrt(d_k)) @ V
# Intermediate shapes are printed along the way.

# Hyperparameters
batch_size = 2
seq_len = 10
d_model = 64  # embedding dimension
d_k = 32      # key/query dimension
d_v = 32      # value dimension

# Example input embeddings (random, so the printed values differ between runs)
input_embeddings = torch.randn(batch_size, seq_len, d_model)

# Bias-free linear projections producing queries, keys and values
wQ = nn.Linear(d_model, d_k, bias=False)
wK = nn.Linear(d_model, d_k, bias=False)
wV = nn.Linear(d_model, d_v, bias=False)

# Project the same input three ways (this is what makes it *self*-attention)
q = wQ(input_embeddings)  # (batch_size, seq_len, d_k)
k = wK(input_embeddings)  # (batch_size, seq_len, d_k)
v = wV(input_embeddings)  # (batch_size, seq_len, d_v)

print(f"Query shape: {q.shape}")
print(f"Key shape: {k.shape}")
print(f"Value shape: {v.shape}")

# Raw attention scores: dot product of every query with every key
dim_k = k.size(-1)
attn_scores = torch.matmul(q, k.transpose(-2, -1))  # (batch_size, seq_len, seq_len)

# Scale by sqrt(d_k). A plain Python float is sufficient as the divisor, so no
# temporary tensor is built and the result keeps the dtype of attn_scores.
scaled_attn_scores = attn_scores / (dim_k ** 0.5)

# Softmax over the key axis so each query's weights sum to 1
normalized_attn_scores = F.softmax(scaled_attn_scores, dim=-1)

# Final output: attention-weighted sum of the value vectors
output = torch.matmul(normalized_attn_scores, v)  # (batch_size, seq_len, d_v)

print(f"\nAttention scores shape: {attn_scores.shape}")
print(f"Normalized attention scores shape: {normalized_attn_scores.shape}")
print(f"Output shape: {output.shape}")

# Show the top-left 5x5 corner of the first batch element's attention weights
print("\nFirst batch attention weights (first 5x5):")
print(normalized_attn_scores[0, :5, :5])

Query shape: torch.Size([2, 10, 32])
Key shape: torch.Size([2, 10, 32])
Value shape: torch.Size([2, 10, 32])

Attention scores shape: torch.Size([2, 10, 10])
Normalized attention scores shape: torch.Size([2, 10, 10])
Output shape: torch.Size([2, 10, 32])

First batch attention weights (first 5x5):
tensor([[0.1106, 0.1001, 0.0810, 0.1316, 0.0859],
        [0.0841, 0.0662, 0.1048, 0.0934, 0.2182],
        [0.0976, 0.0885, 0.1467, 0.1015, 0.0777],
        [0.0847, 0.1341, 0.1221, 0.1194, 0.1067],
        [0.0815, 0.0739, 0.1473, 0.0705, 0.0844]], grad_fn=<SliceBackward0>)
