In [4]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from accelerate.test_utils.testing import get_backend

# Load the pretrained Hugging Face GPT-2 checkpoint and its tokenizer.
# NOTE(review): `device` is detected here but nothing in this cell moves `model` onto it.
device, _, _ = get_backend()  # auto-detect the backend (CUDA, CPU, ...)
model_id = "gpt2"  # <- GPT-2 small
model = GPT2LMHeadModel.from_pretrained(model_id)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)



  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [13]:
# Inspect the weight shape of the first block's `c_attn` projection (recorded output: [768, 2304]).
# `model` must be the Hugging Face GPT2LMHeadModel here: the custom GPT2Model defined in this
# notebook has no `.transformer` attribute, and a later cell rebinds the name `model`.
model.transformer.h[0].attn.c_attn.weight.shape

torch.Size([768, 2304])

In [2]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

# =============================================================================
# 1) GPT2Config: 모델 하이퍼파라미터를 저장하는 클래스
# =============================================================================
class GPT2Config:
    """Plain container for the hyperparameters of the GPT-2 model built in this notebook.

    Every constructor argument is stored unchanged as an attribute of the same name.

    Args:
        vocab_size: Number of token ids (rows of the token embedding table).
        hidden_size: Width of the embeddings and of every hidden state.
        n_layer: Number of stacked Transformer blocks.
        n_head: Number of attention heads per block.
        max_position_embeddings: Longest sequence the position table / causal mask covers.
        resid_pdrop: Dropout probability on residual-branch outputs.
        attn_pdrop: Dropout probability on attention weights.
        embd_pdrop: Dropout probability on the summed embeddings.
        layer_norm_epsilon: Epsilon passed to every LayerNorm.
    """

    def __init__(
        self,
        vocab_size=50257,
        hidden_size=768,
        n_layer=12,
        n_head=12,
        max_position_embeddings=1024,
        resid_pdrop=0.1,
        attn_pdrop=0.1,
        embd_pdrop=0.1,
        layer_norm_epsilon=1e-5
    ):
        # Collected in declaration order so the attributes are created in the same order.
        settings = {
            "vocab_size": vocab_size,
            "hidden_size": hidden_size,
            "n_layer": n_layer,
            "n_head": n_head,
            "max_position_embeddings": max_position_embeddings,
            "resid_pdrop": resid_pdrop,
            "attn_pdrop": attn_pdrop,
            "embd_pdrop": embd_pdrop,
            "layer_norm_epsilon": layer_norm_epsilon,
        }
        for field_name, field_value in settings.items():
            setattr(self, field_name, field_value)


# =============================================================================
# 2) CausalSelfAttention: GPT-2의 인과적(Self-masked) 멀티헤드 어텐션
# =============================================================================
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention in which each position only attends to itself and the past.

    Q, K and V are produced together by one kernel-size-1 Conv1d, and the merged heads are
    projected back to ``hidden_size`` by a second one. A lower-triangular ``mask`` buffer of
    shape (1, 1, max_position_embeddings, max_position_embeddings) marks the allowed pairs.
    """

    def __init__(self, config: GPT2Config):
        super().__init__()
        assert config.hidden_size % config.n_head == 0, \
            "hidden_size must be divisible by n_head."

        width = config.hidden_size
        max_len = config.max_position_embeddings

        self.n_head = config.n_head
        self.head_dim = width // config.n_head
        self.scale = 1 / math.sqrt(self.head_dim)

        # Fused Q/K/V projection (hidden -> 3*hidden), then the output projection.
        self.c_attn = nn.Conv1d(width, 3 * width, kernel_size=1, bias=True)
        self.c_proj = nn.Conv1d(width, width, kernel_size=1, bias=True)

        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)

        # allowed[i, j] == 1 when query i may look at key j (i >= j), else 0.
        allowed = torch.ones(max_len, max_len).tril()
        self.register_buffer("mask", allowed.view(1, 1, max_len, max_len))

    def forward(self, x):
        """Apply causal multi-head attention.

        Args:
            x: Tensor of shape (batch_size, seq_len, hidden_size).

        Returns:
            Tensor of shape (batch_size, seq_len, hidden_size).
        """
        batch, seq_len, width = x.size()

        # Conv1d expects channels first: (B, T, C) -> (B, C, T) -> (B, 3C, T) -> (B, 3, C, T).
        fused = self.c_attn(x.transpose(1, 2)).reshape(batch, 3, width, seq_len)

        def split_heads(channels_first):
            # (B, C, T) -> (B, n_head, head_dim, T) -> (B, n_head, T, head_dim)
            per_head = channels_first.view(batch, self.n_head, self.head_dim, seq_len)
            return per_head.permute(0, 1, 3, 2)

        q, k, v = (split_heads(fused[:, part]) for part in range(3))

        # Scaled dot-product scores, (B, n_head, T, T).
        scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale

        # Future positions get -inf so softmax assigns them zero weight.
        is_future = self.mask[:, :, :seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float('-inf'))

        weights = self.attn_dropout(F.softmax(scores, dim=-1))

        # Weighted sum of values, (B, n_head, T, head_dim).
        context = torch.matmul(weights, v)

        # Merge heads back into channels-first layout: (B, n_head, head_dim, T) -> (B, C, T).
        merged = context.permute(0, 1, 3, 2).contiguous().view(batch, width, seq_len)

        projected = self.resid_dropout(self.c_proj(merged))

        # Back to (B, T, C) for the caller.
        return projected.transpose(1, 2)


# =============================================================================
# 3) MLP (Feed-Forward Network)
# =============================================================================
class MLP(nn.Module):
    """Position-wise feed-forward network: hidden -> 4*hidden -> GELU -> hidden -> dropout.

    Both projections are kernel-size-1 Conv1d layers, so the input is moved to a
    channels-first layout for the computation and moved back before returning.
    """

    def __init__(self, config: GPT2Config):
        super().__init__()
        width = config.hidden_size
        inner_width = 4 * width

        self.c_fc = nn.Conv1d(width, inner_width, kernel_size=1, bias=True)    # expand
        self.c_proj = nn.Conv1d(inner_width, width, kernel_size=1, bias=True)  # contract
        self.act = nn.GELU()
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, x):
        """Run the feed-forward network independently on every position.

        Args:
            x: Tensor of shape (batch_size, seq_len, hidden_size).

        Returns:
            Tensor of shape (batch_size, seq_len, hidden_size).
        """
        # The values are not used; the unpacking itself rejects inputs that are not rank 3.
        _batch, _seq_len, _width = x.size()

        channels_first = x.transpose(1, 2)                    # (B, hidden, T)
        expanded = self.act(self.c_fc(channels_first))        # (B, 4*hidden, T)
        contracted = self.dropout(self.c_proj(expanded))      # (B, hidden, T)
        return contracted.transpose(1, 2)                     # (B, T, hidden)


# =============================================================================
# 4) TransformerBlock: GPT-2 블록 (Self-Attention + MLP + Residual + LayerNorm)
# =============================================================================
class TransformerBlock(nn.Module):
    """One GPT-2 block: LayerNorm -> self-attention -> residual, then LayerNorm -> MLP -> residual.

    Normalisation is applied to the input of each sub-layer; the sub-layer output is added
    back onto the un-normalised stream.
    """

    def __init__(self, config: GPT2Config):
        super().__init__()
        width = config.hidden_size
        eps = config.layer_norm_epsilon

        self.ln_1 = nn.LayerNorm(width, eps=eps)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(width, eps=eps)
        self.mlp = MLP(config)

    def forward(self, x):
        """Apply the block.

        Args:
            x: Tensor of shape (batch_size, seq_len, hidden_size).

        Returns:
            Tensor of shape (batch_size, seq_len, hidden_size).
        """
        # Attention sub-layer with its residual connection.
        x = x + self.attn(self.ln_1(x))
        # Feed-forward sub-layer with its residual connection.
        return x + self.mlp(self.ln_2(x))


# =============================================================================
# 5) GPT2Model: 전체 GPT-2 네트워크
# =============================================================================
class GPT2Model(nn.Module):
    """GPT-2 style decoder: embeddings, a stack of TransformerBlocks, final LayerNorm, LM head.

    The language-model head is a separate bias-free Linear layer; its weight is not shared
    with the token embedding table.
    """

    def __init__(self, config: GPT2Config):
        super().__init__()
        self.config = config

        # 1) Token embedding (wte), position embedding (wpe) and embedding dropout.
        self.wte = nn.Embedding(config.vocab_size, config.hidden_size)
        self.wpe = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.drop = nn.Dropout(config.embd_pdrop)

        # 2) n_layer stacked Transformer blocks.
        self.h = nn.ModuleList([TransformerBlock(config) for _ in range(config.n_layer)])

        # 3) Final LayerNorm.
        self.ln_f = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)

        # 4) Projection from hidden states to vocabulary logits.
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self._init_weights()

    def _init_weights(self):
        """Initialise parameters by module type.

        Linear / Conv1d / Embedding weights are drawn from N(0, 0.02) and their biases are
        zeroed. LayerNorm keeps the identity transform (weight = 1, bias = 0). Initialising
        by parameter rank instead would zero the 1-D LayerNorm weights, which makes every
        LayerNorm output zeros and therefore makes the whole model emit all-zero logits.
        """
        for module in self.modules():
            if isinstance(module, (nn.Linear, nn.Conv1d, nn.Embedding)):
                nn.init.normal_(module.weight, mean=0.0, std=0.02)
                # nn.Embedding has no bias; Linear/Conv1d may be built with bias=False.
                bias = getattr(module, "bias", None)
                if bias is not None:
                    nn.init.zeros_(bias)
            elif isinstance(module, nn.LayerNorm):
                nn.init.ones_(module.weight)
                nn.init.zeros_(module.bias)

    def forward(self, input_ids, position_ids=None):
        """Compute next-token logits.

        Args:
            input_ids: LongTensor of token ids, shape (batch_size, seq_len).
            position_ids: Optional LongTensor of shape (batch_size, seq_len). When None,
                positions 0..seq_len-1 are generated for every row.

        Returns:
            Logits of shape (batch_size, seq_len, vocab_size).
        """
        B, T = input_ids.size()
        device = input_ids.device

        # 1) Default positions: 0..T-1, broadcast over the batch.
        if position_ids is None:
            position_ids = torch.arange(T, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).expand(B, T)  # (B, T)

        # 2) Token + position embeddings -> (B, T, hidden), followed by dropout.
        token_embeddings = self.wte(input_ids)      # (B, T, hidden)
        pos_embeddings = self.wpe(position_ids)     # (B, T, hidden)
        hidden_states = self.drop(token_embeddings + pos_embeddings)

        # 3) Transformer blocks, applied in order.
        for block in self.h:
            hidden_states = block(hidden_states)    # (B, T, hidden)

        # 4) Final LayerNorm.
        hidden_states = self.ln_f(hidden_states)    # (B, T, hidden)

        # 5) Project to vocabulary logits.
        logits = self.lm_head(hidden_states)        # (B, T, vocab_size)
        return logits


# =============================================================================
# 6) 사용 예시
# =============================================================================
if __name__ == "__main__":
    # Smoke test: build the model with explicit hyperparameters and push one random batch
    # through it.
    config = GPT2Config(
        vocab_size=50257,
        hidden_size=768,
        n_layer=12,
        n_head=12,
        max_position_embeddings=1024,
    )
    model = GPT2Model(config)

    # Random token ids for batch_size=2, seq_len=10.
    dummy_input = torch.randint(
        low=0, high=config.vocab_size, size=(2, 10), dtype=torch.long
    )
    # None lets forward() generate positions 0..seq_len-1 on its own.
    dummy_position = None

    # Forward pass; the result has shape (2, 10, 50257).
    logits = model(dummy_input, position_ids=dummy_position)
    print("Logits size:", logits.shape)




Logits size: torch.Size([2, 10, 50257])


In [4]:
# Stand-alone, freshly constructed block that the walkthrough cells below dissect step by step.
sample_block = TransformerBlock(config=config)

In [None]:
# NOTE(review): the recorded output under this cell is the repr of the whole TransformerBlock,
# not of `ln_2` alone - the cell looks edited after its last run; re-run it to refresh the output.
sample_block.ln_2

TransformerBlock(
  (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (attn): CausalSelfAttention(
    (c_attn): Conv1d(768, 2304, kernel_size=(1,), stride=(1,))
    (c_proj): Conv1d(768, 768, kernel_size=(1,), stride=(1,))
    (attn_dropout): Dropout(p=0.1, inplace=False)
    (resid_dropout): Dropout(p=0.1, inplace=False)
  )
  (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (mlp): MLP(
    (c_fc): Conv1d(768, 3072, kernel_size=(1,), stride=(1,))
    (c_proj): Conv1d(3072, 768, kernel_size=(1,), stride=(1,))
    (act): GELU(approximate='none')
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [6]:
# Fixed seed so the random input, and every value derived from it below, is reproducible.
torch.manual_seed(0)
x = torch.randn(1, 128, config.hidden_size)  # (batch=1, seq_len=128, hidden)

# A freshly built module is in training mode, where dropout randomises the output.
# Switch to eval mode so `y` is deterministic and can be compared with a manual computation.
sample_block.eval()
with torch.no_grad():
    y = sample_block(x)

In [None]:
# Manual, step-by-step re-computation of TransformerBlock.forward(x), checked against the
# module itself at the end of the cell.

# Dropout is stochastic in training mode; eval mode makes the module output reproducible.
sample_block.eval()

eps = sample_block.ln_1.eps
hidden_dim = x.shape[-1]
num_heads = sample_block.attn.n_head
head_dim = hidden_dim // num_heads

# layer norm 1 (LayerNorm normalises with the biased variance, hence unbiased=False)
after_ln_1 = (x - x.mean(dim=-1, keepdim=True)) / torch.sqrt(
    x.var(dim=-1, keepdim=True, unbiased=False) + eps
) * sample_block.ln_1.weight + sample_block.ln_1.bias

# c_attn: fused Q/K/V projection on a channels-first tensor -> (B, 3*hidden, T)
after_c_attn = F.conv1d(
    after_ln_1.permute(0, 2, 1),
    sample_block.attn.c_attn.weight,
    sample_block.attn.c_attn.bias,
)
q = after_c_attn[:, 0:hidden_dim, :]
k = after_c_attn[:, hidden_dim:hidden_dim * 2, :]
v = after_c_attn[:, hidden_dim * 2:hidden_dim * 3, :]
batch_size, _, seq_len = q.shape  # q.shape = (B, hidden, T)

# Split channels into heads: (B, hidden, T) -> (B, num_heads, head_dim, T)
q_reshaped = q.reshape(batch_size, num_heads, head_dim, seq_len)
k_reshaped = k.reshape(batch_size, num_heads, head_dim, seq_len)
q_for_matmul = q_reshaped.permute(0, 1, 3, 2)  # (B, num_heads, T, head_dim)

# Q @ K.T, scaled -> (B, num_heads, T, T)
attn_scores = torch.matmul(q_for_matmul, k_reshaped)
attn_scores = attn_scores / math.sqrt(head_dim)

# Causal mask: query i must not see keys j > i, exactly as CausalSelfAttention does.
causal_mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device))
attn_scores = attn_scores.masked_fill(~causal_mask, float("-inf"))
attn_probs = torch.softmax(attn_scores, dim=-1)

v_reshaped = v.reshape(batch_size, num_heads, head_dim, seq_len).permute(0, 1, 3, 2)
# score @ V -> (B, num_heads, T, head_dim)
context_permuted = torch.matmul(attn_probs, v_reshaped)
# Bring T in front of the head axis BEFORE flattening; viewing (B, heads, T, head_dim)
# directly as (B, T, hidden) would interleave data from different heads and positions.
context_final = (
    context_permuted.permute(0, 2, 1, 3)
    .contiguous()
    .view(batch_size, seq_len, num_heads * head_dim)
)  # (B, T, hidden)
context_for_cproj = context_final.permute(0, 2, 1)  # (B, hidden, T)

# c_proj
out = sample_block.attn.c_proj(context_for_cproj)   # (B, hidden, T)
out = out.permute(0, 2, 1)                           # (B, T, hidden)

# first residual connection: the attention output is added to the block input
residual_attn = x + out

# layer norm 2, applied to the residual stream (biased variance again)
after_ln_2 = (residual_attn - residual_attn.mean(dim=-1, keepdim=True)) / torch.sqrt(
    residual_attn.var(dim=-1, keepdim=True, unbiased=False) + eps
) * sample_block.ln_2.weight + sample_block.ln_2.bias

# mlp c_fc
after_c_fc = sample_block.mlp.c_fc(after_ln_2.permute(0, 2, 1))

# GELU
after_act = sample_block.mlp.act(after_c_fc)

# mlp c_proj
after_c_proj = sample_block.mlp.c_proj(after_act)
after_mlp = after_c_proj.permute(0, 2, 1)

# second residual connection: added to the residual stream, not to the LayerNorm output
output = residual_attn + after_mlp

# The manual computation must reproduce the module's own forward pass.
with torch.no_grad():
    expected = sample_block(x)
assert torch.allclose(output, expected, atol=1e-4), "manual walkthrough diverges from TransformerBlock"

: 

In [None]:
import torch
import math
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fresh block for the KV-cache walkthrough. It must live on the same device as the inputs,
# and eval mode disables dropout so the result can be checked against a full forward pass.
sample_block = TransformerBlock(config=config).to(device).eval()

batch_size = 1
num_heads  = config.n_head                    # 12 (GPT2Config names this field `n_head`)
head_dim   = config.hidden_size // num_heads  # 64
hidden_dim = config.hidden_size               # 768
ln_eps     = config.layer_norm_epsilon

# KV cache: starts as zero-length tensors of shape (B, num_heads, head_dim, 0).
k_cache = torch.zeros(batch_size, num_heads, head_dim, 0, device=device)
v_cache = torch.zeros(batch_size, num_heads, head_dim, 0, device=device)

# Process 10 tokens one at a time.
# x_tensors = [embedding_of_token1, embedding_of_token2, ...]
# Random vectors stand in for real embeddings here.
x_tensors = [torch.randn(batch_size, 1, hidden_dim, device=device) for _ in range(10)]

outputs = []  # final block output of every step, each of shape (B, 1, hidden)

for t, x_t in enumerate(x_tensors, start=1):
    # x_t.shape == (batch_size, 1, hidden_dim)
    # -------------------------------------------------------------------------
    # (A) First LayerNorm -> Q/K/V for the new token
    # -------------------------------------------------------------------------
    after_ln_1 = sample_block.ln_1(x_t)            # (B, 1, 768)
    ln1_for_conv = after_ln_1.permute(0, 2, 1)      # (B, 768, 1)
    c_attn_out   = sample_block.attn.c_attn(ln1_for_conv)
    # c_attn_out.shape == (B, 3*hidden=2304, 1)

    # Split into Q_new / K_new / V_new
    q_new = c_attn_out[:, 0:hidden_dim, :]                 # (B, 768, 1)
    k_new = c_attn_out[:, hidden_dim:2 * hidden_dim, :]     # (B, 768, 1)
    v_new = c_attn_out[:, 2 * hidden_dim:3 * hidden_dim, :]  # (B, 768, 1)

    # Reshape per head: (B, 768, 1) -> (B, num_heads, head_dim, 1)
    q_new_reshaped = q_new.reshape(batch_size, num_heads, head_dim, 1)  # (B,12,64,1)
    k_new_reshaped = k_new.reshape(batch_size, num_heads, head_dim, 1)  # (B,12,64,1)
    v_new_reshaped = v_new.reshape(batch_size, num_heads, head_dim, 1)  # (B,12,64,1)

    # -------------------------------------------------------------------------
    # (B) Update the KV cache
    # -------------------------------------------------------------------------
    #   k_cache / v_cache hold the (B,12,64,t-1) state built by the previous steps;
    #   append the new (B,12,64,1) key and value along the time axis.
    k_cache = torch.cat([k_cache, k_new_reshaped], dim=-1)  # (B,12,64,t)
    v_cache = torch.cat([v_cache, v_new_reshaped], dim=-1)  # (B,12,64,t)

    # -------------------------------------------------------------------------
    # (C) Scaled dot-product attention for the single new token
    # -------------------------------------------------------------------------
    # (C1) Q @ K^T -> scores. No causal mask is needed: the cache only contains
    #      keys for positions <= t, so the new query cannot see the future.
    q_for_matmul = q_new_reshaped.permute(0, 1, 3, 2)         # (B,12,1,64)

    #   (B,12,1,64) @ (B,12,64,t) -> (B,12,1,t)
    attn_scores = torch.matmul(q_for_matmul, k_cache)        # (B,12,1,t)
    attn_scores = attn_scores / math.sqrt(head_dim)          # scale

    # (C2) Softmax over the t cached positions (past + current).
    attn_probs = torch.softmax(attn_scores, dim=-1)          # (B,12,1,t)

    # (C3) Attention weights @ V
    v_for_matmul = v_cache.permute(0, 1, 3, 2)                # (B,12,t,64)

    #   (B,12,1,t) @ (B,12,t,64) -> (B,12,1,64)
    context = torch.matmul(attn_probs, v_for_matmul)         # (B,12,1,64)

    # -------------------------------------------------------------------------
    # (D) Merge heads and apply c_proj -> attention output
    # -------------------------------------------------------------------------
    # (D1) (B,12,1,64) -> (B,1,12,64)
    context_permuted = context.permute(0, 2, 1, 3)            # (B,1,12,64)

    # (D2) (B,1,12,64) -> (B,1,12*64=768)
    context_final = context_permuted.contiguous().view(batch_size, 1, num_heads * head_dim)

    # (D3) c_proj
    context_for_cproj = context_final.permute(0, 2, 1)        # (B,768,1)
    attn_out = sample_block.attn.c_proj(context_for_cproj)   # (B,768,1)
    attn_out = attn_out.permute(0, 2, 1)                      # (B,1,768)

    # (D4) First residual connection, then LayerNorm 2 written out by hand.
    residual_attn = x_t + attn_out                            # (B,1,768)

    # LayerNorm normalises over the hidden axis using the biased variance.
    ln2_mean = residual_attn.mean(dim=-1, keepdim=True)                     # (B,1,1)
    ln2_var  = residual_attn.var(dim=-1, keepdim=True, unbiased=False)      # (B,1,1)
    norm = (residual_attn - ln2_mean) / torch.sqrt(ln2_var + ln_eps)        # (B,1,768)
    after_ln_2 = norm * sample_block.ln_2.weight + sample_block.ln_2.bias   # (B,1,768)

    # -------------------------------------------------------------------------
    # (E) MLP: c_fc -> GELU -> c_proj -> residual
    # -------------------------------------------------------------------------
    # (E1) c_fc (hidden -> 4*hidden)
    mlp_in = after_ln_2.permute(0, 2, 1)                 # (B,768,1)
    after_c_fc = sample_block.mlp.c_fc(mlp_in)           # (B,3072,1)

    # (E2) activation (GELU)
    after_act_fc = sample_block.mlp.act(after_c_fc)      # (B,3072,1)

    # (E3) c_proj (4*hidden -> hidden)
    after_c_proj = sample_block.mlp.c_proj(after_act_fc) # (B,768,1)

    # (E4) back to (batch, 1, hidden)
    after_mlp = after_c_proj.permute(0, 2, 1)             # (B,1,768)

    # (E5) Second residual connection -> block output. The MLP output is added to the
    #      residual stream (x_t + attn_out), not to the LayerNorm output.
    output_t = residual_attn + after_mlp                  # (B,1,768)

    # (F) Store the result; it can feed the next layer or a final classifier head.
    outputs.append(output_t)

    # - To generate text, the next iteration's x_t would be the embedding of the newly
    #   predicted token (e.g. the LM head's argmax); the updated key/value cache then lets
    #   the next prediction proceed without recomputing earlier positions.

# Token-by-token decoding with a KV cache must match one causal pass over the full sequence.
with torch.no_grad():
    full_pass = sample_block(torch.cat(x_tensors, dim=1))   # (B,10,768)
assert torch.allclose(torch.cat(outputs, dim=1), full_pass, atol=1e-4), \
    "KV-cache decoding diverges from the full-sequence forward pass"


torch.Size([1, 768, 128])


In [None]:
# Scratch re-run of the MLP expansion step; relies on `sample_block` and `after_ln_2` left in
# the kernel by an earlier cell, so its result depends on which cell ran last.
after_c_fc= sample_block.mlp.c_fc(after_ln_2.permute(0, 2, 1))

In [None]:
# Scratch re-run of the MLP contraction step.
# NOTE(review): reads `after_act`, which this cell's predecessor does not recompute - it is
# whatever an earlier walkthrough cell left in the kernel.
after_c_proj = sample_block.mlp.c_proj(after_act)

In [60]:
# Swap the last two axes of the MLP output (channels-first -> channels-last).
after_mlp = after_c_proj.permute(0, 2, 1)

In [61]:
# NOTE(review): TransformerBlock.forward adds the MLP output to the residual stream
# (x + attn_out), not to the LayerNorm-2 output - confirm this sum is what is intended.
output = after_ln_2 + after_mlp