In [48]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
from torch.utils.data import Dataset , DataLoader
import pandas as pd

import os

from transformers import BertTokenizer

# **seed 고정**

In [49]:
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

GPU

In [50]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


# **Hugging face dateset가져오기**

In [51]:
# Hugging Face datasets 라이브러리 설치 (설치되지 않은 경우)
!pip install datasets

# GLUE 데이터셋 불러오기
from datasets import load_dataset

# MRPC (Microsoft Research Paraphrase Corpus) 태스크 로드 예시
dataset = load_dataset("glue", "mrpc")



In [52]:
# 데이터셋 정보 출력
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})


sentence1 , sentence2의 동의어 관계 판별 ,

---
label : 1 -> 동의어 관계
label : 0 -> 동의어 관계 아님

---
idx : 고유 id



In [53]:
# 훈련, 검증 데이터 예시 출력
print(f"Train Example: {dataset['train'][0]}")
print(f"Validation Example: {dataset['validation'][0]}")

Train Example: {'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0}
Validation Example: {'sentence1': "He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .", 'sentence2': '" The foodservice pie business does not fit our long-term growth strategy .', 'label': 1, 'idx': 9}


# **토큰화**

In [54]:
# 2. BERT 토크나이저 불러오기 (사전 학습된 BERT 모델 사용)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")



In [55]:
# 3. 토큰화 함수 정의
def tokenize_function(examples):
    # sentence1과 sentence2를 함께 토큰화 (Padding, Truncation 처리)
    return tokenizer(examples["sentence1"],
                     examples["sentence2"],
                     padding="max_length",
                     truncation=True,
                     max_length=128)

In [56]:
# 4. 데이터셋에 토큰화 적용
tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [57]:
# 5. 토큰화된 데이터 예시 출력
print(tokenized_dataset["train"][0])

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0, 'input_ids': [101, 2572, 3217, 5831, 5496, 2010, 2567, 1010, 3183, 2002, 2170, 1000, 1996, 7409, 1000, 1010, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102, 7727, 2000, 2032, 2004, 2069, 1000, 1996, 7409, 1000, 1010, 2572, 3217, 5831, 5496, 2010, 2567, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 

구조 :

---

(sentence1 원문 , sentence2 원문 , label , idx , input_ids1 & 2 , token_type_ids , attention_mask)

---
token_type_ids : 두 문장(문장1, 문장2)을 구분하기 위한 인코딩 (BERT에서 사용),

---

attention_mask : 패딩된 부분을 무시하도록 마스킹 처리 (1은 유효한 토큰, 0은 패딩)


# **배치화**

In [58]:
# 데이터셋을 DataLoader에 넣기
train_loader = DataLoader(tokenized_dataset['train'], batch_size=16, shuffle=True , drop_last=True)

In [59]:
# 3. 배치 반복 및 크기 확인
for batch in train_loader:
    print(batch.keys())  # 딕셔너리의 키 확인 (예: input_ids, attention_mask, label)

    input_ids = batch['input_ids']
    print(f"Input IDs shape: {len(input_ids)}")
    print(f"Input IDs shape: {len(input_ids[0])}")

    attention_mask = batch['attention_mask']
    print(f"Attention Mask shape: {len(attention_mask)}")
    print(f"Attention Mask shape: {len(attention_mask[0])}")
    break  # 첫 배치만 확인

dict_keys(['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'])
Input IDs shape: 128
Input IDs shape: 16
Attention Mask shape: 128
Attention Mask shape: 16


# **특정 배치에서 sequence_length가 4로 나머지 16과 불일치**

# **필드 추출**

In [60]:
# 5. DataLoader에서 배치 추출 및 Encoder에 입력 예시
for batch in train_loader:
    # 필요한 텐서 추출
    input_ids = batch['input_ids']
    # 어디 문장 소속인지 여부를 표시하는 텐서 추가
    token_type_ids = batch['token_type_ids']
    attention_mask = batch['attention_mask']

    print(f"Batch size: {len(input_ids)}, Sequence length: {len(input_ids[0])}")

Batch size: 128, Sequence length: 16
Batch size: 128, Sequence length: 16
Batch size: 128, Sequence length: 16
Batch size: 128, Sequence length: 16
Batch size: 128, Sequence length: 16
Batch size: 128, Sequence length: 16
Batch size: 128, Sequence length: 16
Batch size: 128, Sequence length: 16
Batch size: 128, Sequence length: 16
Batch size: 128, Sequence length: 16
Batch size: 128, Sequence length: 16
Batch size: 128, Sequence length: 16
Batch size: 128, Sequence length: 16
Batch size: 128, Sequence length: 16
Batch size: 128, Sequence length: 16
Batch size: 128, Sequence length: 16
Batch size: 128, Sequence length: 16
Batch size: 128, Sequence length: 16
Batch size: 128, Sequence length: 16
Batch size: 128, Sequence length: 16
Batch size: 128, Sequence length: 16
Batch size: 128, Sequence length: 16
Batch size: 128, Sequence length: 16
Batch size: 128, Sequence length: 16
Batch size: 128, Sequence length: 16
Batch size: 128, Sequence length: 16
Batch size: 128, Sequence length: 16
B

# **데이터 처리**

In [61]:
# batch는 dictionary 형태이므로 필요한 필드만 따로 추출해야한다
# token embeddings + segment embeddings + positional embeddings

# **Encoder**

In [62]:
class MultiHeadAttention(nn.Module):
  def __init__(self, input_size = 8 , embedding_dim = 128 , num_heads = 8):
    super().__init__()
    self.embedding_dim = embedding_dim
    self.num_heads = num_heads
    self.head_dim = embedding_dim // num_heads
    self.input_size = input_size

    self.q = nn.Linear(input_size , embedding_dim)
    self.k = nn.Linear(input_size , embedding_dim)
    self.v = nn.Linear(input_size , embedding_dim)

    self.fc = nn.Linear(self.head_dim , embedding_dim)

  def go(self , x):
    batch_size = x.size(0)

    q = self.q(x)
    k = self.k(x)
    v = self.v(x)

    q = q.view(batch_size , -1 , self.num_heads , self.head_dim).transpose(1,2) # batch , sequence_length , 8 , 128//8 = 16
    k = k.view(batch_size , -1 , self.num_heads , self.head_dim).transpose(1,2)
    v = v.view(batch_size , -1 , self.num_heads , self.head_dim).transpose(1,2)

    # Scaled Dot-Product Attention
    attention_score = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)
    attention_score = torch.softmax(attention_score, dim=-1)
    attention = torch.matmul(attention_score , v)
    attention = attention.transpose(1,2).contiguous().view(batch_size , -1 , self.embedding_dim) # batch , 8 , sequence_length , 128//8 = 16
    output = self.fc(attention)

    return output # batch , 8 , sequence_length , 512 // 8 = 64

# **인풋의 길이가 전부 8로 일정하므로 padding을 추가하지 않는다**

In [124]:
class Pre_process(nn.Module):
    def __init__(self , embed_size = 16, num_heads = 8, vocab_size = 40000 , batch_num = 128 , max_length = 16, dropout=0.1):
        super().__init__()
        self.embed_size = embed_size

        # Embedding layer
        self.word_embedding = nn.Embedding(vocab_size, embed_size)
        self.position_embedding = nn.Parameter(torch.zeros(batch_num , embed_size))

        self.dropout = nn.Dropout(dropout)

    def run(self, input , tocken_type_ids , mask = None):
        batch_num, seq_length = input.shape
        print(0)
        out = self.word_embedding(input) + self.position_embedding + token_type_ids  # 단어 임베딩 + position + 문장 소속(binary)
        print(1)
        out = self.dropout(out)
        return out # out.shape : batch_num , embed_size = 128 , 16

x -> Encoder -> y -> multiheadattention -> z -> Encoder_block

# **Encoder block**

In [125]:
class Encoder(nn.Module):
  def __init__(self , embed_size = 128 , num_heads = 8 , dropout = 0.1):
    super().__init__()

    self.norm1 = nn.LayerNorm(embed_size)
    self.norm2 = nn.LayerNorm(embed_size)

    self.l1 = nn.Linear(embed_size , embed_size)
    self.l2 = nn.Linear(embed_size , 1)

  def update(self , x , input):
    #skip connection
    #transformer말고 bert식으로 먼저 더하고 layernorm취한다
    x = x + input
    x = self.norm1(x)

    z = self.l1(x)

    x = x + z
    x = self.l2(x)
    x = self.norm2(x)
    return x

Model 설계

In [126]:
attention_model = MultiHeadAttention()
preprocess_model = Pre_process()
encoder = Encoder()

In [127]:
class Model(nn.Module):
  def __init__(self , attention_model , preprocess_model , encoder):
    super().__init__()
    self.attention_model = attention_model
    self.preprocess_model = preprocess_model
    self.encoder = encoder

  def forward(self , input , token_type_ids):
    x_1 = self.preprocess_model.run(input , token_type_ids)
    x_2 = self.attention_model.go(x_1)
    x_3 = self.encoder.update(x_2 , x_1)
    return x_3

# **아키텍쳐**

In [128]:
model = Model(attention_model , preprocess_model , encoder)
optimizer =  torch.optim.Adam(model.parameters(), lr=0.001)

In [129]:
cnt = 0
loss_history = []
test_loss_history = []
ce_loss = nn.CrossEntropyLoss()

epochs = 4

model.train()
for epoch in range(epochs):
  for batch in train_loader:

    # 필요한 텐서 추출
    input_ids = batch['input_ids']
    input_ids = torch.stack(input_ids)

    # 어디 문장 소속인지 여부를 표시하는 텐서 추가
    token_type_ids = batch['token_type_ids']
    token_type_ids = torch.stack(token_type_ids)

    #label
    labels = batch['label']

    #print(model.device)
    output = model.forward(input_ids , token_type_ids)
    loss = ce_loss(output, labels)

    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # loss history append
    loss_history.append(loss.item())

  #scheduler_linear.step()
  print("epoch : {} , loss : {}".format(epoch, loss))

0
torch.Size([128, 16, 16])
torch.Size([128, 16])
torch.Size([128, 16])


RuntimeError: The size of tensor a (16) must match the size of tensor b (128) at non-singleton dimension 1