In [51]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
from torch.utils.data import Dataset , DataLoader
import pandas as pd

import os

from transformers import BertTokenizer

from transformers import default_data_collator

# **seed 고정**

In [2]:
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

GPU

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


# **Hugging face dateset가져오기**

In [4]:
# Hugging Face datasets 라이브러리 설치 (설치되지 않은 경우)
!pip install datasets

# GLUE 데이터셋 불러오기
from datasets import load_dataset

# MRPC (Microsoft Research Paraphrase Corpus) 태스크 로드 예시
dataset = load_dataset("glue", "mrpc")

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [5]:
# 데이터셋 정보 출력
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})


sentence1 , sentence2의 동의어 관계 판별 ,

---
label : 1 -> 동의어 관계
label : 0 -> 동의어 관계 아님

---
idx : 고유 id



In [6]:
# 훈련, 검증 데이터 예시 출력
print(f"Train Example: {dataset['train'][0]}")
print(f"Validation Example: {dataset['validation'][0]}")

Train Example: {'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0}
Validation Example: {'sentence1': "He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .", 'sentence2': '" The foodservice pie business does not fit our long-term growth strategy .', 'label': 1, 'idx': 9}


# **토큰화**

In [7]:
# 2. BERT 토크나이저 불러오기 (사전 학습된 BERT 모델 사용)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [8]:
# 3. 토큰화 함수 정의
def tokenize_function(examples):
    # sentence1과 sentence2를 함께 토큰화 (Padding, Truncation 처리)
    return tokenizer(examples["sentence1"],
                     examples["sentence2"],
                     padding="max_length",
                     truncation=True,
                     max_length=16)

In [9]:
# 4. 데이터셋에 토큰화 적용
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [10]:
# 5. 토큰화된 데이터 예시 출력
print(tokenized_dataset["train"][0])

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0, 'input_ids': [101, 2572, 3217, 5831, 5496, 2010, 2567, 1010, 3183, 2002, 2170, 1000, 1996, 7409, 1000, 1010, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102, 7727, 2000, 2032, 2004, 2069, 1000, 1996, 7409, 1000, 1010, 2572, 3217, 5831, 5496, 2010, 2567, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 

구조 :

---

(sentence1 원문 , sentence2 원문 , label , idx , input_ids1 & 2 , token_type_ids , attention_mask)

---
token_type_ids : 두 문장(문장1, 문장2)을 구분하기 위한 인코딩 (BERT에서 사용),

---

attention_mask : 패딩된 부분을 무시하도록 마스킹 처리 (1은 유효한 토큰, 0은 패딩)


# **배치화**

In [52]:
# 데이터셋을 DataLoader에 넣기
train_loader = train_loader = DataLoader(tokenized_dataset["train"], batch_size=16, collate_fn=default_data_collator)

In [53]:
# 3. 배치 반복 및 크기 확인
for batch in train_loader:
    print(batch.keys())  # 딕셔너리의 키 확인 (예: input_ids, attention_mask, label)

    input_ids = batch['input_ids']
    print(f"Input IDs shape: {len(input_ids)}")
    print(f"Input IDs shape: {len(input_ids[0])}")

    attention_mask = batch['attention_mask']
    print(f"Attention Mask shape: {len(attention_mask)}")
    print(f"Attention Mask shape: {len(attention_mask[0])}")
    break  # 첫 배치만 확인

dict_keys(['labels', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'])
Input IDs shape: 16
Input IDs shape: 128
Attention Mask shape: 16
Attention Mask shape: 128


# **특정 배치에서 sequence_length가 4로 나머지 16과 불일치**

# **필드 추출**

In [None]:
# 5. DataLoader에서 배치 추출 및 Encoder에 입력 예시
for batch in train_loader:
    # 필요한 텐서 추출
    input_ids = batch['input_ids']
    # 어디 문장 소속인지 여부를 표시하는 텐서 추가
    token_type_ids = batch['token_type_ids']
    attention_mask = batch['attention_mask']
    label = batch['labels']

    print(f"Batch size: {len(input_ids)}, Sequence length: {len(input_ids[0])}")
    print(f"label : {len(label)}")

# **데이터 처리**

In [41]:
# batch는 dictionary 형태이므로 필요한 필드만 따로 추출해야한다
# token embeddings + segment embeddings + positional embeddings

# **Encoder**

In [56]:
class MultiHeadAttention(nn.Module):
  def __init__(self, input_size = 16 , embedding_dim = 16*8 , num_heads = 8):
    super().__init__()
    self.embedding_dim = embedding_dim
    self.num_heads = num_heads
    self.head_dim = embedding_dim // num_heads
    self.input_size = input_size

    self.q = nn.Linear(embedding_dim , embedding_dim)
    self.k = nn.Linear(embedding_dim , embedding_dim)
    self.v = nn.Linear(embedding_dim , embedding_dim)

    self.fc = nn.Linear(embedding_dim , embedding_dim)

  def go(self , x):

    batch_size = x.shape[0]

    q = self.q(x) # batch_num , sequence_length , embed_size : 16 , 16 , 128
    k = self.k(x)
    v = self.v(x)

    q = q.view(batch_size , -1 , self.num_heads , self.head_dim).transpose(1,2) # batch , num_head, 16 , 16*8//8 = 16
    k = k.view(batch_size , -1 , self.num_heads , self.head_dim).transpose(1,2)
    v = v.view(batch_size , -1 , self.num_heads , self.head_dim).transpose(1,2)

    # Scaled Dot-Product Attention
    attention_score = (q @ k.transpose(3,2)) / (self.head_dim ** 0.5)
    attention_score = torch.softmax(attention_score, dim=-1)
    attention = torch.matmul(attention_score , v)
    attention = attention.transpose(1,2).contiguous().view(batch_size , -1 , self.embedding_dim) # batch , 8 , sequence_length , 16*8//8 = 16
    attention = attention.reshape(x.shape[0] , x.shape[1] , x.shape[2])
    output = self.fc(attention)

    return output # batch , sequence_length , 16*8

# **인풋의 길이가 전부 16으로 일정하므로 padding을 추가하지 않는다**

In [69]:
class Pre_process(nn.Module):
    def __init__(self , embed_size = 16*8 , num_heads = 8, vocab_size = 40000 , batch_num = 16 , max_length = 16, dropout=0.1):
        super().__init__()
        self.embed_size = embed_size
        self.max_length = max_length
        self.word_embedding = nn.Embedding(vocab_size, embed_size)
        self.position_embedding = nn.Parameter(torch.randn(1 , max_length , embed_size))
        self.dropout = nn.Dropout(dropout)
        self.batch_num = batch_num
    def run(self , input , token_type_ids , mask = None):
        batch, seq_length = input.shape
        z = self.word_embedding(input) # z.shape : batch_num , sequence_length , embedding_dim = 16 , 16 , 128

        positional_embed = self.position_embedding.expand(self.batch_num , self.max_length , self.embed_size)

        print(z.shape)
        print(positional_embed.shape)
        print(token_type_ids.unsqueeze(2).expand(self.batch_num,self.max_length,self.embed_size).shape)

        # 단어 임베딩 + position + 문장 소속(binary)
        out = z + positional_embed + token_type_ids.unsqueeze(2).expand(self.batch_num,self.max_length,self.embed_size)
        out = self.dropout(out)
        return out # out.shape : batch_num , sequence_length , embed_size = 16 , 16 , 128

x -> Encoder -> y -> multiheadattention -> z -> Encoder_block

# **Encoder block**

In [70]:
class Encoder_Block(nn.Module):
  def __init__(self , embed_size = 128 , num_heads = 8 , dropout = 0.1):
    super().__init__()

    self.relu = nn.ReLU()

    self.norm1 = nn.LayerNorm(embed_size)
    self.norm2 = nn.LayerNorm(1)

    self.l1 = nn.Linear(embed_size , embed_size)
    self.l2 = nn.Linear(embed_size , 1)
    self.l3 = nn.Linear(16 , 1)

    self.pre_process = Pre_process()
    self.attention = MultiHeadAttention()

    self.dropout = nn.Dropout(0.1)

  def update(self , input , token_type_ids):

    c = self.pre_process.run(input , token_type_ids)
    z = self.attention.go(c)

    z = z + c
    # print(z.shape) , [16, 16, 128]
    z1 = self.norm1(z)

    z = self.l1(z1)
    z = self.relu(z)

    #skip
    z = z1 + z  # z.shape : 16 ,16 , 128

    z = self.l2(z) # 16 ,16 , 1
    z = self.dropout(z)
    z = z.reshape(16 , 16)
    z = self.l3(z) # 16 , 1

    z = self.norm2(z) # 16 , 1

    return z # 16 , 1

Model 설계

In [71]:
attention_model = MultiHeadAttention()
preprocess_model = Pre_process()
model = Encoder_Block()

# **아키텍쳐**

In [72]:
#model = model.to(device)
optimizer =  torch.optim.Adam(model.parameters(), lr=0.001)

In [73]:
cnt = 0
loss_history = []
test_loss_history = []
ce_loss = nn.CrossEntropyLoss()

epochs = 4

model.train()
for epoch in range(epochs):
  for batch in train_loader:

    # 필요한 텐서 추출
    input_ids = batch['input_ids']
    #input_ids = torch.stack(input_ids)

    # 어디 문장 소속인지 여부를 표시하는 텐서 추가
    token_type_ids = batch['token_type_ids']
    #token_type_ids = torch.stack(token_type_ids)

    #label
    labels = batch['labels']

    #print(model.device)
    output = model.update(input_ids , token_type_ids)
    loss = ce_loss(output, labels)

    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # loss history append
    loss_history.append(loss.item())

  #scheduler_linear.step()
  print("epoch : {} , loss : {}".format(epoch, loss))

torch.Size([16, 128, 128])
torch.Size([16, 16, 128])


RuntimeError: The expanded size of the tensor (16) must match the existing size (128) at non-singleton dimension 1.  Target sizes: [16, 16, 128].  Tensor sizes: [16, 128, 1]

In [22]:
data_counts = {split: len(dataset[split]) for split in dataset}
print(data_counts)

{'train': 3668, 'validation': 408, 'test': 1725}


In [24]:
data_counts = {split: dataset[split][0] for split in dataset}
print(data_counts)

{'train': {'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0}, 'validation': {'sentence1': "He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .", 'sentence2': '" The foodservice pie business does not fit our long-term growth strategy .', 'label': 1, 'idx': 9}, 'test': {'sentence1': "PCCW 's chief operating officer , Mike Butcher , and Alex Arena , the chief financial officer , will report directly to Mr So .", 'sentence2': 'Current Chief Operating Officer Mike Butcher and Group Chief Financial Officer Alex Arena will report to So .', 'label': 1, 'idx': 0}}


In [75]:
for batch in train_loader:
  print(batch['labels'].shape)
  break

torch.Size([16])
