In [1]:
# MRPC(Microsoft Research Paraphrase Corpus) 데이터셋 각 문장 쌍은 의미적으로 유사한지 아닌에 대한 레이블(1:유사,0:비유사)
# 문장 유사도, 의미파악, 텍스트 중복 제거와 같은 문제를 다룬다
# 레이블(1:유사,0:비유사) 영어 문장 유사도 카테고리를 분류하는 텍스트 분류 및 추론, pre-trained model(bert-base-uncased) 적용

In [2]:
# 1️⃣ 데이터셋 불러오기
from datasets import load_dataset

mrpc_dataset = load_dataset("glue", "mrpc")
train_dataset = mrpc_dataset["train"]
valid_dataset = mrpc_dataset["validation"]
test_dataset = mrpc_dataset["test"]

print(train_dataset)
print(train_dataset[0])

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})
{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0}


In [3]:
# 2️⃣ 모델과 토크나이저 준비
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_id = "bert-base-uncased"  # 영어용 BERT
tokenizer = AutoTokenizer.from_pretrained(model_id) # AutoTokenizer 
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2) # pre-tained model(bert-base-uncased), 레이블(1:유사,0:비유사) 분류
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# 3️⃣ 토큰화 함수 정의
def tokenize_function(examples):
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

In [58]:
# 4️⃣ 데이터로더 생성 함수
def make_dataloader(dataset, batch_size, shuffle=True):
    dataset = dataset.map(tokenize_function, batched=True).with_format("torch")
    dataset = dataset.rename_column("label", "labels")
    dataset = dataset.remove_columns(["sentence1", "sentence2", "idx"])
    
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [59]:
# 5️⃣ 데이터로더 생성
train_dataloader = make_dataloader(train_dataset, batch_size=8, shuffle=True)
valid_dataloader = make_dataloader(valid_dataset, batch_size=8, shuffle=False)
test_dataloader = make_dataloader(test_dataset, batch_size=8, shuffle=False)

In [7]:
# 6️⃣ 학습 함수
def train_epoch(model, data_loader, optimizer):
    model.train()
    total_loss = 0
    for batch in tqdm(data_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(data_loader)

In [8]:
# 7️⃣ 평가 함수
def evaluate(model, data_loader):
    model.eval()
    total_loss = 0
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits
            loss = outputs.loss
            total_loss += loss.item()
            preds = torch.argmax(logits, dim=-1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    avg_loss = total_loss / len(data_loader)
    accuracy = np.mean(np.asarray(predictions) == np.asarray(true_labels))
    return avg_loss, accuracy

In [9]:
# 8️⃣ 학습 수행
num_epochs = 5
optimizer = AdamW(model.parameters(), lr=5e-5)

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    train_loss = train_epoch(model, train_dataloader, optimizer)
    print(f"Training loss: {train_loss}")
    valid_loss, valid_accuracy = evaluate(model, valid_dataloader)
    print(f"Validation loss: {valid_loss}")
    print(f"Validation accuracy: {valid_accuracy}")

Epoch 1/5


  0%|          | 0/459 [00:00<?, ?it/s]

Training loss: 0.5564437511180221


  0%|          | 0/51 [00:00<?, ?it/s]

Validation loss: 0.5327686048021504
Validation accuracy: 0.7205882352941176
Epoch 2/5


  0%|          | 0/459 [00:00<?, ?it/s]

Training loss: 0.39180360020245864


  0%|          | 0/51 [00:00<?, ?it/s]

Validation loss: 0.39861923982115355
Validation accuracy: 0.8308823529411765
Epoch 3/5


  0%|          | 0/459 [00:00<?, ?it/s]

Training loss: 0.261898044200537


  0%|          | 0/51 [00:00<?, ?it/s]

Validation loss: 0.4605114497792195
Validation accuracy: 0.8284313725490197
Epoch 4/5


  0%|          | 0/459 [00:00<?, ?it/s]

Training loss: 0.14635189606002602


  0%|          | 0/51 [00:00<?, ?it/s]

Validation loss: 0.6117820032488774
Validation accuracy: 0.8259803921568627
Epoch 5/5


  0%|          | 0/459 [00:00<?, ?it/s]

Training loss: 0.127768576315103


  0%|          | 0/51 [00:00<?, ?it/s]

Validation loss: 0.5285419749424738
Validation accuracy: 0.8186274509803921


In [10]:
# 9️⃣ 테스트 평가
_, test_accuracy = evaluate(model, test_dataloader)
print(f"Test accuracy: {test_accuracy}")

  0%|          | 0/216 [00:00<?, ?it/s]

Test accuracy: 0.8214492753623188
