In [1]:
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, AdamW, get_scheduler

import torch
from torch.utils.data import DataLoader

from tqdm.auto import tqdm

# Load the "mrpc" configuration of the "glue" dataset
raw_datasets = load_dataset(path="glue", name="mrpc")

# Name of the pretrained checkpoint; the tokenizer and the model are both built from it
checkpoint = "bert-base-uncased"

# Instantiate the tokenizer that belongs to that checkpoint
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)

# Tokenization callback: encodes the sentence1/sentence2 columns as a pair
def tokenize_function(example):
    """Tokenize the ``sentence1``/``sentence2`` fields of ``example`` as a pair.

    Args:
        example: Mapping with ``"sentence1"`` and ``"sentence2"`` entries.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the pair, with
        truncation enabled. No padding argument is passed here.
    """
    first_sentence = example["sentence1"]
    second_sentence = example["sentence2"]
    return tokenizer(first_sentence, second_sentence, truncation=True)

# Tokenize every split; batched=True passes tokenize_function a batch of rows per call
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Collator that pads the examples of a batch at the time the batch is assembled
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Drop the raw text and index columns so only model inputs remain
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
# Rename the "label" column to "labels"
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
# Have the datasets return PyTorch tensors
tokenized_datasets.set_format("torch")

# Show the columns that are left
print(tokenized_datasets["train"].column_names)

# Single batch size shared by the training and evaluation loaders
batch_size = 8

# Training batches are drawn in shuffled order
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
# Evaluation gains nothing from shuffling, so the validation split is read in
# its stored order, which keeps the batch sequence identical between runs
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=batch_size,
    collate_fn=data_collator,
)

# Instantiate the pretrained model with a 2-label sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Use the GPU when one is available, otherwise the CPU. The model is moved
# before the optimizer is constructed, the order the torch.optim docs recommend.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# torch.optim.AdamW is used in place of the deprecated transformers.AdamW.
# eps and weight_decay are pinned to the defaults of the transformers class
# (1e-6 and 0.0) so the hyperparameters stay the same as before.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Number of passes over the training data
num_epochs = 3
# Total optimizer steps: one per training batch, per epoch
num_training_steps = num_epochs * len(train_dataloader)
# "linear" learning-rate schedule over all training steps, with no warmup steps
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Progress bar sized to the total number of training steps
progress_bar = tqdm(range(num_training_steps))

# Put the model in training mode before the loop starts
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch onto the selected device
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass, then read the loss off the model output
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass
        loss.backward()

        # Apply the update, advance the LR schedule, then clear the gradients
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # One tick per completed step
        progress_bar.update(1)
        
# Load the "glue"/"mrpc" evaluation metric
metric = load_metric("glue", "mrpc")

# Switch the model to evaluation mode
model.eval()
# Gradient tracking is disabled for the whole evaluation pass
with torch.no_grad():
    for batch in eval_dataloader:
        # Move every tensor of the batch onto the selected device
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)

        # Predicted class = index of the largest logit along the last dimension
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

# Compute the final scores; left as a bare trailing expression, so the value
# is only shown when the environment echoes it (e.g. a notebook cell)
metric.compute()

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset glue (/home/coronarita1991/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 3/3 [00:00<00:00, 415.14it/s]
Loading cached processed dataset at /home/coronarita1991/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-e79d948b262b7228.arrow
100%|██████████| 1/1 [00:00<00:00, 16.71ba/s]
Loading cached processed dataset at /home/coronarita1991/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-b830c95c0e4f9816.arrow


['labels', 'input_ids', 'token_type_ids', 'attention_mask']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

RuntimeError: CUDA error: unknown error
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.