# KLUE BERT를 활용한 뉴스 분류 모델 만들기

## 훈련 환경 세팅

In [1]:
!pip install transformers[torch]
# restart the session

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.26.1


In [1]:
import os
# Both variables are set before torch is imported in a later cell.
# CUDA_LAUNCH_BLOCKING=1 is a debugging aid: kernel launches run synchronously,
# so CUDA errors are reported at the failing call (at the cost of speed).
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# Expose only GPU index 0 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
import pandas as pd
import numpy as np
import pickle
import re
import tqdm
from google.colab import drive


In [3]:
import torch
# import datasets
import sys

In [4]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [8]:
# Load the tokenizer for the pretrained checkpoint; MODEL_NAME is reused
# later when the classification model itself is loaded.
from transformers import AutoTokenizer
MODEL_NAME = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

## 데이터 불러오기

In [5]:
from google.colab import drive
drive.mount('/content/drive') # The mount point cannot be changed to an arbitrary sub-path.

Mounted at /content/drive


In [7]:
import pickle
# Load the preprocessed news table; later cells read its 'document' and
# 'category' columns.
# NOTE(review): pickle.load can execute arbitrary code stored in the file --
# only load pickles that come from a trusted source.
with open("/content/drive/MyDrive/data/ppc_data.pickle","rb") as fr:
    news_df = pickle.load(fr)

In [22]:
# Show how many rows each category id has (the classes are imbalanced).
print(news_df['category'].value_counts())


1    855
2    658
3    285
4    189
0    162
6     71
5     68
Name: category, dtype: int64


In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on 'category' keeps the
# class proportions the same in both splits; random_state fixes the shuffle.
train_data, test_data = train_test_split(news_df, test_size=0.2, random_state=42, stratify=news_df['category'])


In [12]:
# Tokenize every training document in a single call and get PyTorch tensors back.
# padding/truncation make all rows the same length so they can be stacked
# (the first encoding inspected in the next cell has 512 tokens).
tokenized_train_sentences = tokenizer(
    list(train_data['document']),
    return_tensors="pt",
    padding=True,
    truncation=True,
    add_special_tokens=True,
    )

In [13]:
# Inspect the first encoded training example: summary, token strings,
# vocabulary ids and attention mask.
print(tokenized_train_sentences[0])
print(tokenized_train_sentences[0].tokens)
print(tokenized_train_sentences[0].ids)
print(tokenized_train_sentences[0].attention_mask)

Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
['[CLS]', '아이언맨', '올', '##트', '##먼', '터미', '##네이터', '오픈', '복귀', '개발', '가속', '개발', '빨', '##르', '##다', '인간', '똑똑', '##하', '##다', '시대', '기업', '지배', '##구조', '상업', '주의', '천국', '계단', '지옥', '##문', '[UNK]', '세상', '놀라', '##다', '오픈', '멀', '##다', '가운데', '사진', '이사회', '로부터', '해고', '당하', '##다', '닷새', '극', '##적', '복귀', '올', '##트', '##먼', '재', '##등', '##판', '인간', '똑똑', '##하', '##다', '일반인', '지능', '개발', '가속', '되', '##다', '것', '##임', '과학', '##계', '지배', '전망', '가', '##다', '아이언맨', '돕', '##다', '착하', '##다', '자비', '##스', '되', '##다', '아니', '##다', '인류', '말살', '나쁘', '##다', '터미', '##네이터', '되', '##다', '최대', '화두', '급', '##부', '있', '##다', '오픈', '늘', '##다', '자사', '공식', '트위터', '계정', '올', '##트', '##먼', '돌아오', '##다', '합의', '밝히', '##다', '따르', '##다', '일리', '##야', '수', '##츠', '##케', '##버', '오픈', '과학자', '지난', '주도', '##하', '##다', '쿠데타', '천하', '내리', '##다', '돼', '##다', '진영', '갈등', '개발', '속도', '둘러싸', '##다', '견해

In [14]:
# Tokenize the held-out test documents with the same settings as the training split.
tokenized_test_sentences = tokenizer(
    list(test_data['document']),
    return_tensors="pt",
    padding=True,
    truncation=True,
    add_special_tokens=True,
    )

In [15]:
# Category ids for each split, taken from the same frames (and therefore the
# same row order) as the documents that were tokenized above.
train_label = train_data['category'].values
test_label = test_data['category'].values

In [16]:
# Sanity check: label of the first training row.
print(train_label[0])

0


In [17]:
class SingleSentDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with class labels.

    Args:
        encodings: Mapping (anything with ``.items()``) from field name, e.g.
            ``input_ids``, to a container indexable by example position.
        labels: Sequence of labels indexable by example position; its length
            defines the dataset length.

    Each item is a dict holding one tensor per encoding field plus a
    ``labels`` tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor_copy(value):
        """Return ``value`` as an independent tensor.

        ``torch.tensor(existing_tensor)`` emits a copy-construct UserWarning on
        every call; ``clone().detach()`` is the recommended equivalent and
        still yields a copy that does not share storage with the source.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        item = {key: self._as_tensor_copy(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor_copy(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [18]:
# Wrap each split's encodings and labels so they can be indexed per example.
train_dataset = SingleSentDataset(tokenized_train_sentences, train_label)
test_dataset = SingleSentDataset(tokenized_test_sentences, test_label)

In [19]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
# For sentence classification, a classification head has to be attached on top of BERT.
# With transformers, that takes just a single library call! :-)

In [20]:
# Hyperparameters for the Trainer runs below (fine-tuning and evaluation).
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=4,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=500,               # report the training loss every 500 steps
    save_steps=500,                  # checkpoint interval, in steps
    save_total_limit=2               # cap on the number of checkpoints kept
)

In [23]:
# Load the pretrained checkpoint with a 7-output classification head, one
# output per category id (0-6). The head weights are newly initialised, as the
# load message below reports, so the model must be fine-tuned before use.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=7)
model.to(device, dtype=torch.float)
# Training-only Trainer: no eval dataset or metrics are attached here.
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
)

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
trainer.train() # Original note: about 30 minutes per epoch; the run recorded below finished all 5 epochs in roughly 1100 s.

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
500,1.0697
1000,0.7689
1500,0.557
2000,0.3443


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=2290, training_loss=0.6368952188949918, metrics={'train_runtime': 1100.3777, 'train_samples_per_second': 8.315, 'train_steps_per_second': 2.081, 'total_flos': 2407574234880000.0, 'train_loss': 0.6368952188949918, 'epoch': 5.0})

## 모델 평가

In [25]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    """Score a set of predictions for the Trainer.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the argmax over the last axis
            is taken as the predicted class).

    Returns:
        dict with keys 'accuracy', 'f1', 'precision' and 'recall'; the last
        three are computed with ``average='weighted'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    weighted = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['f1'] = weighted[2]
    metrics['precision'] = weighted[0]
    metrics['recall'] = weighted[1]
    return metrics

In [26]:
# Rebuild the Trainer around the already fine-tuned model, this time with
# compute_metrics attached so evaluation reports accuracy/F1/precision/recall.
# No train_dataset is passed: this instance is used for evaluation only.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics
)

In [27]:
# Evaluate on the held-out test split.
trainer.evaluate(eval_dataset=test_dataset)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 1.043134093284607,
 'eval_accuracy': 0.8122270742358079,
 'eval_f1': 0.8077971843165093,
 'eval_precision': 0.8095655415493219,
 'eval_recall': 0.8122270742358079,
 'eval_runtime': 15.0947,
 'eval_samples_per_second': 30.342,
 'eval_steps_per_second': 3.842}

In [None]:
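# Reference only (not executed): an alternative native PyTorch training loop.
# To run it, import BertForSequenceClassification, DataLoader and AdamW first,
# and pass num_labels=7 to from_pretrained so the head matches the 7 categories.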

# model = BertForSequenceClassification.from_pretrained(MODEL_NAME)
# model.to(device)
# model.train()

# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# optim = AdamW(model.parameters(), lr=5e-5)

# for epoch in range(3):
#     for batch in train_loader:
#         optim.zero_grad()
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['labels'].to(device)
#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs[0]
#         loss.backward()
#         optim.step()

In [28]:
# Prediction helper
def sentences_predict(sent, max_length=128):
    """Predict the category id for one sentence or for a batch of sentences.

    Args:
        sent: A single string, or a list/tuple of strings.
        max_length: Maximum number of tokens kept per input; longer inputs
            are truncated. Defaults to 128.

    Returns:
        For a single string, the predicted class index as a NumPy integer.
        For a list/tuple, a 1-D NumPy array with one class index per input.
    """
    model.eval()
    tokenized_sent = tokenizer(
            sent,
            return_tensors="pt",
            padding=True,  # lets unequal-length sentences be stacked; no effect on a single sentence
            truncation=True,
            add_special_tokens=True,
            max_length=max_length
    )
    tokenized_sent = tokenized_sent.to(device)

    with torch.no_grad():  # inference only: disable gradient tracking
        outputs = model(
            input_ids=tokenized_sent['input_ids'],
            attention_mask=tokenized_sent['attention_mask'],
            token_type_ids=tokenized_sent['token_type_ids']
            )

    logits = outputs[0].detach().cpu().numpy()
    # Take the argmax per row (class axis). A flattened argmax would return an
    # index into batch*num_labels, which is not a class id for batched input.
    result = np.argmax(logits, axis=-1)
    if isinstance(sent, str):
        return result[0]
    return result

In [32]:
# Spot-check the classifier on three news excerpts, printing one predicted
# category id per excerpt.
sample_articles = [
    "후티는 같은 반미·반이스라엘 세력인 하마스를 돕겠다며 지난해 11월 이스라엘과 연관된 선박을 공격하겠다고 선언했다. 홍해와 수에즈운하, 인도양을 잇는 요충지인 아덴만을 장악한 후티가 이스라엘과 무관한 선박까지 공격하면서 이 지역 해운은 마비된 상태다. 후티가 막아서기 전까지만 해도 이 일대는 전 세계 물동량의 15%가 지나가는 핵심 항로였다.",
    "지난 몇 년간 국내에서 투자가 가능했던 캐나다·독일 비트코인 현물 상장지수펀드(ETF) 거래가 돌연 중단됐다. 금융당국이 미국 증시에 상장한 비트코인 현물 ETF 거래를 금지한 여파로 분석된다. 금융당국의 일관성 없는 규제로 국내 증권사와 투자자의 혼란이 커지고 있다는 지적이 나온다.",
    "민주당혁신행동은 이날 언론에 배포한 “임종석, 노영민 두 전직 비서실장은 윤석열을 발탁한 진실부터 밝히고 출마하라”라는 글에서 “정권 교체의 계기를 제공하고 윤석열 정권 탄생에 기여한 인사들이 총선에 연이어 출마하는 황당한 일이 이어지고 있다. 임종석, 노영민 두 비서실장이 대표 사례다”라고 했다.",
]
for article in sample_articles:
    print(sentences_predict(article))


4
1
6


끗!