In [1]:
import os
import sys
from pathlib import Path

from chrisbase.io import get_current_path
from chrisbase.util import to_dataframe
from chrisdict import AttrDict

# Collect the runtime environment (interpreter, project root, notebook path)
# into one mapping so it can be printed as a small table at the end of the cell.
env = AttrDict()
env["python_path"] = Path(sys.executable)

# Resolve the notebook location once; it is needed both to locate the project
# root and to express the notebook path relative to that root.
notebook_path = get_current_path()

# The project root is the first entry of `notebook_path.parents` whose name
# starts with "DeepKorean". Fail with an explicit message (instead of a bare
# IndexError from `[...][0]`) when the notebook is run outside such a directory.
project_path = next(
    (parent for parent in notebook_path.parents if parent.name.startswith("DeepKorean")),
    None,
)
if project_path is None:
    raise RuntimeError(
        f"No parent directory starting with 'DeepKorean' found above {notebook_path}"
    )

env["project_path"] = project_path
env["current_path"] = notebook_path.relative_to(project_path)

# Switch the working directory to the project root; later cells pass relative
# paths (e.g. "data", "checkpoints/nsmc") that are resolved from here.
os.chdir(project_path)
print(to_dataframe(env, columns=["key", "value"]))

            key                                            value
0   python_path  /dat/anaconda3/envs/DeepKorean-23.03/bin/python
1  project_path                       /dat/proj/DeepKorean-23.03
2  current_path                      tests/1-doc_cls-train.ipynb


In [2]:
################################################################################
# Code 4-4: Model environment configuration
################################################################################
from ratsnlp.nlpbook.classification import ClassificationTrainArguments

# Settings for fine-tuning on the "nsmc" corpus. All paths are relative to the
# current working directory.
train_settings = dict(
    # Where the pretrained model and corpus are read from, and where
    # checkpoints are written.
    pretrained_model_name="pretrained/KcBERT-Base",
    downstream_corpus_name="nsmc",
    downstream_corpus_root_dir="data",
    downstream_model_dir="checkpoints/nsmc",
    # Training hyper-parameters.
    batch_size=32,
    learning_rate=5e-5,
    max_seq_length=128,
    epochs=3,
    # Hardware and reproducibility.
    tpu_cores=0,
    seed=7,
    # overwrite_cache=True,
)
args = ClassificationTrainArguments(**train_settings)

# Leave `args` as the last expression so the notebook displays the resolved
# argument object, including fields that were not set explicitly above.
args

ClassificationTrainArguments(pretrained_model_name='pretrained/KcBERT-Base', downstream_task_name='document-classification', downstream_corpus_name='nsmc', downstream_corpus_root_dir='data', downstream_model_dir='checkpoints/nsmc', max_seq_length=128, save_top_k=1, monitor='min val_loss', seed=7, overwrite_cache=False, force_download=False, test_mode=False, learning_rate=5e-05, epochs=3, batch_size=32, cpu_workers=32, fp16=False, tpu_cores=0)

In [3]:
################################################################################
# Code 4-5: Fix the random seed
################################################################################
from ratsnlp import nlpbook

# Seed the run from `args` (seed=7 in the arguments built earlier); the call
# prints the seed it applied. Which random number generators are covered is
# decided inside ratsnlp — presumably Python/NumPy/PyTorch, TODO confirm.
nlpbook.set_seed(args)

set seed: 7


In [4]:
################################################################################
# Code 4-6: Logger setup
################################################################################
# Configure ratsnlp's logger from `args`. As this cell's output shows, the call
# also logs the full set of training/evaluation parameters at INFO level.
nlpbook.set_logger(args)

INFO:ratsnlp:Training/evaluation parameters ClassificationTrainArguments(pretrained_model_name='pretrained/KcBERT-Base', downstream_task_name='document-classification', downstream_corpus_name='nsmc', downstream_corpus_root_dir='data', downstream_model_dir='checkpoints/nsmc', max_seq_length=128, save_top_k=1, monitor='min val_loss', seed=7, overwrite_cache=False, force_download=False, test_mode=False, learning_rate=5e-05, epochs=3, batch_size=32, cpu_workers=32, fp16=False, tpu_cores=0)


In [5]:
################################################################################
# Code 4-7: Download the corpus
################################################################################
from Korpora import Korpora

# Fetch the corpus named in `args` into the configured corpus root directory.
# When the files are already present, Korpora reports that and leaves them in
# place (see this cell's output).
fetch_options = {
    "corpus_name": args.downstream_corpus_name,
    "root_dir": args.downstream_corpus_root_dir,
    # "force_download": True,
}
Korpora.fetch(**fetch_options)

[Korpora] Corpus `nsmc` is already installed at /dat/proj/DeepKorean-23.03/data/nsmc/ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at /dat/proj/DeepKorean-23.03/data/nsmc/ratings_test.txt


In [6]:
################################################################################
# Code 4-8: Prepare the tokenizer
################################################################################
from transformers import BertTokenizer

# Load the tokenizer that ships with the pretrained model; lower-casing is
# explicitly disabled.
tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_name, do_lower_case=False)

# Quick sanity check: show how a short Korean sentence is split into subwords.
sample_sentence = "안녕하세요. 반갑습니다."
sample_tokens = tokenizer.tokenize(sample_sentence)
print(sample_tokens)

# Last expression: display the tokenizer's own summary as the cell result.
tokenizer

['안녕', '##하세요', '.', '반', '##갑', '##습니다', '.']


PreTrainedTokenizer(name_or_path='pretrained/KcBERT-Base', vocab_size=30000, model_max_len=300, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [7]:
################################################################################
# Code 4-10: Build the training data loader
################################################################################
from ratsnlp.nlpbook.classification import NsmcCorpus, ClassificationDataset
from torch.utils.data import DataLoader, RandomSampler

# The corpus object is shared with the evaluation dataset and the model config
# in later cells.
corpus = NsmcCorpus()

# Training split of the corpus, tokenized with the tokenizer prepared above.
train_dataset = ClassificationDataset(args=args, corpus=corpus, tokenizer=tokenizer, mode="train")

# Visit every training example once per epoch in random order (sampling
# without replacement).
train_sampler = RandomSampler(train_dataset, replacement=False)

# Batch the examples with ratsnlp's collator; the final partial batch is kept.
train_dataloader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=args.batch_size,
    num_workers=args.cpu_workers,
    collate_fn=nlpbook.data_collator,
    drop_last=False,
)

INFO:ratsnlp:Loading features from cached file data/nsmc/cached_train_BertTokenizer_128_nsmc_document-classification [took 6.936 s]


In [8]:
################################################################################
# Code 4-11: Build the evaluation data loader
################################################################################
from torch.utils.data import SequentialSampler

# The evaluation dataset is built from the corpus's "test" mode; it is the data
# passed to the trainer as validation data.
val_dataset = ClassificationDataset(args=args, corpus=corpus, tokenizer=tokenizer, mode="test")

# Evaluation iterates the examples in their stored order.
val_sampler = SequentialSampler(val_dataset)

# Same batching setup as the training loader; the final partial batch is kept.
val_dataloader = DataLoader(
    val_dataset,
    sampler=val_sampler,
    batch_size=args.batch_size,
    num_workers=args.cpu_workers,
    collate_fn=nlpbook.data_collator,
    drop_last=False,
)

INFO:ratsnlp:Loading features from cached file data/nsmc/cached_test_BertTokenizer_128_nsmc_document-classification [took 3.089 s]


In [9]:
################################################################################
# Code 4-12: Model initialization
################################################################################
from transformers import BertConfig, BertForSequenceClassification

# Both the configuration and the weights come from the same pretrained checkpoint.
checkpoint = args.pretrained_model_name

# Take the pretrained configuration and set the number of output labels from
# the corpus.
pretrained_model_config = BertConfig.from_pretrained(checkpoint, num_labels=corpus.num_labels)

# Load the pretrained weights into a sequence-classification model built from
# that configuration. The loader's messages about unused and newly initialized
# weights appear in this cell's output.
model = BertForSequenceClassification.from_pretrained(checkpoint, config=pretrained_model_config)

Some weights of the model checkpoint at pretrained/KcBERT-Base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not ini

In [10]:
################################################################################
# Code 4-13: Define the task
################################################################################
from ratsnlp.nlpbook.classification import ClassificationTask

# Wrap the model together with the training arguments; `task` is the object
# handed to `trainer.fit` in a later cell. Presumably a PyTorch Lightning
# module that defines the training/validation steps — TODO confirm in ratsnlp.
task = ClassificationTask(model, args)

In [11]:
################################################################################
# Code 4-14: Define the trainer
################################################################################
# Build the trainer from `args`. The cell output reports which accelerators
# (GPU/TPU/IPU/HPU) are available and used. Presumably this returns a PyTorch
# Lightning Trainer configured with the checkpoint directory and epoch count
# from `args` — TODO confirm in ratsnlp.
trainer = nlpbook.get_trainer(args)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [12]:
################################################################################
# Code 4-15: Start training
################################################################################
# Run training on `task`, drawing training batches from `train_dataloader` and
# validation batches from `val_dataloader`. Note that `val_dataloader` was built
# from the corpus's "test" mode, so validation here runs on that data.
trainer.fit(
    task,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

Missing logger folder: /dat/proj/DeepKorean-23.03/checkpoints/nsmc/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
  rank_zero_warn(

  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 108 M 
--------------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
435.680   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]