# 각종 설정
모델 하이퍼파라미터(hyperparameter)와 저장 위치 등의 설정 정보를 선언합니다.

In [1]:
from chrisbase.util import to_dataframe
from chrislab.common.util import GpuProjectEnv
from ratsnlp.nlpbook.classification import ClassificationTrainArguments

# Describe the project environment for this run; GPU "0" is requested.
env = GpuProjectEnv(
    project_name="DeepKorean",
    working_gpus="0",
)

# Fine-tuning arguments for the NSMC task, grouped by concern.
args = ClassificationTrainArguments(
    # Config file name: the running notebook's name with a .json suffix.
    working_config_file=env.running_file.with_suffix('.json').name,
    # Inputs: pretrained model location and downstream corpus.
    pretrained_model_path="model/pretrained/KcBERT-Base",
    downstream_data_home="data",
    downstream_data_name="nsmc",
    # Outputs: where fine-tuned checkpoints go and how they are named.
    downstream_model_path="model/finetuned/nsmc",
    downstream_model_file="{epoch}-{val_loss:.3f}-{val_acc:.3f}",
    # NOTE(review): presumably "keep the 3 checkpoints with the highest
    # val_acc" -- confirm against ClassificationTrainArguments.
    monitor="max val_acc",
    save_top_k=3,
    # Training schedule, batching, and reproducibility settings.
    epochs=3,
    batch_size=360,
    learning_rate=5e-5,
    max_seq_length=128,
    cpu_workers=24,
    seed=7,
)

# Save the arguments; the returned value is reused by the next cell to
# reload the configuration.
config = args.save_working_config()
# Last expression of the cell: display the environment as a table.
to_dataframe(env)

Unnamed: 0,key,value
0,hostname,dl012
1,hostaddr,129.254.182.78
2,python_path,/data/dlt/mambaforge/envs/DeepKorean-23.03/bin/python3.10
3,project_name,DeepKorean
4,project_path,/data/dlt/proj/DeepKorean-23.03
5,working_path,/data/dlt/proj/DeepKorean-23.03
6,running_file,tests/1-doc_cls-train.ipynb
7,working_gpus,0
8,number_of_gpus,1


In [2]:
from pathlib import Path

# Normalize to a Path *before* touching the filesystem, so the existence
# check works whether save_working_config() returned a str or a Path.
config = Path(config)
# Raise explicitly instead of using assert, which is stripped under `-O`.
if not config.exists():
    raise FileNotFoundError(f"No config file: {config}")
# Rebuild the training arguments from the saved JSON so the rest of the
# notebook runs on exactly what was written to disk.
args = ClassificationTrainArguments.from_json(config.read_text())
# Last expression of the cell: display the reloaded arguments as a table.
to_dataframe(args)

Unnamed: 0,key,value
0,pretrained_model_path,model/pretrained/KcBERT-Base
1,downstream_model_path,model/finetuned/nsmc
2,downstream_model_file,{epoch}-{val_loss:.3f}-{val_acc:.3f}
3,downstream_conf_file,1-doc_cls-train.json
4,downstream_data_home,data
5,downstream_data_name,nsmc
6,downstream_task_name,document-classification
7,max_seq_length,128
8,save_top_k,3
9,monitor,max val_acc


# 랜덤 시드 고정 및 로거 설정
학습 결과를 재현할 수 있도록 랜덤 시드를 고정한 후, 로거를 설정합니다.

In [3]:
from ratsnlp import nlpbook

# Fix the random seed from the training arguments (seed=7 in this run) so
# the fine-tuning run can be reproduced.
nlpbook.set_seed(args)
# Configure logging for the remaining cells.
nlpbook.set_logger()

set seed: 7


# 말뭉치 다운로드
실습에 사용할 말뭉치를 다운로드합니다.

In [4]:
from Korpora import Korpora

# Fetch the corpus named in the arguments into the configured data root.
Korpora.fetch(corpus_name=args.downstream_data_name, root_dir=args.downstream_data_home)

[Korpora] Corpus `nsmc` is already installed at /data/dlt/proj/DeepKorean-23.03/data/nsmc/ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at /data/dlt/proj/DeepKorean-23.03/data/nsmc/ratings_test.txt


# 토크나이저 준비
토큰화를 수행하는 토크나이저를 선언합니다.

In [5]:
from transformers import BertTokenizer

# Load the tokenizer that ships with the pretrained model; lower-casing is
# explicitly disabled.
tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_path, do_lower_case=False)
# Quick sanity check: tokenize a short sample sentence and print the pieces.
print(f"tokenized={tokenizer.tokenize('안녕하세요. 반갑습니다.')}")
# Last expression of the cell: display the tokenizer's configuration.
tokenizer

tokenized=['안녕', '##하세요', '.', '반', '##갑', '##습니다', '.']


BertTokenizer(name_or_path='model/pretrained/KcBERT-Base', vocab_size=30000, model_max_length=300, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

# 학습데이터 구축
학습데이터를 구축합니다.

In [6]:
from ratsnlp.nlpbook.classification import NsmcCorpus, ClassificationDataset
from torch.utils.data import DataLoader, RandomSampler

# Corpus reader shared by the training and evaluation datasets.
corpus = NsmcCorpus()

# Training split of the corpus, built with the tokenizer and arguments.
train_dataset = ClassificationDataset(args=args, corpus=corpus, tokenizer=tokenizer, mode="train")

# Draw training examples in random order, without replacement.
train_sampler = RandomSampler(train_dataset, replacement=False)

# Batches are assembled by nlpbook.data_collator; the final partial batch
# is kept (drop_last=False).
train_dataloader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=args.batch_size,
    num_workers=args.cpu_workers,
    collate_fn=nlpbook.data_collator,
    drop_last=False,
)

INFO:ratsnlp:Loading features from cached file data/nsmc/cached_train_BertTokenizer_128_nsmc_document-classification [took 21.369 s]


# 평가데이터 구축
학습 중에 사용할 평가데이터를 구축합니다.

In [7]:
from torch.utils.data import SequentialSampler

# The corpus "test" split is used as the validation set during training.
val_dataset = ClassificationDataset(args=args, corpus=corpus, tokenizer=tokenizer, mode="test")

# Evaluate in a fixed, sequential order.
val_sampler = SequentialSampler(val_dataset)

# Same batching setup as training: nlpbook.data_collator builds batches
# and the final partial batch is kept (drop_last=False).
val_dataloader = DataLoader(
    val_dataset,
    sampler=val_sampler,
    batch_size=args.batch_size,
    num_workers=args.cpu_workers,
    collate_fn=nlpbook.data_collator,
    drop_last=False,
)

INFO:ratsnlp:Loading features from cached file data/nsmc/cached_test_BertTokenizer_128_nsmc_document-classification [took 7.212 s]


# 모델 초기화
사전학습 모델을 읽고, 문서 분류를 수행할 모델을 초기화합니다.

In [8]:
from transformers import BertConfig, BertForSequenceClassification

# Load the pretrained model's configuration, setting the number of labels
# to what the corpus defines.
pretrained_model_config = BertConfig.from_pretrained(args.pretrained_model_path, num_labels=corpus.num_labels)

# Initialize the sequence-classification model from the pretrained
# checkpoint using that configuration.
model = BertForSequenceClassification.from_pretrained(args.pretrained_model_path, config=pretrained_model_config)

Some weights of the model checkpoint at model/pretrained/KcBERT-Base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

# 학습 개시
준비한 데이터와 모델로 학습을 시작합니다.
학습 결과물은 미리 세팅한 위치(`args.downstream_model_path`)에 저장됩니다.

In [9]:
from chrisbase.io import JobTimer
from ratsnlp.nlpbook.classification import ClassificationTask
import torch

# Run the whole fine-tuning job inside a JobTimer context labelled with
# the config path.
with JobTimer(f"TRAIN(config={config})", mt=1, mb=1, rt=1, rb=1, rc='=', verbose=True, flush_sec=0.3):
    # Set PyTorch's float32 matmul precision to 'high' before training.
    torch.set_float32_matmul_precision('high')
    # Build the trainer first, then the task, then fit on both loaders.
    trainer = nlpbook.get_trainer(args)
    task = ClassificationTask(model, args)
    trainer.fit(task, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)


[03.20 22:27:06] [INIT] TRAIN(config=model/finetuned/nsmc/1-doc_cls-train.json)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /data/dlt/proj/DeepKorean-23.03/model/finetuned/nsmc/lightning_logs


  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 108 M 
--------------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
435.680   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


[03.20 22:41:23] [EXIT] TRAIN(config=model/finetuned/nsmc/1-doc_cls-train.json) ($=00:14:15.136)

