In [1]:
import math
import logging
from datetime import datetime

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from sentence_transformers import SentenceTransformer,  LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample

In [2]:
model_name = "klue/roberta-base"
#model_name= 'distiluse-base-multilingual-cased-v1'  # 사전 학습된 언어 모델

train_batch_size = 32 
num_epochs = 4   # 에포크 횟수

model_save_path = "output/training_klue_sts_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
embedding_model = models.Transformer(model_name)
#embedding_model = SentenceTransformer(model_name)

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for

In [3]:
# pooler: Embedder 에서 추출된 토큰 단위 임베딩들을 가지고 문장 임베딩을 어떻게 계산할 것인지를 결정
# Max pooling, Mean pooling등 다양한 방법이 있음
# 여기서는 Mean pooling 사용

pooler = models.Pooling(
    embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)

In [4]:
model = SentenceTransformer(modules=[embedding_model, pooler])

In [5]:
# 데이터셋 가져오기
sts_data = load_dataset("kor_nlu", "sts")

Reusing dataset kor_nlu (C:\Users\ing06\.cache\huggingface\datasets\kor_nlu\sts\1.0.0\4facbba77df60b0658056ced2052633e681a50187b9428bd5752ebd59d332ba8)


  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# korSTS 데이터는 train, val, test데이터로 이루어져 있다
print(sts_data)

DatasetDict({
    train: Dataset({
        features: ['genre', 'filename', 'year', 'id', 'score', 'sentence1', 'sentence2'],
        num_rows: 5703
    })
    validation: Dataset({
        features: ['genre', 'filename', 'year', 'id', 'score', 'sentence1', 'sentence2'],
        num_rows: 1471
    })
    test: Dataset({
        features: ['genre', 'filename', 'year', 'id', 'score', 'sentence1', 'sentence2'],
        num_rows: 1379
    })
})


In [7]:
sts_data["train"][50]   # 두 문장, 문장 간 유사도를 가지고 있음

{'genre': 1,
 'filename': 2,
 'year': 6,
 'id': 73,
 'score': 3.200000047683716,
 'sentence1': '남자가 기타를 치고 있다.',
 'sentence2': '한 소년이 기타를 치고 있다.'}

In [8]:
# sts 데이터셋을 sentence-transformers 훈련 양식에 맞게 변환해주는 작업
train_samples = []
dev_samples = []
test_samples = []


# KorSTS 내 테스트 데이터 예제 변환
for phase in ["train", "validation", "test"]:
    examples = sts_data[phase]

    for example in examples:
        score = float(example["score"]) / 5.0  # 0.0 ~ 1.0 스케일로 유사도 정규화

        inp_example = InputExample(
            texts=[example["sentence1"], example["sentence2"]], 
            label=score,
        )

        if phase == "train":
            train_samples.append(inp_example)
        elif phase == "validation":
            dev_samples.append(inp_example)
        else:
            test_samples.append(inp_example)

In [9]:
# 변환한 데이터 확인
train_samples[0].texts, train_samples[0].label

(['비행기가 이륙하고 있다.', '비행기가 이륙하고 있다.'], 1.0)

In [10]:
test_samples[0].texts, test_samples[0].label

(['한 소녀가 머리를 스타일링하고 있다.', '한 소녀가 머리를 빗고 있다.'], 0.5)

In [11]:
# 데이터 로더와 loss 설정
# CosineSimilarityLoss는 입력된 두 문장의 임베딩 간 코사인 유사도와 골드 라벨 간 차이를 통해 계산

train_dataloader = DataLoader(
    train_samples,
    shuffle=True,
    batch_size=train_batch_size,
)
train_loss = losses.CosineSimilarityLoss(model=model)

In [12]:
# 모델 검증에 사용할 evaluator
# 앞서 만든 dev_samples 이용
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    dev_samples,
    name="sts-dev",
)

In [13]:
# Warm up Steps를 설정
# 훈련 배치 수의 10% 만큼으로 값을 설정
warmup_steps = math.ceil(len(train_dataloader) * num_epochs  * 0.1)  # 10% of train data for warm-up

In [14]:
# 드디어 모델 훈련
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)



Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/179 [00:00<?, ?it/s]

RuntimeError: [enforce fail at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\c10\core\impl\alloc_cpu.cpp:81] data. DefaultCPUAllocator: not enough memory: you tried to allocate 402653184 bytes.