In [1]:
# Start from a clean state: remove any previous clone, clone the repository again,
# and make it the working directory.
# NOTE(review): the `KoCharELECTRA` package imported later is presumably provided by this repo — confirm.
%cd /content
!rm -rf KoreanStandardPronunciation
!git clone https://github.com/dhkang01/KoreanStandardPronunciation.git
%cd KoreanStandardPronunciation


/content
Cloning into 'KoreanStandardPronunciation'...
remote: Enumerating objects: 19, done.[K
remote: Counting objects: 100% (19/19), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 19 (delta 5), reused 19 (delta 5), pack-reused 0 (from 0)[K
Receiving objects: 100% (19/19), 65.16 KiB | 1.05 MiB/s, done.
Resolving deltas: 100% (5/5), done.
/content/KoreanStandardPronunciation


In [None]:
!pip install -q "transformers>=4.38.0" "datasets>=2.18.0" "peft>=0.11.0" accelerate huggingface_hub evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login.
# NOTE(review): presumably needed to access the dataset loaded in the next cell — confirm.
notebook_login()

In [None]:
from datasets import load_dataset

# Hub dataset id; only its "train" split is loaded (the validation split is carved out later).
dataset_id = "dhkang01/KMA_dataset"
raw_ds = load_dataset(dataset_id, split="train")

# Show the dataset summary as the cell output.
raw_ds

In [None]:
# Hold out 5% of the samples for validation; the fixed seed keeps the split reproducible.
ds_train_val = raw_ds.train_test_split(test_size=0.05, seed=42)
train_ds = ds_train_val["train"]
val_ds   = ds_train_val["test"]


Tokenizer 다운로드

In [None]:
from KoCharELECTRA.tokenization_kocharelectra import KoCharElectraTokenizer

# Checkpoint name used for the tokenizer here and for the model loaded in a later cell.
model_name = "monologg/kocharelectra-small-discriminator"

tokenizer = KoCharElectraTokenizer.from_pretrained(model_name)
# Quick sanity check: show how a short string is tokenized.
print(tokenizer.tokenize("가나다"))

output vocab

In [None]:
from collections import OrderedDict

# The output (pronunciation) label space reuses the tokenizer vocabulary:
# every vocabulary token gets a label id equal to its position in vocab order.
token_list = list(tokenizer.vocab)

pron2id = OrderedDict((token, index) for index, token in enumerate(token_list))

# Reverse lookup: label id -> pronunciation token.
id2pron = {index: token for token, index in pron2id.items()}

# Show the label count and the first ten (token, id) pairs.
len(pron2id), list(pron2id.items())[:10]

전처리 함수 정의 및 적용

복수 발음은 허용하지 않음 (각 글자의 첫 번째 발음 후보만 정답 레이블로 사용)

In [None]:
import numpy as np

max_length = 128  # 필요에 따라 조절

def preprocess_example(example):
    """Tokenize one sample and attach per-character pronunciation labels.

    Relies on the notebook globals ``tokenizer``, ``max_length`` and ``pron2id``.

    Args:
        example: Mapping with ``"input"`` (the raw text) and ``"output"``
            (one list of candidate pronunciations per character).

    Returns:
        The tokenizer encoding with an added ``"labels"`` list of the same
        length as ``input_ids``. Positions that carry no label (special
        tokens, padding, characters without candidates) hold ``-100``.
        If the number of pronunciation entries does not match the number of
        character tokens, a warning is printed and every label is ``-100``.

    Raises:
        KeyError: If a gold pronunciation is not a key of ``pron2id``.
    """
    text = example["input"]
    pron = example["output"]  # List[List[str]]

    # Fixed-length encoding; the code below assumes the non-padded part is
    # laid out as [CLS] + one token per character + [SEP].
    encoding = tokenizer(
        text,
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors=None,
    )

    input_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]

    seq_len = sum(attention_mask)  # number of non-padding tokens
    n_chars = seq_len - 2          # exclude [CLS] and [SEP]

    # -100 is the ignore_index used by the loss and by the accuracy mask.
    labels = np.full_like(input_ids, fill_value=-100)

    if len(pron) != n_chars:
        # Length mismatch (e.g. after truncation): keep the sample but ignore
        # all of its positions. Empty candidate lists are rendered as "_" so
        # that reporting the problem can never raise by itself.
        first_candidates = "".join(cands[0] if cands else "_" for cands in pron)
        print(f"Warning: pron len {len(pron)} != n_chars {n_chars} for text: {text}")
        print(f"in the case, pron: {first_candidates}")
        encoding["labels"] = labels.tolist()
        return encoding

    for i, cand_list in enumerate(pron):
        if not cand_list:
            continue  # no pronunciation given for this character: stay ignored
        # The first candidate is the gold label. Position 0 is [CLS], so
        # character i sits at index 1 + i; since len(pron) == n_chars this
        # index is always before the trailing [SEP].
        labels[1 + i] = pron2id[cand_list[0]]

    encoding["labels"] = labels.tolist()
    return encoding

In [None]:
# Apply preprocess_example to every sample; the original columns are removed so
# only the fields returned by the preprocessing function remain.
train_tokenized = train_ds.map(
    preprocess_example,
    remove_columns=train_ds.column_names,
)

val_tokenized = val_ds.map(
    preprocess_example,
    remove_columns=val_ds.column_names,
)

# NOTE(review): the original note here said over-long sequences are excluded, but
# this cell removes no samples. Samples whose pronunciation length does not match
# the token count only get all -100 labels in preprocess_example. Confirm whether
# an explicit filter is intended.

train_tokenized[0]


모델 로드, LoRA 적용

encoder에는 LoRA 적용
classifier에는 LoRA를 적용하지 않고 전체 파라미터를 학습(trainable)

In [None]:
from transformers import AutoModelForTokenClassification

# One output class per entry of the pronunciation vocabulary.
num_labels = len(pron2id)

# Load the checkpoint named by `model_name` with a token-classification head of that size.
base_model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)


In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA configuration for token classification, applied to the modules named in target_modules.
lora_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["query", "key", "value", "dense"]  # based on Electra's attention/FFN module names
)

# print(base_model)
# Wrap the base model with the adapters and report how many parameters are trainable.
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()


Concat wrapper 모듈 적용

In [None]:
import torch
import torch.nn as nn

class ElectraConcatEmbeddingClassifier(nn.Module):
    """Token classifier over ``[encoder output ; input embedding]``.

    Wraps a PEFT-wrapped Electra token-classification model. For every token,
    the encoder's last hidden state is concatenated with the output of the
    embedding layer, and a fresh linear layer maps that vector to
    ``num_labels`` logits.

    Args:
        peft_model: Model whose ``base_model`` exposes ``.electra`` (the
            encoder, including ``.embeddings``) and ``.config`` (with
            ``hidden_size`` and ``hidden_dropout_prob``).
        num_labels: Number of output classes.
    """

    # Prefix under which the concat head's tensors appear in state_dict().
    _HEAD_PREFIX = "concat_head.classifier."

    def __init__(self, peft_model, num_labels):
        super().__init__()
        self.peft_model = peft_model
        # Only `.electra` and `.config` are read from this object.
        self.base_model = peft_model.base_model

        self.electra = self.base_model.electra  # encoder (carries the LoRA adapters)
        self.num_labels = num_labels

        config = self.base_model.config
        hidden_size = config.hidden_size
        embed_dim = self.electra.embeddings.word_embeddings.embedding_dim

        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Original author's note: 256 + 128 = 384 for the checkpoint used in this notebook.
        concat_dim = hidden_size + embed_dim

        # New head: [encoder_hidden ; embedding] -> num_labels
        self.classifier = nn.Linear(concat_dim, num_labels)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        labels=None,
        **kwargs,
    ):
        """Compute token logits and, when ``labels`` is given, the loss.

        Args:
            input_ids: Token ids, shape (B, L).
            attention_mask: Attention mask forwarded to the encoder.
            token_type_ids: Segment ids forwarded to embeddings and encoder.
            labels: Optional targets of shape (B, L); ``-100`` is ignored.
            **kwargs: Accepted and ignored so extra batch fields do not fail.

        Returns:
            dict with ``"loss"`` (scalar tensor, or ``None`` without labels)
            and ``"logits"`` of shape (B, L, num_labels).
        """
        # 1) Embedding-layer output, computed separately from the encoder pass.
        embedding_output = self.electra.embeddings(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
        )  # (B, L, embed_dim)

        # 2) Encoder output (LoRA already applied inside the encoder).
        encoder_outputs = self.electra(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        sequence_output = encoder_outputs.last_hidden_state  # (B, L, hidden_size)

        # 3) Concatenate along the feature axis, then apply dropout.
        concat = torch.cat([sequence_output, embedding_output], dim=-1)
        concat = self.dropout(concat)

        # 4) Per-token classification.
        logits = self.classifier(concat)  # (B, L, num_labels)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(
                logits.view(-1, self.num_labels),
                labels.view(-1),
            )

        return {"loss": loss, "logits": logits}

    def state_dict(self, *args, **kwargs):
        """Return the PEFT model's state dict plus the concat head.

        ``peft_model``, ``base_model`` and ``electra`` are all registered as
        submodules, so the default implementation would emit the same tensors
        under several paths. Only the PEFT model's own keys are kept
        (unprefixed), and the head is added under ``concat_head.classifier.*``.
        """
        peft_sd = self.peft_model.state_dict(*args, **kwargs)
        own_sd = super().state_dict(*args, **kwargs)
        for key, value in own_sd.items():
            if key.startswith("classifier."):
                peft_sd[f"concat_head.{key}"] = value
        return peft_sd

    def load_state_dict(self, state_dict, strict=True, assign=False):
        """Load a mapping produced by :meth:`state_dict`.

        Loading stays lenient as before: key mismatches never raise, and
        ``strict`` is accepted only for signature compatibility. Unlike the
        previous version, the mismatches are returned so that callers which
        inspect the result (as ``nn.Module.load_state_dict`` callers may do)
        keep working.

        Args:
            state_dict: Mapping with PEFT keys and ``concat_head.classifier.*`` keys.
            strict: Accepted for API compatibility; not enforced.
            assign: Forwarded to the underlying loads only when true, so
                PyTorch versions without this keyword still work.

        Returns:
            The same result type the wrapped modules return, with
            ``missing_keys`` and ``unexpected_keys`` merged from the PEFT
            model and the head (head keys carry the ``concat_head.`` prefix).
        """
        head_state = {}
        peft_state = {}
        for key, value in state_dict.items():
            if key.startswith(self._HEAD_PREFIX):
                head_state[key[len(self._HEAD_PREFIX):]] = value
            else:
                peft_state[key] = value

        extra = {"assign": True} if assign else {}
        peft_result = self.peft_model.load_state_dict(peft_state, strict=False, **extra)
        # Load straight into the head so that only head keys can be reported
        # as missing, rather than every parameter of the wrapper.
        head_result = self.classifier.load_state_dict(head_state, strict=False, **extra)

        missing = list(peft_result.missing_keys)
        missing += [self._HEAD_PREFIX + k for k in head_result.missing_keys]
        unexpected = list(peft_result.unexpected_keys)
        unexpected += [self._HEAD_PREFIX + k for k in head_result.unexpected_keys]
        return type(peft_result)(missing, unexpected)

# Wrap the LoRA-adapted model with the concat-embedding classification head.
concat_model = ElectraConcatEmbeddingClassifier(model, num_labels=num_labels)

학습 진행

In [None]:
from transformers import DataCollatorForTokenClassification

# Batch collator for token classification, built from the same tokenizer used in preprocessing.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


In [None]:
import evaluate
import numpy as np

# Accuracy metric object; compute_metrics below calls its `.compute(...)`.
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Token-level accuracy over positions whose label is not ``-100``.

    Args:
        eval_pred: Pair ``(logits, labels)``; the class axis of ``logits`` is last.

    Returns:
        ``{"accuracy": value}``; ``0.0`` when no position carries a label.
    """
    scores, gold = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)

    # Drop every position marked with the ignore index before scoring.
    keep = gold != -100
    references = gold[keep]
    if len(references) == 0:
        return {"accuracy": 0.0}

    outcome = accuracy_metric.compute(
        predictions=predicted_ids[keep],
        references=references,
    )
    return {"accuracy": outcome["accuracy"]}


In [None]:
import inspect

import torch
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="kocharelectra-pron-lora",
    learning_rate=5e-5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=100,
    # Mixed precision only when a CUDA device is present; an unconditional
    # fp16=True fails on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    report_to="none",   # no external experiment tracker (wandb etc.)
)

# Recent transformers releases take the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword; pick whichever the installed Trainer accepts.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=concat_model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)


In [None]:
# Run training with the arguments configured above (evaluation and checkpointing happen once per epoch).
trainer.train()


In [None]:
import json
import os

import torch

save_dir = "kocharelectra-pron-lora-adapter"

# Adapter weights (what the PEFT model saves) and the tokenizer files.
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# The classification head that was actually trained lives on `concat_model`,
# not on the PEFT model, so `model.save_pretrained` does not include it.
# Without this file the saved adapter cannot reproduce the predictions.
torch.save(
    concat_model.classifier.state_dict(),
    os.path.join(save_dir, "concat_classifier.pt"),
)

# Save the pronunciation vocabulary (label -> id) alongside the weights.
with open(os.path.join(save_dir, "pron_vocab.json"), "w", encoding="utf-8") as f:
    json.dump(pron2id, f, ensure_ascii=False, indent=2)
