In [1]:
!pip install -q "transformers>=4.38.0" "datasets>=2.18.0" "peft>=0.11.0" accelerate huggingface_hub

In [2]:
from huggingface_hub import notebook_login

# Show the Hugging Face Hub login widget so later Hub calls can be authenticated.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from datasets import load_dataset

# Hub repository id of the dataset used throughout this notebook.
dataset_id = "dhkang01/KMA_dataset"

# Only the "train" split is requested.
raw_ds = load_dataset(path=dataset_id, split="train")

# Display the dataset summary (columns and row count).
raw_ds

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/340 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/4.74k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/36 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'input', 'output'],
    num_rows: 36
})

In [4]:
# Seeded 95/5 split; the "test" portion serves as the validation set.
ds_train_val = raw_ds.train_test_split(seed=42, test_size=0.05)
train_ds, val_ds = ds_train_val["train"], ds_train_val["test"]


Tokenizer 다운로드

In [6]:
# KoCharElectra 토크나이저 스크립트 다운로드
!wget -q -nc https://raw.githubusercontent.com/monologg/KoCharELECTRA/master/tokenization_kocharelectra.py
!wget -q -nc https://raw.githubusercontent.com/monologg/KoCharELECTRA/master/vocab.txt

from tokenization_kocharelectra import KoCharElectraTokenizer

model_name = "monologg/kocharelectra-base-discriminator"

tokenizer = KoCharElectraTokenizer.from_pretrained(model_name)
print(tokenizer.tokenize("가나다"))

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'ElectraTokenizer'. 
The class this function is called from is 'KoCharElectraTokenizer'.


AttributeError: KoCharElectraTokenizer has no attribute vocab

output vocab

In [None]:
from collections import OrderedDict

# Label vocabulary: every precomposed Hangul syllable from "가" to "힣",
# numbered by its code-point offset from "가".
first_syllable_code = ord("가")
last_syllable_code = ord("힣")
pron2id = OrderedDict(
    (chr(code_point), code_point - first_syllable_code)
    for code_point in range(first_syllable_code, last_syllable_code + 1)
)

# The former `pron2id = build_pron_vocab(raw_ds)` line is dropped: no
# `build_pron_vocab` is defined in the visible notebook, so it raised
# NameError, and it would also have replaced the table built above.
id2pron = {label_id: syllable for syllable, label_id in pron2id.items()}

# Show the vocabulary size and its first ten entries.
len(pron2id), list(pron2id.items())[:10]

전처리 함수 정의 및 적용

In [None]:
import numpy as np

max_length = 128  # tokenizer truncation/padding length; adjust as needed

def preprocess_example(example):
    """Tokenize one example and attach per-character pronunciation labels.

    Args:
        example: One dataset row. ``example["input"]`` is the text to
            tokenize. The pronunciation sequence is taken from
            ``example["pron"]`` when that key exists, otherwise from
            ``example["output"]``. Each item of the sequence is either a list
            of candidate pronunciations (the first candidate is used as the
            gold label) or a single-character string.
            NOTE(review): the loaded dataset only lists the columns
            'id', 'input' and 'output', so 'output' is presumably the
            per-character pronunciation. Confirm against the dataset card.

    Returns:
        The tokenizer encoding with an added ``"labels"`` list that has one
        entry per input id: a ``pron2id`` id at labelled character positions
        and -100 everywhere else.
    """
    ignore_index = -100  # positions with this label are skipped by the metric

    text = example["input"]
    # The original read example["pron"] unconditionally, which raises KeyError
    # for a dataset whose columns are only id/input/output.
    pron = example["pron"] if "pron" in example else example["output"]

    # Fixed-length encoding: every example is padded/truncated to max_length.
    encoding = tokenizer(
        text,
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors=None,
    )

    # Number of non-padding tokens. The layout is assumed to be
    # [CLS] + one token per character + [SEP], so two of them are not characters.
    seq_len = sum(encoding["attention_mask"])
    n_chars = seq_len - 2

    labels = [ignore_index] * len(encoding["input_ids"])

    # When the pronunciation length does not match the character count
    # (e.g. after truncation) the whole example stays unlabelled.
    if len(pron) == n_chars:
        for char_index, cand_list in enumerate(pron):
            if not cand_list:
                continue
            # Pronunciations missing from pron2id are left ignored rather
            # than raising KeyError in the middle of Dataset.map.
            label_id = pron2id.get(cand_list[0])
            if label_id is not None:
                labels[1 + char_index] = label_id  # +1 skips the [CLS] slot

    encoding["labels"] = labels
    return encoding

In [None]:
# Run the preprocessing over both splits. All original columns are removed so
# only the fields returned by preprocess_example are kept.
train_tokenized = train_ds.map(function=preprocess_example, remove_columns=train_ds.column_names)
val_tokenized = val_ds.map(function=preprocess_example, remove_columns=val_ds.column_names)

# Inspect the first processed training example.
train_tokenized[0]


모델 로드, LoRA 적용

In [None]:
from transformers import AutoModelForTokenClassification

# One output class per entry of the pronunciation vocabulary.
num_labels = len(pron2id)

# Load the checkpoint with a token-classification head sized to num_labels.
base_model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels)


In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter settings for the token-classification task.
lora_config = LoraConfig(
    # Module names to adapt, following Electra's attention/FFN module naming.
    target_modules=["query", "key", "value", "dense"],
    task_type=TaskType.TOKEN_CLS,
    bias="none",
    lora_dropout=0.1,
    lora_alpha=32,
    r=8,
)

# Wrap the base model with the adapter and report the trainable-parameter count.
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()


In [None]:
from transformers import DataCollatorForTokenClassification

# Batch collator for token classification, built around the notebook's tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


In [None]:
import evaluate
import numpy as np

# Metric object used by compute_metrics below.
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy over the positions whose label is not -100.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            position is the argmax over the last axis of ``logits``.

    Returns:
        ``{"accuracy": value}``; the value is ``0.0`` when no position
        carries a real label.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    # -100 marks positions that must be left out of the score.
    scored = labels != -100
    gold_ids, guessed_ids = labels[scored], predicted_ids[scored]

    if len(gold_ids) > 0:
        scores = accuracy_metric.compute(predictions=guessed_ids, references=gold_ids)
        return {"accuracy": scores["accuracy"]}

    return {"accuracy": 0.0}


In [None]:
import inspect

import torch
from transformers import TrainingArguments, Trainer

# The evaluation-schedule argument is called `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. The install
# cell only sets a lower version bound, so use whichever name the installed
# release accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="kocharelectra-pron-lora",
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=100,
    # Mixed precision only when a CUDA GPU is present; a hard-coded True
    # cannot be used on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    report_to="none",   # no external experiment tracker (e.g. wandb)
    **{_eval_strategy_key: "epoch"},
)

# Likewise, newer Trainer versions take the tokenizer as `processing_class`
# and older ones as `tokenizer`.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)


In [None]:
# Run the fine-tuning loop configured by `training_args`.
trainer.train()


In [None]:
import json
import os

save_dir = "kocharelectra-pron-lora-adapter"

# Save the PEFT-wrapped model and the tokenizer into the same directory.
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# Store the pronunciation vocabulary next to them.
pron_vocab_path = os.path.join(save_dir, "pron_vocab.json")
with open(pron_vocab_path, "w", encoding="utf-8") as vocab_out:
    json.dump(pron2id, vocab_out, indent=2, ensure_ascii=False)
