In [1]:
# torch 2.4에서만 해당하는 버전
!apt-get update
!apt-get install -y ffmpeg
!pip install torchcodec==0.3.0
!pip install datasets==3.6.0
!pip install librosa soundfile jiwer evaluate huggingface_hub peft Levenshtein

Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease                         
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease               
Hit:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease                 
Hit:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:6 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Reading package lists... Done
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 122 not upgraded.
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[0m


In [2]:

import torch
import random
from dataclasses import dataclass
from typing import Any, Dict, List, Union

from huggingface_hub import login
from datasets import load_dataset, Audio
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

import evaluate

In [3]:
# 랜덤 시드 설정
RANDOM_SEED = 8282
random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)
print(f"랜덤 시드가 {RANDOM_SEED}로 설정되었습니다.")

랜덤 시드가 8282로 설정되었습니다.


In [4]:
datasets = load_dataset("daje/korean-address-voice-v2")

print(datasets)

DatasetDict({
    train: Dataset({
        features: ['audio', 'text'],
        num_rows: 3400
    })
    test: Dataset({
        features: ['audio', 'text'],
        num_rows: 340
    })
})


In [5]:
MODEL_NAME = "openai/whisper-large-v3-turbo"
feature_extractor = WhisperFeatureExtractor.from_pretrained(MODEL_NAME)
tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME, language="Korean", task="transcribe")
processor = WhisperProcessor.from_pretrained(MODEL_NAME, language="Korean", task="transcribe")

In [6]:
input_str = datasets["train"][0]["text"]
input_str

'서울특별시영등포구압구정로136 SK뷰293동3047호'

In [7]:
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print(f"입력:                {input_str}")
print(f"특수토큰 포함 디코딩:    {decoded_with_special}")
print(f"특수토큰 제외 디코딩:    {decoded_str}")
print(f"원본과 동일 여부:       {input_str == decoded_str}")

입력:                서울특별시영등포구압구정로136 SK뷰293동3047호
특수토큰 포함 디코딩:    <|startoftranscript|><|ko|><|transcribe|><|notimestamps|>서울특별시영등포구압구정로136 SK뷰293동3047호<|endoftext|>
특수토큰 제외 디코딩:    서울특별시영등포구압구정로136 SK뷰293동3047호
원본과 동일 여부:       True


In [8]:
print(datasets["train"][0])

{'audio': {'path': '서울특별시영등포구압구정로136_SK뷰293동3047호.mp3', 'array': array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
       2.81896450e-06, 2.82983729e-06, 8.46276862e-08]), 'sampling_rate': 24000}, 'text': '서울특별시영등포구압구정로136 SK뷰293동3047호'}


In [9]:
def prepare_dataset(batch):
    # 오디오 데이터를 로드하고 48kHz에서 16kHz로 리샘플링
    audio = batch["audio"]

    # 입력 오디오 배열에서 log-Mel 입력 특징 계산
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # 타겟 텍스트를 레이블 ID로 인코딩
    batch["labels"] = tokenizer(batch["text"]).input_ids
    return batch

In [10]:
# 문제 원인

# Whisper 모델: 16,000 Hz (16kHz) 샘플링 레이트로 학습됨
# 현재 오디오 데이터: 24,000 Hz (24kHz)로 되어 있음

# Google TTS로 생성한 오디오가 24kHz로 되어 있어서, Whisper가 요구하는 16kHz와 맞지 않습니다.

# 데이터셋 로드 시 자동으로 16kHz로 리샘플링
datasets = datasets.cast_column("audio", Audio(sampling_rate=16000))

# 그 다음 prepare_dataset 실행
def prepare_dataset(batch):
    audio = batch["audio"]
    # 이미 16kHz로 리샘플링되어 있음
    batch["input_features"] = feature_extractor(
        audio["array"], 
        sampling_rate=16000  # 명시적으로 16000 사용
    ).input_features[0]
    batch["labels"] = tokenizer(batch["text"]).input_ids
    return batch

datasets = datasets.map(
    prepare_dataset, 
    remove_columns=datasets.column_names["train"], 
    num_proc=16
)


In [11]:
print("모델 로드 중...")
model = WhisperForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    use_cache=False,  # gradient checkpointing과 호환되도록 설정
)

# 한국어 설정
model.generation_config.language = "ko"
model.generation_config.task = "transcribe"
model.generation_config.forced_decoder_ids = None

# 2. LoRA 설정
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj", "k_proj", "out_proj"],
    lora_dropout=0.05,
    bias="none"
)

# 3. LoRA 모델로 변환
model = get_peft_model(model, lora_config)

# 4. 학습 가능한 파라미터 확인
model.print_trainable_parameters()
# gradient checkpointing 명시적 비활성화 (LoRA와 충돌 방지)
model.config.use_cache = False

모델 로드 중...
trainable params: 13,107,200 || all params: 821,985,280 || trainable%: 1.5946


In [12]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # 입력과 레이블은 길이가 다르고 서로 다른 패딩 방법이 필요하므로 분리하여 처리
        # 먼저 오디오 입력을 torch 텐서로 반환하여 처리
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # 토크나이징된 레이블 시퀀스 가져오기
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # 레이블을 최대 길이에 맞춰 패딩
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # 손실 계산 시 패딩을 올바르게 무시하도록 패딩 부분을 -100으로 교체
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # 이전 토크나이징 단계에서 BOS 토큰이 추가된 경우,
        # 나중에 어차피 추가되므로 여기서는 BOS 토큰 제거
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [13]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [14]:
# 테스트를 위해 2개의 샘플 준비 (전처리된 데이터에서)
sample_features = [
    datasets["train"][0],  # 첫 번째 샘플
    datasets["train"][1],  # 두 번째 샘플
]

# 콜레이터 적용
batch = data_collator(sample_features)

print("=== 배치 구조 ===")
print(f"input_features 크기: {batch['input_features'].shape}")
print(f"labels 크기: {batch['labels'].shape}")

print("\n=== 첫 번째 샘플의 레이블 (처음 20개 토큰) ===")
print(batch['labels'][0][:20])

print("\n=== 두 번째 샘플의 레이블 (처음 20개 토큰) ===")
print(batch['labels'][1][:20])

print("\n=== 패딩 확인 (레이블의 마지막 10개 토큰) ===")
print(f"첫 번째 샘플: {batch['labels'][0][-10:]}")
print(f"두 번째 샘플: {batch['labels'][1][-10:]}")

You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


=== 배치 구조 ===
input_features 크기: torch.Size([2, 128, 3000])
labels 크기: torch.Size([2, 30])

=== 첫 번째 샘플의 레이블 (처음 20개 토큰) ===
tensor([50264, 50360, 50364,  2393, 15580,  5963,   117, 37604,  3833, 11958,
        36912, 30600,  7675,  1457,   243,  7675,  6170, 12888,  7668,    21])

=== 두 번째 샘플의 레이블 (처음 20개 토큰) ===
tensor([50264, 50360, 50364,  2393, 15580,  5963,   117, 37604,  3833, 36074,
         8097, 11545,  9520,  1831,  6826, 16270,  5254, 15390, 12504,  4264])

=== 패딩 확인 (레이블의 마지막 10개 토큰) ===
첫 번째 샘플: tensor([21483,   167, 33067, 11871,    18, 23056,  3446, 14060, 14705, 50257])
두 번째 샘플: tensor([ 1129, 22452,    17, 23056,    19,  5211,    24, 14705, 50257,  -100])


In [15]:
# 평가 메트릭 (WER + CER)
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # -100을 pad_token_id로 교체
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # 디코딩
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # WER 계산
    wer = 100 * wer_metric.compute(predictions=pred_str, references=label_str)
    
    # CER 계산 (한국어에 특히 유용)
    cer = 100 * cer_metric.compute(predictions=pred_str, references=label_str)
    
    # 간단한 문자 정확도 (한국어용)
    char_acc = calculate_korean_char_accuracy(pred_str, label_str)
    
    return {
        "wer": wer,
        "cer": cer,
        "char_accuracy": char_acc
    }

In [16]:
def calculate_korean_char_accuracy(predictions, references):
    """한국어 문자 단위 정확도 계산"""
    total_chars = 0
    correct_chars = 0
    
    for pred, ref in zip(predictions, references):
        # 공백 제거 (한국어 띄어쓰기 오류 영향 최소화)
        pred_chars = list(pred.replace(" ", ""))
        ref_chars = list(ref.replace(" ", ""))
        
        # 최소 길이까지 비교
        min_len = min(len(pred_chars), len(ref_chars))
        for i in range(min_len):
            if i < len(pred_chars) and pred_chars[i] == ref_chars[i]:
                correct_chars += 1
        
        total_chars += len(ref_chars)
    
    return (correct_chars / total_chars * 100) if total_chars > 0 else 0

In [17]:
OUTPUT_DIR = "./model-v2"

In [18]:
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=2,
    learning_rate=1e-4,
    warmup_ratio=0.1,
    max_steps=100,
    fp16=True,
    per_device_eval_batch_size=64,
    eval_strategy="steps",
    eval_steps=10,
    generation_max_length=256,
    save_strategy="steps",
    save_steps=10,
    save_total_limit=5,
    logging_strategy="steps",
    logging_steps=10,
    predict_with_generate=True,
    metric_for_best_model="cer",
    greater_is_better=False,
    load_best_model_at_end=True,
    remove_unused_columns=False,
    label_names=["labels"],
)

In [19]:
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=datasets["train"],
    eval_dataset=datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

  trainer = Seq2SeqTrainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


In [20]:
trainer.train()

Step,Training Loss,Validation Loss,Wer,Cer,Char Accuracy
10,0.634,0.323696,26.176471,2.534609,95.265711
20,0.1452,0.05529,11.176471,0.926303,98.797976
30,0.0369,0.018848,3.823529,0.346091,99.283003
40,0.0163,0.011287,2.058824,0.203583,99.620413
50,0.0123,0.008327,0.735294,0.050896,99.94728
60,0.0083,0.00661,0.441176,0.030537,99.968368
70,0.006,0.005869,0.294118,0.020358,99.978912
80,0.006,0.005428,0.294118,0.020358,99.978912
90,0.006,0.005174,0.294118,0.020358,99.978912
100,0.0058,0.00507,0.294118,0.020358,99.978912


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


TrainOutput(global_step=100, training_loss=0.08768495686352253, metrics={'train_runtime': 2332.1134, 'train_samples_per_second': 2.744, 'train_steps_per_second': 0.043, 'total_flos': 1.100779249729536e+19, 'train_loss': 0.08768495686352253, 'epoch': 1.8598130841121496})