In [1]:
import re 
import json
from glob import glob 

import os
import torch
import random 
from typing import Any, Dict, List, Union
from tqdm.auto import tqdm
from huggingface_hub import login
from dataclasses import dataclass
from datasets import Dataset, load_dataset, DatasetDict, Audio


from peft import PeftModel, prepare_model_for_kbit_training, LoraConfig, get_peft_model

# import evaluate
from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    AutoModelForSpeechSeq2Seq,
    BitsAndBytesConfig, 
    AutoProcessor, 
    pipeline
)

import evaluate 

In [2]:
# Load the Korean address speech corpus and cast its audio column so that
# samples are provided at a 16 kHz sampling rate.
datasets = load_dataset("daje/korean-address-voice-v2").cast_column(
    "audio", Audio(sampling_rate=16000)
)

In [3]:
# Checkpoint shared by every pretrained component loaded in this cell.
MODEL_NAME = "openai/whisper-large-v3-turbo"

# Keyword arguments selecting Korean transcription for the tokenizer and processor.
korean_transcribe = {"language": "Korean", "task": "transcribe"}

feature_extractor = WhisperFeatureExtractor.from_pretrained(MODEL_NAME)
tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME, **korean_transcribe)
processor = WhisperProcessor.from_pretrained(MODEL_NAME, **korean_transcribe)

# Load the un-adapted Whisper model and place it on the GPU.
base_model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME).to("cuda")

In [4]:
# Take the first test sample's audio as a quick check of the base model.
audio = datasets["test"][0]["audio"]

# Turn the audio array into model input features and move them to the GPU.
input_features = processor(
    audio["array"],
    sampling_rate=audio["sampling_rate"],
    return_tensors="pt",
).input_features.to("cuda")

# Generate token ids without tracking gradients.
with torch.no_grad():
    predicted_ids = base_model.generate(input_features)

# Decode the ids to text, dropping special tokens, and show the transcript.
decoded_batch = processor.batch_decode(predicted_ids, skip_special_tokens=True)
transcription = decoded_batch[0]
print(transcription)

Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


 서울특별시 서초구 테헤란로 941, 현대아파트 553동 3722호.


In [5]:
# Baseline evaluation: transcribe every test clip with the base model and
# collect the predictions alongside the reference transcripts.
answers = []          # reference text, one entry per test sample
no_train_result = []  # base-model transcription, one entry per test sample

test_split = datasets["test"]
for idx in tqdm(range(len(test_split))):
    # Fetch the row a single time so the audio and the text come from one
    # lookup; indexing the dataset again just for "text" would repeat the
    # row access (and, per the datasets Audio docs, the audio decoding).
    sample = test_split[idx]
    audio = sample["audio"]

    # Preprocess: audio array -> model input features, moved to the GPU.
    input_features = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt"
    ).input_features
    input_features = input_features.to("cuda")

    # Inference without gradient tracking.
    with torch.no_grad():
        predicted_ids = base_model.generate(input_features)

    # Decode ids to text, dropping special tokens.
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    no_train_result.append(transcription)
    answers.append(sample["text"])

  0%|          | 0/340 [00:00<?, ?it/s]

In [6]:
def normalize_address(text):
    """
    Normalize an address string so that two transcripts can be compared.

    The text is lower-cased (relevant when Latin letters are mixed in), and
    every comma, period, hyphen and whitespace character is removed.

    Args:
        text: Address string to normalize.

    Returns:
        The normalized string.
    """
    lowered = text.lower()

    # Punctuation and whitespace are stripped in a single pass; deleting the
    # two character groups together gives the same result as deleting them
    # one after the other.
    return re.sub(r'[,.\-\s]+', '', lowered)

# Normalization example
# Before: 광주광역시 북구 첨단과기로 208, 첨단하이파크 605동 1902호.
# After:  광주광역시북구첨단과기로208첨단하이파크605동1902호

In [7]:
# Wrap the base model with the LoRA adapter saved at this checkpoint, then
# make sure the resulting model is on the GPU.
finetuned_model = PeftModel.from_pretrained(
    base_model, "/workspace/model-v2/checkpoint-60"
).to("cuda")

# Set Korean transcription prompt ids on the model config.
# NOTE(review): the saved output of an earlier cell warns that custom
# `forced_decoder_ids` is deprecated in favor of the `task` and `language`
# options — consider migrating.
korean_prompt_ids = processor.get_decoder_prompt_ids(
    language="ko",
    task="transcribe",
)
finetuned_model.config.forced_decoder_ids = korean_prompt_ids

In [8]:
# Transcribe the whole test split with the adapter-wrapped model.
train_result = []
test_split_size = len(datasets["test"])
for idx in tqdm(range(test_split_size)):
    # Audio of the current test sample.
    audio = datasets["test"][idx]["audio"]

    # Preprocess: audio array -> model input features on the GPU.
    processed = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt",
    )
    input_features = processed.input_features.to("cuda")

    # Inference without gradient tracking.
    with torch.no_grad():
        predicted_ids = finetuned_model.generate(input_features)

    # Decode ids to text (special tokens dropped) and keep the transcript.
    decoded_batch = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    transcription = decoded_batch[0]
    train_result.append(transcription)

  0%|          | 0/340 [00:00<?, ?it/s]

In [9]:
def _report_mismatches(header, predictions, references, limit=10):
    """Print up to ``limit`` wrong predictions and return how many are wrong.

    A prediction counts as wrong when its ``normalize_address`` form differs
    from the ``normalize_address`` form of its reference.

    Args:
        header: Title line printed between two 80-character ``=`` rules.
        predictions: Model transcriptions.
        references: Ground-truth transcripts, aligned with ``predictions``.
        limit: Maximum number of wrong cases to print.

    Returns:
        The number of wrong predictions in the whole list (not capped by
        ``limit``).

    Raises:
        ValueError: If ``predictions`` and ``references`` differ in length.
    """
    if len(predictions) != len(references):
        raise ValueError(
            f"length mismatch: {len(predictions)} predictions vs "
            f"{len(references)} references"
        )

    print("="*80)
    print(header)
    print("="*80)

    # Normalize each pair once and reuse the result for both the printed
    # sample and the total count.
    wrong = [
        (ref, pred)
        for pred, ref in zip(predictions, references)
        if normalize_address(pred) != normalize_address(ref)
    ]

    for rank, (ref, pred) in enumerate(wrong[:limit], start=1):
        print(f"\n[{rank}]")
        print(f"정답: {ref}")
        print(f"예측: {pred}")
        print("-"*80)

    return len(wrong)


# Before fine-tuning: show only the wrong cases.
wrong_before = _report_mismatches(
    "Fine-tuning 전 - 틀린 예측 (상위 10개)", no_train_result, answers
)
# The total is taken from the data instead of a hard-coded sample count.
print(f"\n총 {len(no_train_result)}개 중 {wrong_before}개 틀림")

# After fine-tuning: show only the wrong cases.
print("\n")  # two blank lines separating the two reports
wrong_after = _report_mismatches(
    "Fine-tuning 후 - 틀린 예측 (상위 10개)", train_result, answers
)
print(f"\n{len(train_result)}개 중 총 {wrong_after}개 틀림")

Fine-tuning 전 - 틀린 예측 (상위 10개)

[1]
정답: 서울특별시은평구마포대로571 래미안198동392호
예측:  서울특별시 은평구 마포대로 572, 레미안 198동 392호.
--------------------------------------------------------------------------------

[2]
정답: 서울특별시노원구사당로302 엘에이치669동1290호
예측:  서울특별시 노원구 사당로 302, LH 669, 동 1290호.
--------------------------------------------------------------------------------

[3]
정답: 서울특별시서초구선릉로337 파크자이1087동550호
예측:  서울특별시 서초구 설릉로 337, 파크자이 1087동 550호.
--------------------------------------------------------------------------------

[4]
정답: 서울특별시노원구백제고분로755 호반베르디움983동4235호
예측:  서울특별시 노원구 백제고분노 755, 호반베르디움 983동 4,235호.
--------------------------------------------------------------------------------

[5]
정답: 서울특별시관악구강남대로150 대우아파트929동1637호
예측:  서울특별시 관악구 강남대로 150, 대호아파트 929동 1637호.
--------------------------------------------------------------------------------

[6]
정답: 서울특별시도봉구선릉로850 롯데캐슬498동1702호
예측:  서울특별시 도봉구 설릉로 850, 롯데캐슬 498, 동1702호.
------------------------------------------------------------------------------

In [10]:
# Character-error-rate metric loaded through the Hugging Face `evaluate` library.
cer_metric = evaluate.load("cer")


def calculate_cer(predictions, ground_truths):
    """
    Compute the character error rate, as a percentage, on normalized text.

    Every prediction and every ground truth is passed through
    ``normalize_address`` before being scored.

    Args:
        predictions: Model transcriptions.
        ground_truths: Reference transcripts, aligned with ``predictions``.

    Returns:
        The value returned by the metric multiplied by 100.
    """
    hypotheses = list(map(normalize_address, predictions))
    targets = list(map(normalize_address, ground_truths))

    # Scale the metric's value by 100 to express it as a percentage.
    return 100 * cer_metric.compute(predictions=hypotheses, references=targets)

# Score both sets of transcriptions against the same references and report CER.
cer_no_train, cer_train = (
    calculate_cer(results, answers) for results in (no_train_result, train_result)
)
print(f"Fine-tuning 전 CER: {cer_no_train:.2f}%")
print(f"Fine-tuning 후 CER: {cer_train:.2f}%")

Fine-tuning 전 CER: 3.32%
Fine-tuning 후 CER: 0.03%


In [11]:
# Word-error-rate metric loaded through the Hugging Face `evaluate` library.
metric = evaluate.load("wer")


def calculate_wer(predictions, ground_truths):
    """
    Compute the word error rate, as a percentage.

    The inputs are scored exactly as given: this function applies no
    normalization to either list.

    NOTE(review): the saved output reports 361.32% WER before fine-tuning,
    and the printed reference texts contain far fewer spaces than the
    base-model predictions — confirm whether whitespace-insensitive scoring
    was intended here.

    Args:
        predictions: Model transcriptions.
        ground_truths: Reference transcripts, aligned with ``predictions``.

    Returns:
        The value returned by the metric multiplied by 100.
    """
    hypotheses = list(predictions)
    targets = list(ground_truths)

    # Scale the metric's value by 100 to express it as a percentage.
    return 100 * metric.compute(predictions=hypotheses, references=targets)


# Usage: score both sets of transcriptions against the same references and report WER.
wer_no_train, wer_train = (
    calculate_wer(results, answers) for results in (no_train_result, train_result)
)
print(f"Fine-tuning 전 WER: {wer_no_train:.2f}%")
print(f"Fine-tuning 후 WER: {wer_train:.2f}%")

Fine-tuning 전 WER: 361.32%
Fine-tuning 후 WER: 0.44%


In [13]:
# Upload the trained model to the Hugging Face Hub.
# Obtain the model to upload from the adapter-wrapped model via merge_and_unload().
merged_model = finetuned_model.merge_and_unload()
# Push the model and the processor to the same Hub repository.
merged_model.push_to_hub("daje/whisper-v3-turbo-address")
# Kept as the cell's last expression so the notebook displays its return value.
processor.push_to_hub("daje/whisper-v3-turbo-address")

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/daje/whisper-v3-turbo-address/commit/b40320102e29039ebef62f3340df2264b088e959', commit_message='Upload processor', commit_description='', oid='b40320102e29039ebef62f3340df2264b088e959', pr_url=None, repo_url=RepoUrl('https://huggingface.co/daje/whisper-v3-turbo-address', endpoint='https://huggingface.co', repo_type='model', repo_id='daje/whisper-v3-turbo-address'), pr_revision=None, pr_num=None)