In [1]:
import torch
from datasets import load_dataset, Audio

# import evaluate
from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor,
    WhisperForConditionalGeneration,
)

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket "ignore" filter applies to all warning categories,
# so deprecation notices from the libraries imported above are hidden as well;
# consider narrowing it to the specific categories/modules that are noisy.
warnings.filterwarnings("ignore")

In [2]:
# Load the Korean address speech corpus and cast its "audio" column to a
# 16 kHz Audio feature, so clips are automatically resampled to 16 kHz.
datasets = load_dataset("daje/korean-address-voice-v2").cast_column(
    "audio",
    Audio(sampling_rate=16000),
)

In [3]:
# Checkpoint identifier shared by every component loaded in this cell.
MODEL_NAME = "daje/whisper-v3-turbo-address"

# Pre-/post-processing components, configured for Korean transcription.
feature_extractor = WhisperFeatureExtractor.from_pretrained(MODEL_NAME)
tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME, language="Korean", task="transcribe")
processor = WhisperProcessor.from_pretrained(MODEL_NAME, language="Korean", task="transcribe")

# Load the model weights, then place the model on the GPU when one is
# available and on the CPU otherwise, so the notebook does not fail on
# machines without a CUDA device.
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# Fetch the first audio sample of the test split.
audio = datasets["test"][0]["audio"]

# Preprocess: convert the waveform into the model's input features,
# returned as PyTorch tensors.
input_features = processor(
    audio["array"],
    sampling_rate=audio["sampling_rate"],
    return_tensors="pt",
).input_features

# Move the inputs to whichever device the model currently lives on, rather
# than assuming "cuda", so the inputs and the weights never end up on
# different devices.
input_features = input_features.to(model.device)

# Inference (no gradients are needed for generation).
with torch.no_grad():
    predicted_ids = model.generate(
        input_features,
        language="ko",           # force Korean
        task="transcribe",       # transcribe rather than translate
        return_timestamps=False  # timestamps are not needed here
    )

# Decode the generated token ids to text, dropping special tokens.
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
print(transcription)

서울특별시서초구테헤란로941 현대아파트553동3722호
