### 학습 모델 Inference

In [1]:
import os
import torch
from dataclasses import dataclass
import pandas as pd
from transformers import (
    WhisperForConditionalGeneration, WhisperProcessor
)
# Restrict CUDA to device index 0 for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Pick the GPU when torch reports CUDA as available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

In [10]:
# Base model identifier, plus the language/task pair used for decoding.
model_id = "openai/whisper-tiny"
lang, task = "ko", "transcribe"

# Fine-tuned checkpoint directory, keyed by the last component of the model id.
checkpoint_dir = f"/workspace/results/whisper_train/{model_id.split('/')[-1]}/checkpoint-10000"

# Use the first row of the test manifest as the sample recording.
test_df = pd.read_csv("/workspace/kru_data/test.csv")
sample_file = test_df["abs_path"].iloc[0]
# Alternative sample outside the test manifest:
# sample_file = "/workspace/kor_med_stt_data/snu_data/말이좀늦는것같아요_샘플데이터셋/남학생의사_여환자_1.wav"

In [11]:
import librosa
import IPython.display as ipd

# Load the sample recording, resampled to 16 kHz.
waveform, sr = librosa.load(sample_file, sr=16000)

# Window of the recording to keep, in seconds (currently the first 3 seconds).
start_sec = 0
end_sec = 3
# Cast to int so fractional second offsets still produce valid slice indices.
start_sample = int(start_sec * sr)
end_sample = int(end_sec * sr)
if len(waveform) > end_sample:
    print(f"This file length is {len(waveform)/sr} seconds. Using audio from {start_sec} to {end_sec} seconds.")
    waveform = waveform[start_sample:end_sample]
else:
    # The recording is shorter than the requested window: keep everything from start_sec on.
    print(f"This file length is {len(waveform)/sr} seconds. Using available audio from {start_sec} seconds to end.")
    waveform = waveform[start_sample:]

# Play back the trimmed audio inside the notebook (must stay the last expression of the cell).
ipd.Audio(waveform, rate=sr)

This file length is 2.76 seconds. Using available audio from 0 seconds to end.


In [12]:
# Build the processor from the language/task chosen in the configuration cell
# instead of repeating the literals, so the two cannot drift apart.
processor = WhisperProcessor.from_pretrained(
    model_id, language=lang, task=task
)
# To run the fine-tuned checkpoint instead of the base model, load from checkpoint_dir:
# model = WhisperForConditionalGeneration.from_pretrained(checkpoint_dir).to(device)
# Move the model to the `device` selected in the first cell; hard-coding "cuda"
# would crash on a CPU-only machine even though a CPU fallback was computed.
model = WhisperForConditionalGeneration.from_pretrained(model_id).to(device)

In [13]:
def transcribe_long_audio(processor, model, waveform, sr, chunk_sec=5, **generate_kwargs):
    """Transcribe a waveform by decoding consecutive, non-overlapping chunks.

    The waveform is cut into pieces of ``chunk_sec`` seconds, each piece is run
    through ``processor.feature_extractor`` and ``model.generate``, and the
    decoded texts are joined with single spaces.

    Args:
        processor: Object exposing ``feature_extractor`` (callable) and
            ``tokenizer.batch_decode`` (e.g. a ``WhisperProcessor``).
        model: Object exposing ``generate``; its ``device`` attribute, when
            present, decides where the input features are moved.
        waveform: 1-D sequence of audio samples.
        sr: Sampling rate of ``waveform`` in Hz.
        chunk_sec: Chunk length in seconds; fractional values are allowed.
        **generate_kwargs: Extra keyword arguments forwarded unchanged to
            ``model.generate``. NOTE(review): recent transformers versions
            accept e.g. ``language=``/``task=`` here for Whisper; confirm
            against the installed version.

    Returns:
        The concatenated transcription as a single stripped string.

    Raises:
        ValueError: If ``chunk_sec * sr`` is smaller than one sample.
    """
    # range() needs an integer step; int() keeps fractional chunk lengths usable.
    chunk_size = int(chunk_sec * sr)
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_sec * sr must be at least one sample, got {chunk_sec} * {sr}"
        )

    # Follow the model's own device instead of assuming CUDA is present.
    device = getattr(model, "device", None)
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    texts = []
    last_text = ""

    # Every start index is < len(waveform), so each chunk holds at least one sample.
    for start in range(0, len(waveform), chunk_size):
        chunk = waveform[start:start + chunk_size]

        input_features = processor.feature_extractor(
            chunk,
            sampling_rate=sr,
            return_tensors="pt",
        ).input_features.to(device)

        with torch.no_grad():
            pred_ids = model.generate(input_features, **generate_kwargs)

        # Strip surrounding whitespace so the overlap check below compares text,
        # not padding, and the final join does not produce double spaces.
        text = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)[0].strip()

        # Drop a leading repetition of the tail (up to 15 characters) of the previous chunk.
        if last_text:
            overlap = last_text[-15:]
            if text.startswith(overlap):
                text = text[len(overlap):].strip()

        last_text = text
        # Skip chunks that decoded to nothing so they do not add stray separators.
        if text:
            texts.append(text)

    return " ".join(texts)

# Transcribe the trimmed sample; chunk_sec sets the chunk length in seconds.
full_text = transcribe_long_audio(processor, model, waveform, sr, chunk_sec=3)
print(full_text)

식사량이 줄어들진 않으셨어요?


## 모델 평가

In [1]:
from utils.eval import fast_asr_metrics
import json 
import pandas as pd

# Prediction file to score. Uncomment one of the alternatives to evaluate another run.
# OmniASR CTC inference:
# file_path = "/workspace/results/omniasr_inference/omniasr_ctc/omniASR_CTC_300M/test_pred.parquet"
# Off-the-shelf Whisper inference:
# file_path = "/workspace/results/whisper_inference/whisper_tiny_inference/test_pred.parquet"
# Fine-tuned Whisper checkpoint:
file_path = "/workspace/results/whisper_test/whisper-tiny/checkpoint-10000/test_pred.parquet"

# Write metrics.json next to the prediction file. Only the last path component is
# swapped; str.replace on the basename would also rewrite any directory name that
# happens to contain it.
save_path = file_path[: file_path.rfind("/") + 1] + "metrics.json"
df = pd.read_parquet(file_path)

metrics = fast_asr_metrics(df, gt_col="gt_text", pred_col="pred_text")
print(metrics)

with open(save_path, "w") as f:
    json.dump(metrics, f, indent=4)
# Report success only after the file has been closed.
print(f"Metrics saved to {save_path}")

{'wer_all': 22.878849434110133, 'cer_all': 11.373441863799446, 'wer_mean': 24.92755924608923, 'wer_std': 177.85442264355652, 'wer_median': 11.11111111111111, 'wer_min': 0.0, 'wer_max': 33300.0, 'cer_mean': 13.042271626140392, 'cer_std': 234.5897695138903, 'cer_median': 2.941176470588235, 'cer_min': 0.0, 'cer_max': 44112.5, 'num_samples': 254490}
Metrics saved to /workspace/results/whisper_test/whisper-tiny/checkpoint-10000/metrics.json


### 전체 평가

In [2]:
from utils.eval import fast_asr_metrics
import json 
import pandas as pd
from glob import glob 
test_root_dir = "/workspace/results/whisper_inference"

# Without recursive=True, "**" behaves like "*", so this matches prediction files
# exactly one directory below test_root_dir. Sorting makes the processing and
# printing order reproducible, since glob returns matches in arbitrary order.
test_files = sorted(glob(f"{test_root_dir}/**/test_pred.parquet"))
for file_path in test_files:
    # Write metrics.json next to the prediction file. Only the last path component
    # is swapped; str.replace on the basename would also rewrite any directory name
    # that happens to contain it.
    save_path = file_path[: file_path.rfind("/") + 1] + "metrics.json"
    df = pd.read_parquet(file_path)

    metrics = fast_asr_metrics(df, gt_col="gt_text", pred_col="pred_text")
    # The parent directory name identifies the run being scored.
    print(file_path.split("/")[-2])
    print(metrics)

    with open(save_path, "w") as f:
        json.dump(metrics, f, indent=4)
    # Report success only after the file has been closed.
    print(f"Metrics saved to {save_path}")

whisper_small_inference
{'wer_all': 30.65465481088408, 'cer_all': 11.45777287471267, 'wer_mean': 33.58536805174418, 'wer_std': 79.75147934013405, 'wer_median': 26.666666666666668, 'wer_min': 0.0, 'wer_max': 22200.0, 'cer_mean': 12.950781327274285, 'cer_std': 86.1993945254448, 'cer_median': 7.6923076923076925, 'cer_min': 0.0, 'cer_max': 28839.999999999996, 'num_samples': 253683}
Metrics saved to /workspace/results/whisper_inference/whisper_small_inference/metrics.json
whisper_medium_inference
{'wer_all': 27.274896388566734, 'cer_all': 8.805130142128222, 'wer_mean': 29.415768802792613, 'wer_std': 75.25403204091486, 'wer_median': 25.0, 'wer_min': 0.0, 'wer_max': 22200.0, 'cer_mean': 9.85910903259627, 'cer_std': 63.855026635032644, 'cer_median': 5.88235294117647, 'cer_min': 0.0, 'cer_max': 18483.333333333336, 'num_samples': 253535}
Metrics saved to /workspace/results/whisper_inference/whisper_medium_inference/metrics.json
whisper_tiny_inference
{'wer_all': 59.88418236867633, 'cer_all': 33.