## VAD

In [None]:
import os
import torch
import numpy as np
import librosa
import soundfile as sf
import IPython.display as ipd

# Environment setup
# Expose only GPU index 1 to CUDA for this process.
# NOTE(review): this is set after `import torch`; it presumably only takes effect
# if CUDA has not been initialized yet at this point — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# Cap the number of CPU threads torch uses.
torch.set_num_threads(10)
from utils.VAD import SieroVAD

# Input recording and the sampling rate that is reused by the VAD model,
# audio playback and spectrogram plotting.
audio_path = "/workspace/kor_med_stt_data/snu_data/말이좀늦는것같아요_샘플데이터셋/남학생의사_여환자_1.wav"
sr = 16000

# Initialize VAD Model
vad_model = SieroVAD(
    sampling_rate=sr,
    device="cpu",
    threshold=0.7,
    min_speech_ms=250,
    min_silence_ms=100,
    pad_ms=30,
    max_speech_s=float("inf"),
)

In [None]:
# Run VAD on the recording; the cells below read the 'segment_array' and
# 'timestamp' entries of the returned result.
vad_result = vad_model.run(audio_path=audio_path)

In [None]:
# Print the timestamp of, and play back, the leading VAD segments.
# Only indices 0..11 are shown (at most 12 segments).
for i in range(min(len(vad_result['segment_array']), 12)):
    print(f"Segment {i} : {vad_result['timestamp'][i]}")
    display(ipd.Audio(vad_result['segment_array'][i], rate=vad_model.sampling_rate))

In [None]:
# Pick one arbitrary segment (index 11, i.e. the 12th) to print and play back.
# `segment` and `timestamp` are only assigned when that index exists; otherwise
# a warning is printed instead.
segment_idx = 11
if segment_idx >= len(vad_result['segment_array']):
    print(f"[Warning] segment_array의 길이가 {segment_idx+1}보다 짧습니다.")
else:
    segment = vad_result['segment_array'][segment_idx]
    timestamp = vad_result['timestamp'][segment_idx]
    print(f"[VAD] Showing segment {segment_idx} | Length: {len(segment)} samples | Timestamp: {timestamp}")
    display(ipd.Audio(segment, rate=vad_model.sampling_rate))

In [None]:
import matplotlib.pyplot as plt
import librosa
import librosa.display
from utils.plot import plot_melspectrogram

# Load the Whisper model used for transcription.
from utils.whisper import WhisperInference

whisper_infer = WhisperInference(
    # Alternative model_dir values kept for reference:
    #   model_dir="openai/whisper-small",
    #   model_dir="/workspace/kor_med_stt_data/results/whisper_train/whisper-base/checkpoint-5090605",
    model_dir="/workspace/kor_med_stt_data/results/whisper_train/whisper-small/checkpoint-254532",  # Training Model
    # Use the GPU when torch reports one as available, otherwise the CPU.
    device="cuda" if torch.cuda.is_available() else "cpu",
)

In [None]:
# Transcribe the selected segment, show the text and an audio player,
# then plot its mel spectrogram.
display(
    f"Transcription : {whisper_infer.transcribe(audio_array = segment)}",
    ipd.Audio(segment, rate=sr),
)
plot_melspectrogram(segment, sr)

## NoiseReducer

In [None]:
import noisereduce as nr

# segment은 이미 메모리에 numpy 배열로 로드되어 있으므로 별도의 wav 파일 IO는 필요 없음
# segment는 (길이,) 형태의 float32 및 sr 샘플레이트
def match_target_rms(y, target_rms=0.1):
    """Scale a signal so that its root-mean-square level equals ``target_rms``.

    Args:
        y: Array-like of audio samples (float or integer PCM).
        target_rms: Desired RMS level of the returned signal.

    Returns:
        A new NumPy array containing ``y`` multiplied by a single gain factor.
        Floating-point input keeps its dtype; integer input is returned as
        floating point. Empty input is returned as an unscaled floating-point
        copy because it has no RMS level to match.
    """
    samples = np.asarray(y)
    if samples.size == 0:
        # Nothing to measure: avoid taking the mean of an empty array (NaN + warning).
        return samples * 1.0

    # Measure the level in float64 so that squaring integer PCM (e.g. int16)
    # cannot wrap around and produce a negative mean / NaN gain.
    rms = float(np.sqrt(np.mean(np.square(samples.astype(np.float64)))))

    # The small epsilon keeps the gain finite for an all-zero signal.
    gain = float(target_rms) / (rms + 1e-8)

    # Multiplying by a plain Python float preserves a float32 input dtype.
    return samples * gain
# reduced_noise = nr.reduce_noise(y=match_target_rms(segment, target_rms=0.1), sr=sr, prop_decrease=1.0)

# Denoise the selected segment, then show its transcription and an audio
# player, and plot its mel spectrogram.
reduced_noise = nr.reduce_noise(y=segment, sr=sr, prop_decrease=1.0)
display(
    f"Transcription : {whisper_infer.transcribe(audio_array = reduced_noise)}",
    ipd.Audio(reduced_noise, rate=sr),
)
plot_melspectrogram(reduced_noise, sr)

## Normalize

In [None]:
import numpy as np
import librosa
import IPython.display as ipd

# peak normalize
def normalize_audio(y, peak=0.99):
    """Peak-normalize a signal so its largest absolute sample equals ``peak``.

    Args:
        y: Array-like of audio samples.
        peak: Target absolute peak value after scaling.

    Returns:
        A new NumPy array holding ``y / max(|y|) * peak``. Empty or all-zero
        input has no peak to scale from, so it is returned as an unscaled
        floating-point copy instead of raising or producing NaNs.
    """
    samples = np.asarray(y)
    if samples.size == 0:
        # np.max raises on an empty array; there is nothing to scale.
        return samples * 1.0

    current_peak = float(np.max(np.abs(samples)))
    if current_peak == 0.0:
        # Silent input: dividing by the zero peak would turn every sample into NaN.
        return samples * 1.0

    return samples / current_peak * peak

# Peak-normalize the denoised segment, then show its transcription and an
# audio player, and plot its mel spectrogram.
reduced_norm = normalize_audio(reduced_noise)
display(
    f"Transcription : {whisper_infer.transcribe(audio_array = reduced_norm)}",
    ipd.Audio(reduced_norm, rate=sr),
)
plot_melspectrogram(reduced_norm, sr)

In [None]:
def match_target_rms(y, target_rms=0.1):
    """Scale a signal so that its root-mean-square level equals ``target_rms``.

    Args:
        y: Array-like of audio samples (float or integer PCM).
        target_rms: Desired RMS level of the returned signal.

    Returns:
        A new NumPy array containing ``y`` multiplied by a single gain factor.
        Floating-point input keeps its dtype; integer input is returned as
        floating point. Empty input is returned as an unscaled floating-point
        copy because it has no RMS level to match.
    """
    samples = np.asarray(y)
    if samples.size == 0:
        # Nothing to measure: avoid taking the mean of an empty array (NaN + warning).
        return samples * 1.0

    # Measure the level in float64 so that squaring integer PCM (e.g. int16)
    # cannot wrap around and produce a negative mean / NaN gain.
    rms = float(np.sqrt(np.mean(np.square(samples.astype(np.float64)))))

    # The small epsilon keeps the gain finite for an all-zero signal.
    gain = float(target_rms) / (rms + 1e-8)

    # Multiplying by a plain Python float preserves a float32 input dtype.
    return samples * gain

# RMS-normalize the denoised segment (this reassigns `reduced_norm`), then show
# its transcription and an audio player, and plot its mel spectrogram.
reduced_norm = match_target_rms(reduced_noise, target_rms=0.1)
display(
    f"Transcription : {whisper_infer.transcribe(audio_array = reduced_norm)}",
    ipd.Audio(reduced_norm, rate=sr),
)
plot_melspectrogram(reduced_norm, sr)