In [78]:
import numpy as np
import soundfile as sf
import librosa
from scipy.stats import pearsonr
import parselmouth

def load_wav(path, target_sr=None, mono=True):
    """Load an audio file, optionally down-mixing to mono and resampling.

    Args:
        path: File to read; passed straight to ``soundfile.read``.
        target_sr: Desired sampling rate in Hz. ``None`` (default) keeps the
            file's native rate.
        mono: When True, multi-channel audio is averaged over its channel
            axis (axis 1).

    Returns:
        Tuple ``(y, sr)``: ``y`` is a float32 array with time on axis 0
        (1-D after down-mixing, otherwise as returned by soundfile), and
        ``sr`` is the sampling rate of ``y``.
    """
    y, sr = sf.read(path)
    if y.ndim > 1 and mono:
        y = np.mean(y, axis=1)
    if (target_sr is not None) and (sr != target_sr):
        # Time runs along axis 0 here (the down-mix above averages axis 1),
        # while librosa.resample defaults to the last axis. Without axis=0 a
        # multi-channel array kept with mono=False would be resampled across
        # channels instead of across time. For 1-D input axis 0 is the last
        # axis, so the mono path is unaffected.
        y = librosa.resample(
            y.astype(np.float32), orig_sr=sr, target_sr=target_sr, axis=0
        )
        sr = target_sr
    return y.astype(np.float32), sr

def _extract_f0_parselmouth(y, sr, hop_ms=10, fmin=50, fmax=600):
    """Praat(Parselmouth)로 F0 추출. 무성 구간은 np.nan."""
    snd = parselmouth.Sound(y, sampling_frequency=sr)
    time_step = hop_ms / 1000.0
    pitch = parselmouth.praat.call(
        snd, "To Pitch", time_step, fmin,  # time step, min pitch
        0.03, "cc", 0.25, 0.01, fmax      # 기타 디폴트 파라미터 + max pitch
    )
    n_frames = pitch.get_number_of_frames()
    times = np.array([pitch.get_time_from_frame_number(i+1) for i in range(n_frames)], dtype=np.float32)
    f0 = np.array([pitch.get_value_in_frame(i+1) for i in range(n_frames)], dtype=np.float32)
    # 무성(unvoiced)은 <= 0으로 반환될 수 있으므로 nan으로 치환
    f0[f0 <= 0] = np.nan
    return times, f0


def _extract_f0_pyinn(y, sr, hop_ms=10, fmin=50, fmax=600):
    """librosa.pyin으로 F0 추출. 무성 구간은 np.nan."""
    frame_length = 2048
    hop_length = max(1, int(sr * (hop_ms / 1000.0)))
    f0, _, _ = librosa.pyin(
        y, fmin=fmin, fmax=fmax, sr=sr,
        frame_length=frame_length, hop_length=hop_length
    )
    times = librosa.frames_to_time(np.arange(len(f0)), sr=sr, hop_length=hop_length)
    return times.astype(np.float32), f0.astype(np.float32)

def extract_f0(y, sr, hop_ms=10, fmin=50, fmax=600):
    """Extract F0 with Praat when available, otherwise with ``librosa.pyin``.

    The backend is chosen from the module-level flag ``_HAS_PARSELMOUTH`` if
    one is defined. If the flag is missing (for example on a fresh kernel
    where nothing has set it), availability is probed by trying to import
    ``parselmouth`` rather than failing with a NameError.

    Args:
        y: Audio samples.
        sr: Sampling rate of ``y`` in Hz.
        hop_ms: Analysis hop in milliseconds.
        fmin: Lower F0 bound in Hz.
        fmax: Upper F0 bound in Hz.

    Returns:
        Tuple ``(times, f0)`` as produced by the selected backend.
    """
    use_praat = globals().get("_HAS_PARSELMOUTH")
    if use_praat is None:
        # Flag not defined anywhere in this namespace: detect the package.
        try:
            import parselmouth  # noqa: F401  (availability probe only)
        except ImportError:
            use_praat = False
        else:
            use_praat = True

    backend = _extract_f0_parselmouth if use_praat else _extract_f0_pyinn
    return backend(y, sr, hop_ms=hop_ms, fmin=fmin, fmax=fmax)

def _nearest_frame_values(times_src, vals_src, times_tgt):
    """
    src의 시간축(times_src)에서 tgt 시간축(times_tgt)에 가장 가까운 프레임 값을 뽑는다.
    (보간 대신 최근접 매핑 사용; 원본의 무성은 그대로 nan 유지)
    """
    idxs = np.searchsorted(times_src, times_tgt, side="left")
    idxs = np.clip(idxs, 0, len(times_src)-1)
    # 좌우 중 더 가까운 쪽으로 한 번 더 보정
    left = np.clip(idxs - 1, 0, len(times_src)-1)
    right = idxs
    choose_left = np.abs(times_src[left] - times_tgt) <= np.abs(times_src[right] - times_tgt)
    nearest = np.where(choose_left, left, right)
    return vals_src[nearest]


def _make_common_time_grid(t1, t2, hop_ms=10):
    """두 시퀀스의 공통 시간 격자를 만든다(겹치는 구간만)."""
    start = max(float(np.min(t1)), float(np.min(t2)))
    end = min(float(np.max(t1)), float(np.max(t2)))
    if end <= start:
        return np.array([], dtype=np.float32)
    step = hop_ms / 1000.0
    # 끝점을 포함하도록 약간의 여유
    grid = np.arange(start, end + 1e-6, step, dtype=np.float32)
    return grid


def compute_f0_corr(
    y_ref, sr_ref, y_gen, sr_gen,
    hop_ms=10, fmin=50, fmax=600,
    log_scale=True
):
    """Pearson correlation between the F0 contours of two utterances.

    Steps:
      1. Extract F0 from both signals with ``extract_f0``.
      2. Build a common time grid over the span both contours cover.
      3. Sample each contour on that grid by nearest-frame lookup.
      4. Correlate only the grid points where both contours are voiced
         (finite), optionally on log-F0.

    Args:
        y_ref, sr_ref: Reference audio samples and their sampling rate.
        y_gen, sr_gen: Generated audio samples and their sampling rate.
        hop_ms: Analysis hop and grid spacing in milliseconds.
        fmin, fmax: F0 search range in Hz.
        log_scale: Correlate ``log(F0)`` instead of F0 in Hz.

    Returns:
        dict with keys ``pearson_r``, ``n_frames_used``, ``voiced_rate_ref``,
        ``voiced_rate_gen``, ``hop_ms``, ``fmin``, ``fmax``, ``log_scale``.
        ``pearson_r`` is NaN when fewer than three usable frames exist.
        The voiced rates are measured on the raw (un-gridded) contours and
        are reported as 0.0 when either contour has fewer than two frames.
    """

    def _report(r_value, n_used, rate_ref, rate_gen):
        # Single place that fixes the shape and key order of the result.
        return {
            "pearson_r": r_value,
            "n_frames_used": n_used,
            "voiced_rate_ref": rate_ref,
            "voiced_rate_gen": rate_gen,
            "hop_ms": hop_ms, "fmin": fmin, "fmax": fmax, "log_scale": log_scale
        }

    # 1) F0 extraction
    t_ref, f0_ref = extract_f0(y_ref, sr_ref, hop_ms=hop_ms, fmin=fmin, fmax=fmax)
    t_gen, f0_gen = extract_f0(y_gen, sr_gen, hop_ms=hop_ms, fmin=fmin, fmax=fmax)

    if min(len(t_ref), len(t_gen)) < 2:
        return _report(np.nan, 0, 0.0, 0.0)

    # 2) Common time grid
    grid = _make_common_time_grid(t_ref, t_gen, hop_ms=hop_ms)

    # Fraction of voiced frames in each original contour.
    rate_ref = float(np.mean(np.isfinite(f0_ref)))
    rate_gen = float(np.mean(np.isfinite(f0_gen)))

    if len(grid) < 3:
        return _report(np.nan, 0, rate_ref, rate_gen)

    # 3) Nearest-frame sampling of both contours on the grid
    ref_on_grid = _nearest_frame_values(t_ref, f0_ref, grid)
    gen_on_grid = _nearest_frame_values(t_gen, f0_gen, grid)

    # 4) Keep grid points voiced in both contours
    both_voiced = np.isfinite(ref_on_grid) & np.isfinite(gen_on_grid)
    n_voiced = int(np.count_nonzero(both_voiced))
    if n_voiced < 3:
        return _report(np.nan, n_voiced, rate_ref, rate_gen)

    ref_vals = ref_on_grid[both_voiced]
    gen_vals = gen_on_grid[both_voiced]

    if log_scale:
        # log-F0 tracks perceived pitch more closely than Hz
        ref_vals, gen_vals = np.log(ref_vals), np.log(gen_vals)

    r_value, _p_value = pearsonr(ref_vals, gen_vals)

    return _report(float(r_value), int(len(ref_vals)), rate_ref, rate_gen)


In [None]:
def main():
    """Demo that can be run directly from a Jupyter cell.

    Loads a reference WAV (``REF_PATH``, the ground truth) and a generated
    WAV (``GEN_PATH``, the model output), computes their F0 correlation and
    prints a short report.
    """
    # >>> Edit only these two paths to match your environment <<<
    REF_PATH = "data/LJ001-0004.wav"
    GEN_PATH = "eval_output/LJ001-0004-gen.wav"

    ref_audio, ref_sr = load_wav(REF_PATH)
    gen_audio, gen_sr = load_wav(GEN_PATH)

    # Compute the F0 correlation metrics
    result = compute_f0_corr(
        ref_audio, ref_sr, gen_audio, gen_sr,
        hop_ms=10, fmin=50, fmax=600, log_scale=True
    )

    # Human-readable report
    print("=== F0 Correlation Result ===")
    if np.isfinite(result['pearson_r']):
        print(f"Pearson r (log-F0): {result['pearson_r']:.4f}")
    else:
        print("Pearson r: NaN")
    print(f"Frames used (voiced&voiced): {result['n_frames_used']}")
    print(f"Voiced rate (ref): {result['voiced_rate_ref']*100:.1f}%")
    print(f"Voiced rate (gen): {result['voiced_rate_gen']*100:.1f}%")
    print(f"Params: hop={result['hop_ms']}ms, f0_range=[{result['fmin']},{result['fmax']}], log_scale={result['log_scale']}")


# So that the demo runs immediately when this cell is executed in Jupyter
if __name__ == "__main__":
    main()

In [2]:
import torch
from transformers import CsmForConditionalGeneration, AutoProcessor

# Checkpoint identifier passed to both from_pretrained() calls below.
model_id = "sesame/csm-1b"
# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the processor and the model for the same checkpoint; the model is
# placed on `device` through device_map.
processor = AutoProcessor.from_pretrained(model_id)
model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)

# Input preparation, variant 1: raw text passed straight to the processor.
# NOTE(review): the "[0]" prefix looks like a speaker-id tag — confirm against
# the CSM processor documentation.
# NOTE(review): this `inputs` is reassigned by variant 2 below before
# generate() is called, so it never reaches the model.
text = "[0]Hello from Sesame." 
inputs = processor(text, add_special_tokens=True).to(device)

# Input preparation, variant 2: a chat-style conversation rendered through the
# processor's chat template. This is the `inputs` actually used for generation.
conversation = [
    {"role": "0", "content": [{"type": "text", "text": "hello sesame"}]},
]
inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
).to(device)

# Run generation with audio output enabled and write the result to a WAV file
# in the current working directory.
audio = model.generate(**inputs, output_audio=True)
processor.save_audio(audio, "example_without_context.wav")

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.33it/s]


In [4]:
# Shell escape: stage the path `sesame-csm1b` with git.
# NOTE(review): the recorded output of this cell is
# "fatal: pathspec 'sesame-csm1b' did not match any files", i.e. the path did
# not exist relative to the working directory when this was run — verify the
# path or remove this cell.
!git add sesame-csm1b

fatal: pathspec 'sesame-csm1b' did not match any files


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
